datupapi 1.114.0__py3-none-any.whl → 1.115.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2015 @@
1
+ import base64
2
+ from typing import Dict, List
3
+ import boto3
4
+ import json
5
+ import os
6
+ import pandas as pd
7
+ import requests
8
+ import time
9
+ from boto3.dynamodb.conditions import Key, Attr
10
+ from boto3.s3.transfer import TransferConfig
11
+ from boto3.session import Session
12
+ from botocore.exceptions import ClientError
13
+ from botocore.config import Config as BotocoreConfig
14
+ from decimal import Decimal
15
+ from datupapi.configure.config import Config
16
+ from google.cloud import bigquery
17
+ from google.oauth2 import service_account
18
+ import google
19
+ from hashlib import md5
20
+ from hdbcli import dbapi
21
+ from sqlalchemy import create_engine, Table, Column, MetaData
22
+ from sqlalchemy import Integer, Float, String, DECIMAL
23
+ from sqlalchemy import insert, delete, exists, schema, text
+ from snowflake.sqlalchemy import URL  # assumed missing import for URL used in populate_snowflake_table
24
+ from concurrent.futures import ThreadPoolExecutor # to load to GCP in parallel
25
+ from threading import current_thread
26
+
27
+
28
+ class IO_Optimized(Config):
29
+
30
+ def __init__(self, config_file, logfile, log_path, *args, **kwargs):
31
+ Config.__init__(self, config_file=config_file, logfile=logfile)
32
+ self.log_path = log_path
33
+
34
+ def _get_s3_file_size(self, s3_client, bucket, key):
35
+ """
36
+ Get file size from S3 without downloading using head_object.
37
+
38
+ :param s3_client: Boto3 S3 client
39
+ :param bucket: S3 bucket name
40
+ :param key: S3 object key
41
+ :return: File size in bytes, or None if error
42
+ """
43
+ try:
44
+ response = s3_client.head_object(Bucket=bucket, Key=key)
45
+ return response['ContentLength']
46
+ except ClientError:
47
+ return None
48
+
49
+ def _create_transfer_config(self, file_size_bytes, s3_client=None):
50
+ """
51
+ Create optimized TransferConfig based on file size for multipart transfers.
52
+ Also configures urllib3 connection pool to match concurrency settings.
53
+
54
+ Strategy:
55
+ - < 50 MB: No multipart (single-part is faster)
56
+ - 50-500 MB: Moderate multipart (16 MB chunks, 16 threads)
57
+ - 500MB-2GB: Aggressive multipart (32 MB chunks, 32 threads)
58
+ - > 2 GB: Ultra-aggressive multipart (64 MB chunks, 64 threads)
59
+
60
+ :param file_size_bytes: Size of file in bytes
61
+ :param s3_client: Boto3 S3 client (optional, for pool configuration)
62
+ :return: TransferConfig object or None for small files
63
+ """
64
+ MB = 1024 * 1024
65
+ GB = 1024 * MB
66
+
67
+ if file_size_bytes < 50 * MB:
68
+ # Small files: single-part is faster
69
+ return None
70
+ elif file_size_bytes < 500 * MB:
71
+ # Medium files (50MB - 500MB): moderate multipart
72
+ max_concurrency = 16
73
+ elif file_size_bytes < 2 * GB:
74
+ # Large files (500MB - 2GB): aggressive multipart
75
+ max_concurrency = 32
76
+ else:
77
+ # Very large files (2GB+): ultra-aggressive multipart
78
+ max_concurrency = 64
79
+
80
+ # Configure urllib3 connection pool to match max_concurrency
81
+ # This prevents "Connection pool is full" warnings
82
+ if s3_client is not None:
83
+ try:
84
+ # Increase connection pool size for the HTTP adapter
85
+ adapter = s3_client._client_config
86
+ if hasattr(s3_client.meta.client, '_endpoint'):
87
+ http_session = s3_client.meta.client._endpoint.http_session
88
+ if http_session:
89
+ # Configure pool for both http and https
90
+ http_session.adapters['https://'].poolmanager.connection_pool_kw['maxsize'] = max_concurrency + 10
91
+ http_session.adapters['http://'].poolmanager.connection_pool_kw['maxsize'] = max_concurrency + 10
92
+ except (AttributeError, KeyError):
93
+ # If we can't configure the pool, continue anyway
94
+ pass
95
+
96
+ # Determine chunk size based on file size
97
+ if file_size_bytes < 500 * MB:
98
+ chunksize = 16 * MB
99
+ elif file_size_bytes < 2 * GB:
100
+ chunksize = 32 * MB
101
+ else:
102
+ chunksize = 64 * MB
103
+
104
+ return TransferConfig(
105
+ multipart_threshold=8 * MB,
106
+ multipart_chunksize=chunksize,
107
+ max_concurrency=max_concurrency,
108
+ use_threads=True
109
+ )
110
+
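For illustration, a minimal sketch (assuming an already constructed IO_Optimized instance `io`, a boto3 S3 client, and hypothetical bucket/key/paths) of how the size-tiered TransferConfig returned above is passed to a download:

    import boto3
    s3 = boto3.client('s3')
    # hypothetical object; sizes under 50 MB yield cfg = None and a single-part download
    size = s3.head_object(Bucket='my-bucket', Key='data/big.csv')['ContentLength']
    cfg = io._create_transfer_config(size, s3)
    if cfg is not None:
        s3.download_file('my-bucket', 'data/big.csv', '/tmp/big.csv', Config=cfg)
    else:
        s3.download_file('my-bucket', 'data/big.csv', '/tmp/big.csv')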
111
+ def _get_optimized_read_params(self, file_size_bytes, date_cols=None):
112
+ """
113
+ Calculate optimized pandas read_csv parameters based on file size.
114
+
115
+ SIMPLIFIED OPTIMIZATION: Focus on what actually improves performance.
116
+
117
+ Strategy:
118
+ - Keep it simple - pandas defaults are well-optimized
119
+ - Only add parameters that provide measurable benefit
120
+ - Avoid deprecated or counterproductive options
121
+
122
+ :param file_size_bytes: Size of file in bytes
123
+ :param date_cols: List of date columns (if known)
124
+ :return: dict with optimized parameters
125
+ """
126
+ MB = 1024 * 1024
127
+ GB = 1024 * MB
128
+
129
+ # Start with minimal, proven parameters
130
+ params = {
131
+ 'engine': 'c', # C engine is faster than python
132
+ }
133
+
134
+ # For files > 100MB, use low_memory mode to process in chunks
135
+ if file_size_bytes > 100 * MB:
136
+ params['low_memory'] = True
137
+
138
+ # Only add date optimization if date columns are specified
139
+ # ISO8601 format is significantly faster for date parsing
140
+ if date_cols and len(date_cols) > 0:
141
+ params['cache_dates'] = True
142
+
143
+ return params
144
+
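A minimal sketch of how the parameters returned above are merged into a pandas read (the local path and column name are hypothetical, `io` is an assumed IO_Optimized instance):

    import os
    import pandas as pd
    path = '/tmp/big.csv'
    params = io._get_optimized_read_params(os.path.getsize(path), date_cols=['Date'])
    # e.g. {'engine': 'c', 'low_memory': True, 'cache_dates': True} for a file over 100 MB
    df = pd.read_csv(path, parse_dates=['Date'], **params)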
145
+ def get_secret(self, secret_name=None):
146
+ """
147
+ Return the credentials mapped to the entered secret name
148
+
149
+ :param secret_name: Name identifying the credentials in AWS.
150
+ :return response: Credential to authenticate against AWS resource
151
+
152
+ >>> creds = get_secret()
153
+ >>> creds = ...
154
+ """
155
+ session = boto3.session.Session()
156
+ client = session.client(service_name='secretsmanager',
157
+ region_name=self.region,
158
+ aws_access_key_id=self.access_key,
159
+ aws_secret_access_key=self.secret_key)
160
+ try:
161
+ if secret_name is not None:
162
+ get_secret_value_response = client.get_secret_value(SecretId=secret_name)
163
+ else:
164
+ get_secret_value_response = client.get_secret_value(SecretId=self.sql_database + 'secret')
165
+ except ClientError as e:
166
+ if e.response['Error']['Code'] == 'DecryptionFailureException':
167
+ # Secrets Manager can't decrypt the protected secret text using the provided KMS key.
168
+ # Deal with the exception here, and/or rethrow at your discretion.
169
+ raise e
170
+ elif e.response['Error']['Code'] == 'InternalServiceErrorException':
171
+ # An error occurred on the server side.
172
+ # Deal with the exception here, and/or rethrow at your discretion.
173
+ raise e
174
+ elif e.response['Error']['Code'] == 'InvalidParameterException':
175
+ # You provided an invalid value for a parameter.
176
+ # Deal with the exception here, and/or rethrow at your discretion.
177
+ raise e
178
+ elif e.response['Error']['Code'] == 'InvalidRequestException':
179
+ # You provided a parameter value that is not valid for the current state of the resource.
180
+ # Deal with the exception here, and/or rethrow at your discretion.
181
+ raise e
182
+ elif e.response['Error']['Code'] == 'ResourceNotFoundException':
183
+ # We can't find the resource that you asked for.
184
+ # Deal with the exception here, and/or rethrow at your discretion.
185
+ raise e
186
+ else:
187
+ # Decrypts secret using the associated KMS CMK.
188
+ # Depending on whether the secret is a string or binary, one of these fields will be populated.
189
+ if 'SecretString' in get_secret_value_response:
190
+ secret = get_secret_value_response['SecretString']
191
+ else:
192
+ secret = base64.b64decode(get_secret_value_response['SecretBinary'])
193
+ return json.loads(secret)
194
+
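A minimal usage sketch, assuming the secret stores database credentials as JSON with hypothetical keys 'username' and 'password' (`io` is an assumed IO_Optimized instance):

    creds = io.get_secret(secret_name='mydb-secret')
    url = f"mysql+mysqlconnector://{creds['username']}:{creds['password']}@myhost:3306/mydb"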
195
+ def populate_snowflake_table(self,
196
+ df,
197
+ dwh_account=None,
198
+ dwh_name=None,
199
+ dwh_user=None,
200
+ dwh_passwd=None,
201
+ dwh_dbname=None,
202
+ dwh_schema=None,
203
+ table_name=None,
204
+ replace=True):
205
+ """
206
+ Create a table in Snowflake DWH and insert the records from a dataframe
207
+
208
+ :param df: Dataframe storing the records to insert into the database table
209
+ :param dwh_account: Snowflake account identifier
210
+ :param dwh_name: Snowflake datawarehouse name
211
+ :param dwh_user: Snowflake account username
212
+ :param dwh_passwd: Snowflake account password
213
+ :param dwh_dbname: Snowflake database name
214
+ :param dwh_schema: Snowflake database schema
215
+ :param table_name: Snowflake table name
216
+ :param replace: If True, replace the table records if the table exists. Otherwise append records. Default True.
217
+ :return inserted_records: Number of records inserted
218
+
219
+ >>> records = populate_snowflake_table(df, dwh_account='xx12345.us-east-1', dwh_name='myDwh', dwh_user='myuser', dwh_passwd='12345', dwh_dbname='mydbname', dwh_schema='myschema', table_name='mytable')
220
+ >>> records = 1000
221
+ """
222
+ if self.tenant_id != '':
223
+ tenant_table_name = (self.tenant_id + table_name)
224
+ else:
225
+ tenant_table_name = table_name
226
+
227
+ url = URL(account=dwh_account, user=dwh_user, password=dwh_passwd, warehouse=dwh_name, database=dwh_dbname, schema=dwh_schema)
228
+ try:
229
+ engine = create_engine(url)
230
+ conn = engine.connect()
231
+ if replace:
232
+ df.to_sql(tenant_table_name, con=engine, if_exists='replace', index=False, chunksize=16000)
233
+ else:
234
+ df.to_sql(tenant_table_name, con=engine, if_exists='append', index=False, chunksize=16000)
235
+ inserted_records = conn.execute('select count(*) from ' + '"' + tenant_table_name + '"').fetchone()[0]
236
+ finally:
237
+ conn.close()
238
+ engine.dispose()
239
+ return inserted_records
240
+
241
+ def populate_bigquery_table(self, df, project_id=None, tenant_id=None, table_name=None, write_mode='overwrite', gcp_key='datup-supplyai-dev-gcp.json'):
242
+ """
243
+ Create a table in BigQuery DWH and insert the records from a dataframe
244
+ :param df: Dataframe storing the records to insert into the database table
245
+ :param project_id: Project identifier in GCP
246
+ :param tenant_id: Tenant or customer identifier
247
+ :param table_name: BigQuery table name
248
+ :param write_mode: BigQuery table update method. Either overwrite or append
249
+ :param gcp_key: BigQuery credential key
250
+ :return: Number of records inserted
251
+
252
+ >>> records = populate_bigquery_table(df, project_id='myproject', tenant_id='acme', table_name='mytable')
253
+ >>> records = 1000
254
+ """
255
+ key = service_account.Credentials.from_service_account_file(os.path.join('/opt/ml/processing/input', gcp_key))
256
+ client = bigquery.Client(credentials=key)
257
+
258
+ try:
259
+ if write_mode == 'overwrite':
260
+ write_mode_ = 'WRITE_TRUNCATE'
261
+ elif write_mode == 'append':
262
+ write_mode_ = 'WRITE_APPEND'
263
+ else:
264
+ self.logger.exception(f'No valid BigQuery write mode. Please check valid types: overwrite or append')
265
+ table_id = project_id + '.' + tenant_id + '.' + table_name
266
+ job_config = bigquery.LoadJobConfig(autodetect=False,
267
+ source_format=bigquery.SourceFormat.CSV,
268
+ allow_quoted_newlines=True,
269
+ write_disposition=write_mode_)
270
+ #client.delete_table(table_id, not_found_ok=True)
271
+ load_job = client.load_table_from_dataframe(dataframe=df, destination=table_id, job_config=job_config)
272
+ load_job.result()
273
+ destination_table = client.get_table(table_id)
274
+ except google.api_core.exceptions.NotFound as err:
275
+ raise
276
+ return destination_table.num_rows
277
+
278
+ def populate_bigquery_table_with_schema(self, df, project_id=None, tenant_id=None, table_name=None, write_mode='overwrite', gcp_key='datup-supplyai-dev-gcp.json'):
279
+ """
280
+ Create a table in BigQuery DWH and insert the records from a dataframe
281
+ :param df: Dataframe storing the records to insert into the database table
282
+ :param project_id: Project identifier in GCP
283
+ :param tenant_id: Tenant or customer identifier
284
+ :param table_name: BigQuery table name
285
+ :return: Number of records inserted
286
+
287
+ >>> records = populate_bigquery_table_with_schema(df, project_id='myproject', tenant_id='acme', table_name='mytable')
288
+ >>> records = 1000
289
+ """
290
+ key = service_account.Credentials.from_service_account_file(os.path.join('/opt/ml/processing/input', gcp_key))
291
+ client = bigquery.Client(credentials=key)
292
+
293
+ try:
294
+ if write_mode == 'overwrite':
295
+ write_mode_ = 'WRITE_TRUNCATE'
296
+ elif write_mode == 'append':
297
+ write_mode_ = 'WRITE_APPEND'
298
+ else:
299
+ self.logger.exception(f'No valid BigQuery write mode. Please check valid types: overwrite or append')
300
+
301
+ # Build schema dynamically
302
+ df_schema = []
303
+ date_cols = list(df.select_dtypes(include=['datetime64']).columns)
304
+ string_cols = list(df.select_dtypes(include=['object']).columns)
305
+ integer_cols = list(df.select_dtypes(include=['int64']).columns)
306
+ float_cols = list(df.select_dtypes(include=['float64']).columns)
307
+ [df_schema.append(bigquery.SchemaField(col, bigquery.enums.SqlTypeNames.DATE)) for col in date_cols]
308
+ [df_schema.append(bigquery.SchemaField(col, bigquery.enums.SqlTypeNames.INT64)) for col in integer_cols]
309
+ [df_schema.append(bigquery.SchemaField(col, bigquery.enums.SqlTypeNames.FLOAT64)) for col in float_cols]
310
+ [df_schema.append(bigquery.SchemaField(col, bigquery.enums.SqlTypeNames.STRING)) for col in string_cols]
311
+ #Load pandas dataframe into BigQuery table
312
+ table_id = project_id + '.' + tenant_id + '.' + table_name
313
+ job_config = bigquery.LoadJobConfig(autodetect=False,
314
+ schema=df_schema,
315
+ write_disposition=write_mode_)
316
+ load_job = client.load_table_from_dataframe(dataframe=df, destination=table_id, job_config=job_config)
317
+ load_job.result()
318
+ destination_table = client.get_table(table_id)
319
+ except google.api_core.exceptions.NotFound as err:
320
+ raise
321
+ return destination_table.num_rows
322
+
323
+ def get_secret_for_bigquery(self,secretname=None):
324
+ """
325
+ Get the secret from AWS and return a JSON dict with the keys needed
326
+ to run a query in Google BigQuery
327
+ :param secretname: Name of the AWS secret
328
+ """
329
+ json = {
330
+ "type":self.get_secret(secret_name=secretname)['type'],
331
+ "project_id":self.get_secret(secret_name=secretname)['project_id'],
332
+ "private_key_id":self.get_secret(secret_name=secretname)['private_key_id'],
333
+ "private_key":self.get_secret(secret_name=secretname)['private_key'],
334
+ "client_email":self.get_secret(secret_name=secretname)['client_email'],
335
+ "client_id":self.get_secret(secret_name=secretname)['client_id'],
336
+ "auth_uri":self.get_secret(secret_name=secretname)['auth_uri'],
337
+ "token_uri":self.get_secret(secret_name=secretname)['token_uri'],
338
+ "auth_provider_x509_cert_url":self.get_secret(secret_name=secretname)['auth_provider_x509_cert_url'],
339
+ "client_x509_cert_url":self.get_secret(secret_name=secretname)['client_x509_cert_url'],
340
+ "universe_domain":self.get_secret(secret_name=secretname)['universe_domain']
341
+ }
342
+
343
+ json['private_key']=json['private_key'].replace("\\n", "\n")
344
+
345
+ return json
346
+
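Each key above triggers its own get_secret call (one Secrets Manager request per key); a minimal alternative sketch with the same key list fetches the secret once and reuses it (`io` is an assumed IO_Optimized instance, the secret name is hypothetical):

    secret = io.get_secret(secret_name='my-gcp-secret')  # single Secrets Manager call
    keys = ('type', 'project_id', 'private_key_id', 'private_key', 'client_email', 'client_id',
            'auth_uri', 'token_uri', 'auth_provider_x509_cert_url', 'client_x509_cert_url',
            'universe_domain')
    info = {k: secret[k] for k in keys}
    info['private_key'] = info['private_key'].replace("\\n", "\n")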
347
+ def describe_bigquery_table(self, aws_secret=None, gcp_key=None) -> Dict[str, List[str]]:
348
+ """
349
+ Describe a BigQuery project, listing its datasets and the tables they contain.
350
+ :return: Project, datasets and tables.
351
+
352
+ >>> description = describe_bigquery_table()
353
+ >>> description = ...
354
+ """
355
+ try:
356
+ if aws_secret==None:
357
+ print("Using json file")
358
+ key = service_account.Credentials.from_service_account_file(os.path.join('/opt/ml/processing/input', gcp_key))
359
+ else:
360
+ print("Using aws secret")
361
+ key = service_account.Credentials.from_service_account_info(self.get_secret_for_bigquery(aws_secret))
362
+
363
+ client = bigquery.Client(credentials=key)
364
+ print('Project: ', client.project)
365
+ datasets = client.list_datasets()
366
+ table_client: List[str] = []
367
+ component_client: Dict[str, List[str]] = {}
368
+ for dataset in datasets:
369
+ print('\nDataset: ', dataset.reference, '\nTables:')
370
+ tables = client.list_tables(dataset.reference)
371
+ for table in tables:
372
+ print(' ',table.table_id)
373
+ table_client.append(table.table_id)
374
+ component_client[dataset.reference] = table_client
375
+
376
+ client.close()
377
+ except Exception as e:
378
+ print(f"Falla la consulta en base de datos big query: {e}")
379
+ raise
380
+ else:
381
+ return component_client
382
+
383
+ def download_bigquery_table(self,
384
+ project_id=None,
385
+ tenant_id=None,
386
+ table_name=None,
387
+ aws_secret=None,
388
+ gcp_key=None,
389
+ sqlQuery=None):
390
+ """
391
+ Download the result of a SQL query against a BigQuery dataset.
392
+
393
+ :param project_id: Project identifier in GCP
394
+ :param tenant_id: Tenant or customer identifier
395
+ :param table_name: BigQuery table name
396
+ :param aws_secret: Name of the AWS secret
397
+ :param sqlQuery: SQL query to run.
398
+ :return: Dataframe from query.
399
+
400
+ >>> df = download_bigquery_table(project_id='myproject', tenant_id='acme', table_name='mytable', aws_secret='mysecret', sqlQuery='SELECT * FROM mytable')
401
+ >>> df = ...
402
+ """
403
+
404
+ try:
405
+ if aws_secret==None: #If gcp key is read from json file
406
+ print("Using json file")
407
+ key = service_account.Credentials.from_service_account_file(os.path.join('/opt/ml/processing/input', gcp_key))
408
+ else: #If aws_secret has a value
409
+ print("Using aws secret")
410
+ key = service_account.Credentials.from_service_account_info(self.get_secret_for_bigquery(aws_secret))
411
+
412
+ except TypeError:
413
+ print("Please, use a valid aws_secret or json file")
414
+
415
+ client = bigquery.Client(credentials=key)
416
+ try:
417
+ sql = sqlQuery
418
+ df = client.query(sql).to_dataframe()
419
+ print(f"¡Historical forecast download success from date!")
420
+ except Exception as e:
421
+ raise f"Falla la consulta en base de datos big query: {e}"
422
+ else:
423
+ return df
424
+
425
+ def populate_dbtable(self,
426
+ df,
427
+ hostname=None,
428
+ db_user=None,
429
+ db_passwd=None,
430
+ db_name=None,
431
+ port='3306',
432
+ table_name=None,
433
+ db_type='mysql',
434
+ replace=True):
435
+ """
436
+ Create a table in a MySQL database and insert the records from a dataframe
437
+
438
+ :param df: Dataframe storing the records to insert into the database table
439
+ :param hostname: Public IP address or hostname of the remote database server
440
+ :param db_user: Username of the database
441
+ :param db_passwd: Password of the database
442
+ :param db_name: Name of the target database
443
+ :param port: TCP port number of the database (usually 3306)
444
+ :param table_name: Name of target table
445
+ :param db_type: Name of database type. Choose from mysql, mysql_legacy, mssql. Default mysql.
446
+ :param replace: If True, replace the table records if the table exists. Otherwise append records. Default True.
447
+ :return inserted_records: Number of records inserted
448
+
449
+ >>> records = populate_dbtable(df, hostname='202.10.0.1', db_user='johndoe', db_passwd='123456', db_name='dbo.TheDataBase')
450
+ >>> records = 1000
451
+ """
452
+ if db_type == 'mysql':
453
+ db_api = 'mysql+mysqlconnector://'
454
+ elif db_type == 'mysql_legacy':
455
+ db_api = 'mysql+pymysql://'
456
+ elif db_type == 'mssql':
457
+ db_api = 'mssql+pymssql://'
458
+ else:
459
+ self.logger.exception('No valid database type. Please check valid types: mysql, mysql_legacy, mssql')
460
+
461
+ try:
462
+ engine = create_engine(db_api + db_user + ':' + db_passwd + '@' + hostname + ':' + str(port) + '/' + db_name)
463
+ if replace:
464
+ df.to_sql(table_name, con=engine, if_exists='replace', index=False)
465
+ else:
466
+ df.to_sql(table_name, con=engine, if_exists='append', index=False)
467
+ #inserted_records = engine.execute('SELECT COUNT(*) FROM ' + table_name).fetchall()[0][0]
468
+ inserted_records = df.shape[0]
469
+ except ConnectionRefusedError as err:
470
+ self.logger.exception(f'Refused connection to the database. Please check parameters: {err}')
471
+ raise
472
+ return inserted_records
473
+
474
+ def populate_dbtable_threads(self,
475
+ df,
476
+ hostname=None,
477
+ db_user=None,
478
+ db_passwd=None,
479
+ db_name=None,
480
+ port='3306',
481
+ table_name=None,
482
+ db_type='mysql',
483
+ replace = True,
484
+ chunk_size=500,
485
+ batch_size=10000,
486
+ threads=2):
487
+ """
488
+ Create a table in a MySQL database and insert the records from a dataframe
489
+
490
+ :param df: Dataframe storing the records to insert into the database table
491
+ :param hostname: Public IP address or hostname of the remote database server
492
+ :param db_user: Username of the database
493
+ :param db_passwd: Password of the database
494
+ :param db_name: Name of the target database
495
+ :param port: TCP port number of the database (usually 3306)
496
+ :param table_name: Name of target table
497
+ :param db_type: Name of database type. Choose from mysql, mysql_legacy, mssql. Default mysql.
498
+ :param replace: If True, replace the table records if the table exists. Otherwise append records. Default True.
499
+ :param chunk_size: Number of rows written per chunk in each to_sql call. Default 500.
500
+ :param batch_size: Number of rows per batch. Default 10000.
501
+ :param threads: Number of threads to use for parallel execution. Default 2.
502
+ :return inserted_records: Number of records inserted
503
+
504
+ >>> records = populate_dbtable_threads(df, hostname='202.10.0.1', db_user='johndoe', db_passwd='123456', db_name='dbo.TheDataBase', table_name='mytable')
505
+ >>> records = 1000
506
+ """
507
+ try:
508
+ if db_type == 'mysql':
509
+ db_api = 'mysql+mysqlconnector://'
510
+ elif db_type == 'mysql_legacy':
511
+ db_api = 'mysql+pymysql://'
512
+ elif db_type == 'mssql':
513
+ db_api = 'mssql+pymssql://'
514
+ else:
515
+ raise ValueError(f"No valid database type. Please check valid types: mysql, mssql")
516
+
517
+ # Initial validation
518
+ if df.empty:
519
+ print(f"[WARNING] El DataFrame para la tabla {table_name} está vacío. Saltando...")
520
+ return 0
521
+ if not table_name:
522
+ print("[ERROR] No se especificó un nombre de tabla. Saltando...")
523
+ return 0
524
+
525
+ total_inserted_records = 0
526
+
527
+ # Create the batches and the engine
528
+ total_rows = len(df)
529
+ batches = [df.iloc[start:start + batch_size] for start in range(0, total_rows, batch_size)]
530
+ engine = create_engine(f"{db_api}{db_user}:{db_passwd}@{hostname}:{port}/{db_name}")
531
+
532
+ # If replace, delete the previous records up front and leave the new table structure in place
533
+ # Done outside the threads so it does not cause problems with parallel access to the table
534
+ action = 'replace' if replace else 'append'
535
+
536
+ if replace:
537
+ print(f"[INFO] Reemplazando la tabla {table_name} antes de iniciar la carga.")
538
+ df.iloc[0:0].to_sql(table_name, con=engine, if_exists='replace', index=False)
539
+ action = 'append' # Prevent the threads from replacing the table again
540
+
541
+ # Inner function to process each batch.
542
+ # If replace is True, the table is replaced up front and every other batch must be appended to the first
543
+ def process_batch(batch, start_idx):
544
+ try:
545
+ # Get the row range
546
+ start_line = start_idx
547
+ end_line = start_idx + len(batch) - 1
548
+ thread_name = current_thread().name
549
+ print(f"[INFO] [{thread_name}] Procesando batch de líneas {start_line}-{end_line} en la tabla {table_name} con acción {action}.")
550
+ batch.to_sql(table_name, con=engine, if_exists='append', index=False, chunksize=chunk_size)
551
+ return len(batch)
552
+ except Exception as err:
553
+ print(f"[ERROR] Error al cargar batch: {err}")
554
+ raise
555
+
556
+ # Run the load in parallel
557
+ print(f"[INFO] Iniciando carga para la tabla {table_name}. Total de filas: {total_rows}")
558
+ with ThreadPoolExecutor(max_workers=threads) as executor:
559
+ results = executor.map(
560
+ lambda idx_batch: process_batch(batches[idx_batch], start_idx=(idx_batch * batch_size)),
561
+ range(len(batches)))
562
+ total_inserted_records = sum(results)
563
+
564
+ print(f"[INFO] Carga completa para la tabla {table_name}. Total de registros insertados: {total_inserted_records}")
565
+ return total_inserted_records
566
+
567
+ except Exception as e:
568
+ print(f"[ERROR] Error durante la carga de la tabla {table_name}: {e}")
569
+ return 0
570
+
571
+
572
+
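A minimal call sketch for the threaded loader above (connection values are hypothetical), splitting the dataframe into 10,000-row batches written by two threads:

    records = io.populate_dbtable_threads(df,
                                          hostname='202.10.0.1', db_user='johndoe',
                                          db_passwd='123456', db_name='TheDataBase',
                                          table_name='mytable', db_type='mysql',
                                          replace=True, batch_size=10000, threads=2)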
573
+ def download_dbtable(self, hostname=None, db_user=None, db_passwd=None, db_name=None, port='3306', table_name=None, schema=None, db_type='mysql', query=None):
574
+ """Return a dataframe containing the data extracted from MSSQL database's table supporting PyODBC connector
575
+
576
+ :param hostname: Public IP address or hostname of the remote database server
577
+ :param db_user: Username of the database
578
+ :param db_passwd: Password of the database
579
+ :param db_name: Name of the target database
580
+ :param port: TCP port number of the database. Default 3306
581
+ :param table_name: Name of target table
582
+ :param schema: Name of database schema
583
+ :param db_type: Name of database type. Choose from mysql, mysql_legacy, mssql, postgres, sap_hana. Default mysql.
584
+ :param query: SQL statement to use as query.
585
+ :return df: Dataframe containing the data from database's table
586
+
587
+ >>> df = download_dbtable(hostname='202.10.0.1', db_user='johndoe', db_passwd='123456', db_name='dbo.TheDataBase', query='SELECT * FROM table')
588
+ >>> df
589
+ var1 var2 var3
590
+ idx0 1 2 3
591
+ """
592
+ if db_type == 'mysql':
593
+ db_api = 'mysql+mysqlconnector://'
594
+ elif db_type == 'mysql_legacy':
595
+ db_api = 'mysql+pymysql://'
596
+ elif db_type == 'mssql':
597
+ db_api = 'mssql+pymssql://'
598
+ elif db_type == 'postgres':
599
+ db_api = 'postgresql://'
600
+ elif db_type == 'sap_hana':
601
+ db_api = 'hana+hdbcli://'
602
+ else:
603
+ self.logger.exception('No valid database type. Please check valid types: mysql, mysql_legacy, mssql, postgres, sap_hana')
604
+
605
+ try:
606
+ if db_type == 'sap_hana':
607
+ engine = create_engine(db_api + db_user + ':' + db_passwd + '@' + hostname + ':' + str(port) + '?currentSchema=' + schema)
608
+ else:
609
+ engine = create_engine(db_api + db_user + ':' + db_passwd + '@' + hostname + ':' + str(port) + '/' + db_name)
610
+ connection = engine.connect()
611
+ stmt = text(query)
612
+ df = pd.read_sql_query(stmt, connection)
613
+ except ConnectionRefusedError as err:
614
+ self.logger.exception(f'Refused connection to the database. Please check parameters: {err}')
615
+ raise
616
+ return df
617
+
618
+ def download_rdstable(self, rds_arn=None, secret_arn=None, database_name=None, sql_query=None, query_params=None):
619
+ """
620
+ Return query results to RDS database
621
+
622
+ :param rds_arn: Database instance or cluster's ARN
623
+ :param secret_arn: Secret Manager resource ARN
624
+ :param database_name: Database name to query on instance or cluster
625
+ :param sql_query: Query string on SQL syntax
626
+ :param query_params: List of dictionary values to put into the query string
627
+ :return response: Records queried from the RDS database
628
+
629
+ >>> response = download_rdstable(rds_arn='arn:rds:mycluster', \
630
+ secret_arn='arn:secret:mysecret', \
631
+ database_name='mydb', \
632
+ sql_query='SELECT * FROM mytable WHERE id = :paramId', \
633
+ query_params=[{'name': 'paramId', 'value': {'stringValue': 'myvalue'}}])
634
+ >>> response = [{'date': '2021-06-07'}, {'name': 'John Doe'}, {'salary': 1000}]
635
+ """
636
+ client = boto3.client('rds-data',
637
+ region_name='us-east-1',
638
+ aws_access_key_id=self.access_key,
639
+ aws_secret_access_key=self.secret_key)
640
+ try:
641
+ # Query project table
642
+ response = client.execute_statement(parameters=query_params,
643
+ resourceArn=rds_arn,
644
+ secretArn=secret_arn,
645
+ database=database_name,
646
+ sql=sql_query)
647
+ except client.exceptions.BadRequestException as err:
648
+ print(f'Incorrect request. Please check query syntax and parameters: {err}')
649
+ return False
650
+ return response['records']
651
+
652
+ def download_csv(self,
653
+ q_name,
654
+ datalake_path=None,
655
+ sep=',',
656
+ index_col=None,
657
+ usecols=None,
658
+ num_records=None,
659
+ dayfirst=False,
660
+ compression='infer',
661
+ encoding='utf-8',
662
+ date_cols=None,
663
+ types=None,
664
+ thousands=None,
665
+ decimal='.',
666
+ low_memory=True,
667
+ use_multipart=True):
668
+ """Return a dataframe from a csv file stored in the datalake
669
+
670
+ OPTIMIZED VERSION with S3 Multipart Transfer support for faster downloads.
671
+
672
+ :param q_name: Plain file (.csv) to download and store in a dataframe
673
+ :param datalake_path: Path to download the file from the S3 datalake. Default None.
674
+ :param sep: Field delimiter of the downloaded file. Default ','
675
+ :param index_col: Column(s) to use as the row labels of the DataFrame, either given as string name or column index.
676
+ :param usecols: Columns to use in returning dataframe.
677
+ :param num_records: Number of records to fetch from the source. Default None
678
+ :param dayfirst: DD/MM format dates, international and European format. Default False
679
+ :param compression: For on-the-fly decompression of on-disk data. Default 'infer'
680
+ :param encoding: Encoding to use for UTF when reading/writing. Default 'utf-8'
681
+ :param date_cols: List of date columns to parse as datetime type. Default None
682
+ :param types: Dict with data columns as keys and data types as values. Default None
683
+ :param thousands: Thousands separator
684
+ :param decimal: Decimal separator. Default '.'
685
+ :param low_memory: Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. Default True
686
+ :param use_multipart: Enable S3 multipart transfer for files >50MB. Default True
687
+ :return df: Dataframe containing the data from the file stored in the datalake
688
+
689
+ >>> df = download_csv(q_name='Q', datalake_path='as-is/folder')
690
+ >>> df
691
+ var1 var2 var3
692
+ idx0 1 2 3
693
+ """
694
+ start_time = time.time()
695
+ print(f"[DOWNLOAD_CSV] Starting: {q_name}.csv")
696
+
697
+ # Configure boto3 client with larger connection pool for multipart transfers
698
+ botocore_config = BotocoreConfig(
699
+ max_pool_connections=100, # Increased from default 10 to support high concurrency
700
+ retries={'max_attempts': 3, 'mode': 'adaptive'}
701
+ )
702
+ s3_client = boto3.client(
703
+ 's3',
704
+ region_name=self.region,
705
+ aws_access_key_id=self.access_key,
706
+ aws_secret_access_key=self.secret_key,
707
+ config=botocore_config
708
+ )
709
+ file_path = os.path.join(Config.LOCAL_PATH, q_name + '.csv')
710
+
711
+ # Build S3 key
712
+ if datalake_path is None:
713
+ s3_key = q_name + '.csv'
714
+ else:
715
+ s3_key = os.path.join(datalake_path, q_name, q_name + '.csv')
716
+
717
+ print(f"[DOWNLOAD_CSV] S3 path: s3://{self.datalake}/{s3_key}")
718
+
719
+ try:
720
+ # PHASE 1 OPTIMIZATION: Get file size and create transfer config
721
+ file_size = self._get_s3_file_size(s3_client, self.datalake, s3_key)
722
+
723
+ transfer_config = None
724
+ if use_multipart and file_size and file_size > 50 * 1024 * 1024: # > 50MB
725
+ transfer_config = self._create_transfer_config(file_size, s3_client)
726
+ print(f"[DOWNLOAD_CSV] Using multipart transfer | Size: {file_size / (1024*1024):.2f} MB")
727
+ elif file_size:
728
+ print(f"[DOWNLOAD_CSV] Using single-part transfer | Size: {file_size / (1024*1024):.2f} MB")
729
+
730
+ # Download file with optimized config
731
+ download_start = time.time()
732
+ if transfer_config:
733
+ s3_client.download_file(
734
+ Bucket=self.datalake,
735
+ Key=s3_key,
736
+ Filename=file_path,
737
+ Config=transfer_config
738
+ )
739
+ else:
740
+ s3_client.download_file(self.datalake, s3_key, file_path)
741
+
742
+ download_time = time.time() - download_start
743
+
744
+ # Get file size if not already obtained
745
+ if not file_size:
746
+ file_size = os.path.getsize(file_path)
747
+
748
+ speed_mbps = (file_size / (1024*1024)) / download_time if download_time > 0 else 0
749
+ print(f"[DOWNLOAD_CSV] Download completed: {download_time:.2f}s | Size: {file_size / (1024*1024):.2f} MB ({speed_mbps:.2f} MB/s)")
750
+
751
+ # PHASE 2 OPTIMIZATION: Get optimized read parameters
752
+ read_params = self._get_optimized_read_params(file_size, date_cols)
753
+
754
+ # Read CSV with optimizations
755
+ read_start = time.time()
756
+
757
+ # Build read arguments, user params override optimizations
758
+ read_kwargs = {
759
+ 'filepath_or_buffer': file_path,
760
+ 'sep': sep,
761
+ 'index_col': index_col,
762
+ 'usecols': usecols,
763
+ 'nrows': num_records,
764
+ 'dayfirst': dayfirst,
765
+ 'compression': compression,
766
+ 'encoding': encoding,
767
+ 'parse_dates': date_cols,
768
+ 'thousands': thousands,
769
+ 'decimal': decimal,
770
+ 'dtype': types
771
+ }
772
+
773
+ # Add optimized params (user's low_memory takes precedence)
774
+ if low_memory is True:
775
+ read_kwargs['low_memory'] = read_params.get('low_memory', True)
776
+ else:
777
+ read_kwargs['low_memory'] = low_memory
778
+
779
+ # Add other optimization params that don't conflict with user params
780
+ for key, value in read_params.items():
781
+ if key not in ['low_memory', '_suggest_iterator'] and key not in read_kwargs:
782
+ read_kwargs[key] = value
783
+
784
+ df = pd.read_csv(**read_kwargs)
785
+
786
+ read_time = time.time() - read_start
787
+ print(f"[DOWNLOAD_CSV] Read completed: {read_time:.2f}s | Shape: {df.shape[0]:,} rows x {df.shape[1]} cols")
788
+
789
+ except ClientError as err:
790
+ print(f"[DOWNLOAD_CSV] ERROR: Connection failed")
791
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
792
+ raise
793
+ except FileNotFoundError as err:
794
+ print(f"[DOWNLOAD_CSV] ERROR: File not found")
795
+ self.logger.exception(f'No csv file found. Please check paths: {err}')
796
+ raise
797
+ finally:
798
+ # Clean up temporary file
799
+ if os.path.exists(file_path):
800
+ try:
801
+ os.remove(file_path)
802
+ except:
803
+ pass
804
+
805
+ total_time = time.time() - start_time
806
+ print(f"[DOWNLOAD_CSV] Total time: {total_time:.2f}s (download: {download_time:.2f}s, read: {read_time:.2f}s)")
807
+
808
+ return df
809
+
810
+ def download_json_file(self, json_name=None, datalake_path=None):
811
+ """
812
+ Return a JSON file downloaded from the datalake
813
+
814
+ :param json_name: File name to save dataframe
815
+ :param datalake_path: Path to upload the Q to S3 datalake
816
+ :return response: JSON file contents
817
+
818
+ >>>
819
+ >>>
820
+ """
821
+ s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
822
+ file_path = os.path.join(Config.LOCAL_PATH, json_name + '.json')
823
+ try:
824
+ if datalake_path is None:
825
+ s3_client.download_file(self.datalake, json_name + '.json', file_path)
826
+ else:
827
+ s3_client.download_file(self.datalake, os.path.join(datalake_path, json_name + '.json'), file_path)
828
+ with open(file_path, 'r') as json_file:
829
+ response = json.load(json_file)
830
+ except ClientError as err:
831
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
832
+ except FileNotFoundError as err:
833
+ self.logger.exception(f'No object file found. Please check paths: {err}')
834
+ raise
835
+ return response
836
+
837
+ def download_csv_from_bucket(self,
838
+ datalake=None,
839
+ datalake_path=None,
840
+ sep=',',
841
+ index_col=None,
842
+ usecols=None,
843
+ num_records=None,
844
+ dayfirst=False,
845
+ compression='infer',
846
+ encoding='utf-8',
847
+ date_cols=None,
848
+ types=None,
849
+ thousands=None,
850
+ decimal='.',
851
+ low_memory=True,
852
+ use_multipart=True):
853
+ """Return a dataframe from a file stored in a S3 bucket
854
+
855
+ OPTIMIZED VERSION with S3 Multipart Transfer support for faster downloads.
856
+
857
+ :param datalake: S3 bucket name
858
+ :param datalake_path: Path to download the file from the bucket. Do not include datalake name. Default None.
859
+ :param sep: Field delimiter of the downloaded file. Default ','
860
+ :param index_col: Column(s) to use as the row labels of the DataFrame, either given as string name or column index.
861
+ :param usecols: Columns to use in returning dataframe.
862
+ :param num_records: Number of records to fetch from the source. Default None
863
+ :param dayfirst: DD/MM format dates, international and European format. Default False
864
+ :param compression: For on-the-fly decompression of on-disk data. Default 'infer'
865
+ :param encoding: Encoding to use for UTF when reading/writing. Default 'utf-8'
866
+ :param low_memory: Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. Default True
867
+ :param date_cols: List of date columns to parse as datetime type. Default None
868
+ :param types: Dict with data columns as keys and data types as values. Default None
869
+ :param thousands: Thousands separator
870
+ :param decimal: Decimal separator. Default '.'
871
+ :param use_multipart: Enable S3 multipart transfer for files >50MB. Default True
872
+ :return df: Dataframe containing the data from the file stored in the bucket
873
+
874
+ >>> df = download_csv_from_bucket(datalake='my-bucket', datalake_path='as-is/folder/file.csv')
875
+ >>> df
876
+ var1 var2 var3
877
+ idx0 1 2 3
878
+ """
879
+ start_time = time.time()
880
+ print(f"[DOWNLOAD_CSV_BUCKET] Starting: {datalake_path}")
881
+
882
+ # Configure boto3 client with larger connection pool for multipart transfers
883
+ botocore_config = BotocoreConfig(
884
+ max_pool_connections=100, # Increased from default 10 to support high concurrency
885
+ retries={'max_attempts': 3, 'mode': 'adaptive'}
886
+ )
887
+ s3_client = boto3.client(
888
+ 's3',
889
+ region_name=self.region,
890
+ aws_access_key_id=self.access_key,
891
+ aws_secret_access_key=self.secret_key,
892
+ config=botocore_config
893
+ )
894
+ file_path = os.path.join(Config.LOCAL_PATH, 'object.csv')
895
+
896
+ print(f"[DOWNLOAD_CSV_BUCKET] S3 path: s3://{datalake}/{datalake_path}")
897
+
898
+ try:
899
+ # PHASE 1 OPTIMIZATION: Get file size and create transfer config
900
+ file_size = self._get_s3_file_size(s3_client, datalake, datalake_path)
901
+
902
+ transfer_config = None
903
+ if use_multipart and file_size and file_size > 50 * 1024 * 1024: # > 50MB
904
+ transfer_config = self._create_transfer_config(file_size, s3_client)
905
+ print(f"[DOWNLOAD_CSV_BUCKET] Using multipart transfer | Size: {file_size / (1024*1024):.2f} MB")
906
+ elif file_size:
907
+ print(f"[DOWNLOAD_CSV_BUCKET] Using single-part transfer | Size: {file_size / (1024*1024):.2f} MB")
908
+
909
+ # Download file with optimized config
910
+ download_start = time.time()
911
+ if transfer_config:
912
+ s3_client.download_file(
913
+ Bucket=datalake,
914
+ Key=datalake_path,
915
+ Filename=file_path,
916
+ Config=transfer_config
917
+ )
918
+ else:
919
+ s3_client.download_file(datalake, datalake_path, file_path)
920
+
921
+ download_time = time.time() - download_start
922
+
923
+ # Get file size if not already obtained
924
+ if not file_size:
925
+ file_size = os.path.getsize(file_path)
926
+
927
+ speed_mbps = (file_size / (1024*1024)) / download_time if download_time > 0 else 0
928
+ print(f"[DOWNLOAD_CSV_BUCKET] Download completed: {download_time:.2f}s | Size: {file_size / (1024*1024):.2f} MB ({speed_mbps:.2f} MB/s)")
929
+
930
+ # PHASE 2 OPTIMIZATION: Get optimized read parameters
931
+ read_params = self._get_optimized_read_params(file_size, date_cols)
932
+
933
+ # Read CSV with optimizations
934
+ read_start = time.time()
935
+
936
+ # Build read arguments, user params override optimizations
937
+ read_kwargs = {
938
+ 'filepath_or_buffer': file_path,
939
+ 'sep': sep,
940
+ 'index_col': index_col,
941
+ 'usecols': usecols,
942
+ 'nrows': num_records,
943
+ 'dayfirst': dayfirst,
944
+ 'compression': compression,
945
+ 'encoding': encoding,
946
+ 'parse_dates': date_cols,
947
+ 'thousands': thousands,
948
+ 'decimal': decimal,
949
+ 'dtype': types
950
+ }
951
+
952
+ # Add optimized params (user's low_memory takes precedence)
953
+ if low_memory is True:
954
+ read_kwargs['low_memory'] = read_params.get('low_memory', True)
955
+ else:
956
+ read_kwargs['low_memory'] = low_memory
957
+
958
+ # Add other optimization params that don't conflict with user params
959
+ for key, value in read_params.items():
960
+ if key not in ['low_memory', '_suggest_iterator'] and key not in read_kwargs:
961
+ read_kwargs[key] = value
962
+
963
+ df = pd.read_csv(**read_kwargs)
964
+
965
+ read_time = time.time() - read_start
966
+ print(f"[DOWNLOAD_CSV_BUCKET] Read completed: {read_time:.2f}s | Shape: {df.shape[0]:,} rows x {df.shape[1]} cols")
967
+
968
+ except ClientError as err:
969
+ print(f"[DOWNLOAD_CSV_BUCKET] ERROR: Connection failed")
970
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
971
+ raise
972
+ except FileNotFoundError as err:
973
+ print(f"[DOWNLOAD_CSV_BUCKET] ERROR: File not found")
974
+ self.logger.exception(f'No object file found. Please check paths: {err}')
975
+ raise
976
+ finally:
977
+ # Clean up temporary file
978
+ if os.path.exists(file_path):
979
+ try:
980
+ os.remove(file_path)
981
+ except:
982
+ pass
983
+
984
+ total_time = time.time() - start_time
985
+ print(f"[DOWNLOAD_CSV_BUCKET] Total time: {total_time:.2f}s (download: {download_time:.2f}s, read: {read_time:.2f}s)")
986
+
987
+ return df
988
+
989
+ def download_object_csv(self,
990
+ datalake_path=None,
991
+ sep=',',
992
+ index_col=None,
993
+ usecols=None,
994
+ num_records=None,
995
+ dayfirst=False,
996
+ compression='infer',
997
+ encoding='utf-8',
998
+ date_cols=None,
999
+ types=None,
1000
+ thousands=None,
1001
+ decimal='.',
1002
+ low_memory=True):
1003
+ """Return a dataframe from a file stored in the datalake
1004
+
1005
+ :param datalake_path: Path to download the file from the S3 datalake. Do not include datalake name. Default None.
1006
+ :param sep: Field delimiter of the downloaded file. Default ','
1007
+ :param index_col: Column(s) to use as the row labels of the DataFrame, either given as string name or column index.
1008
+ :param usecols: Columns to use in returning dataframe.
1009
+ :param num_records: Number of records to fetch from the source. Default None
1010
+ :param dayfirst: DD/MM format dates, international and European format. Default False
1011
+ :param compression: For on-the-fly decompression of on-disk data. Default 'infer'
1012
+ :param encoding: Encoding to use for UTF when reading/writing. Default 'utf-8'
1013
+ :param low_memory: Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. Default True
1014
+ :param date_cols: List of date columns to parse as datetime type. Default None
1015
+ :param types: Dict with data columns as keys and data types as values. Default None
1016
+ :param thousands: Thousands separator
1017
+ :param decimal: Decimal separator. Default '.'
1018
+ :return df: Dataframe containing the data from the file stored in the datalake
1019
+
1020
+ >>> df = download_object_csv(datalake_path='as-is/folder/file.txt')
1021
+ >>> df
1022
+ var1 var2 var3
1023
+ idx0 1 2 3
1024
+ """
1025
+ s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1026
+ file_path = os.path.join(Config.LOCAL_PATH, 'object.dat')
1027
+ try:
1028
+ s3_client.download_file(self.datalake, os.path.join(datalake_path), file_path)
1029
+ df = pd.read_csv(file_path,
1030
+ sep=sep,
1031
+ index_col=index_col,
1032
+ usecols=usecols,
1033
+ nrows=num_records,
1034
+ dayfirst=dayfirst,
1035
+ compression=compression,
1036
+ encoding=encoding,
1037
+ low_memory=low_memory,
1038
+ thousands=thousands,
1039
+ parse_dates=date_cols,
1040
+ decimal=decimal,
1041
+ dtype=types)
1042
+ except ClientError as err:
1043
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1044
+ except FileNotFoundError as err:
1045
+ self.logger.exception(f'No object file found. Please check paths: {err}')
1046
+ raise
1047
+ return df
1048
+
1049
+ def download_txt(self,
1050
+ q_name,
1051
+ datalake_path=None,
1052
+ sep='\t',
1053
+ index_col=None,
1054
+ usecols=None,
1055
+ num_records=None,
1056
+ dayfirst=False,
1057
+ compression='infer',
1058
+ encoding='utf-8',
1059
+ date_cols=None,
1060
+ types=None,
1061
+ thousands=None,
1062
+ low_memory=True,
1063
+ decimal='.'):
1064
+ """Return a dataframe from a csv file stored in the datalake
1065
+
1066
+ :param q_name: Plain file (.txt) to download and store in a dataframe
1067
+ :param datalake_path: Path to download the file from the S3 datalake. Default None.
1068
+ :param sep: Field delimiter of the downloaded file. Default '\t'
1069
+ :param index_col: Column(s) to use as the row labels of the DataFrame, either given as string name or column index.
1070
+ :param usecols: Columns to use in returning dataframe.
1071
+ :param num_records: Number of records to fetch from the source. Default None
1072
+ :param dayfirst: DD/MM format dates, international and European format. Default False
1073
+ :param compression: For on-the-fly decompression of on-disk data. Default 'infer'
1074
+ :param encoding: Encoding to use for UTF when reading/writing. Default 'utf-8'
1075
+ :param date_cols: List of date columns to parse as datetime type. Default None
1076
+ :param types: Dict with data columns as keys and data types as values. Default None
1077
+ :param thousands: Thousands separator.
1078
+ :param decimal: Decimal separator. Default '.'
1079
+ :param low_memory: Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. Default True
1080
+ :return df: Dataframe containing the data from the file stored in the datalake
1081
+
1082
+ >>> df = download_txt(q_name='Q', datalake_path='as-is/folder')
1083
+ >>> df
1084
+ var1 var2 var3
1085
+ idx0 1 2 3
1086
+ """
1087
+ s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1088
+ file_path = os.path.join(Config.LOCAL_PATH, q_name + '.txt')
1089
+ try:
1090
+ if datalake_path is None:
1091
+ s3_client.download_file(self.datalake, q_name + '.txt', file_path)
1092
+ else:
1093
+ s3_client.download_file(self.datalake, os.path.join(datalake_path, q_name + '.txt'), file_path)
1094
+
1095
+ df = pd.read_csv(file_path,
1096
+ sep=sep,
1097
+ index_col=index_col,
1098
+ usecols=usecols,
1099
+ nrows=num_records,
1100
+ dayfirst=dayfirst,
1101
+ compression=compression,
1102
+ encoding=encoding,
1103
+ low_memory=low_memory,
1104
+ parse_dates=date_cols,
1105
+ thousands=thousands,
1106
+ decimal=decimal,
1107
+ dtype=types)
1108
+ except ClientError as err:
1109
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1110
+ except FileNotFoundError as err:
1111
+ self.logger.exception(f'No csv file found. Please check paths: {err}')
1112
+ raise
1113
+ return df
1114
+
1115
+ def download_all_objects_csv(self,
1116
+ datalake_path=None,
1117
+ sep=',',
1118
+ index_col=None,
1119
+ num_records=None,
1120
+ dayfirst=False,
1121
+ compression='infer',
1122
+ encoding='utf-8',
1123
+ low_memory=True,
1124
+ date_cols=None,
1125
+ types=None,
1126
+ thousands=None,
1127
+ decimal='.'):
1128
+ """Return a dataframe from a file stored in the datalake
1129
+
1130
+ :param datalake_path: Path to download the file from the S3 datalake. Do not include datalake name. Default None.
1131
+ :param sep: Field delimiter of the downloaded file. Default ','
1132
+ :param index_col: Column(s) to use as the row labels of the DataFrame, either given as string name or column index.
1133
+ :param num_records: Number of records to fetch from the source. Default None
1134
+ :param dayfirst: DD/MM format dates, international and European format. Default False
1135
+ :param compression: For on-the-fly decompression of on-disk data. Default 'infer'
1136
+ :param encoding: Encoding to use for UTF when reading/writing. Default 'utf-8'
1137
+ :param low_memory: Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. Default True
1138
+ :param date_cols: List of date columns to parse as datetime type. Default None
1139
+ :param types: Dict with data columns as keys and data types as values. Default None
1140
+ :param thousands: Thousands separator
1141
+ :param decimal: Decimal separator. Default '.'
1142
+ :return df: Dataframe containing the data from the file stored in the datalake
1143
+
1144
+ >>> df = download_all_objects_csv(datalake_path='as-is/folder/file')
1145
+ >>> df
1146
+ var1 var2 var3
1147
+ idx0 1 2 3
1148
+ """
1149
+ s3_resource = boto3.resource('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1150
+
1151
+ try:
1152
+ df = pd.DataFrame()
1153
+ datalake = s3_resource.Bucket(self.datalake)
1154
+ objects = datalake.objects.filter(Prefix=datalake_path)
1155
+ for obj in objects:
1156
+ path, filename = os.path.split(obj.key)
1157
+ if filename != '_SUCCESS' and filename != '_CHECK':
1158
+ datalake.download_file(obj.key, os.path.join('/tmp', filename))
1159
+ df_tmp = pd.read_csv(os.path.join('/tmp', filename),
1160
+ sep=sep,
1161
+ index_col=index_col,
1162
+ nrows=num_records,
1163
+ dayfirst=dayfirst,
1164
+ compression=compression,
1165
+ encoding=encoding,
1166
+ low_memory=low_memory,
1167
+ parse_dates=date_cols,
1168
+ thousands=thousands,
1169
+ decimal=decimal,
1170
+ dtype=types)
1171
+ df = pd.concat([df, df_tmp], axis='rows').drop_duplicates()
1172
+ except ClientError as err:
1173
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1174
+ except FileNotFoundError as err:
1175
+ self.logger.exception(f'No object file found. Please check paths: {err}')
1176
+ raise
1177
+ return df
1178
+
1179
+ def download_dynamodb(self, table_name, tenant_id):
1180
+ """
1181
+ Return a dataframe with the data fetched from DynamoDB
1182
+
1183
+ :param table_name: Table name in DynamoDB table
1184
+ :param tenant_id: Partition column mapping tenant's ID to whom belongs the records
1185
+ :return df: Dataframe to store records fetched from DynamoDB
1186
+ >>> df = download_dynamodb(table_name='sampleTbl', tenant_id='1234')
1187
+ >>> df =
1188
+ tenantId Date Attr
1189
+ idx0 A121 2020-12-01 3
1190
+ """
1191
+ dydb_client = boto3.client('dynamodb', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1192
+ dynamodb_session = Session(aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key, region_name=self.region)
1193
+ dydb = dynamodb_session.resource('dynamodb')
1194
+ try:
1195
+ dynamo_tbl = dydb.Table(table_name)
1196
+ response = dynamo_tbl.query(
1197
+ KeyConditionExpression=Key('tenantId').eq(md5(tenant_id.encode('utf-8')).hexdigest()) &\
1198
+ Key('Fecha').between('2010-01-01', '2025-12-31')
1199
+ )
1200
+ items = response['Items']
1201
+ except dydb_client.exceptions.ResourceNotFoundException as err:
1202
+ print(f'Table not found. Please check names :{err}')
1203
+ return False
1205
+ return items
1206
+
1207
+ def download_excel(self,
1208
+ q_name,
1209
+ sheet_name,
1210
+ datalake_path=None,
1211
+ index_col=None,
1212
+ usecols=None,
1213
+ num_records=None,
1214
+ date_cols=None,
1215
+ types=None,
1216
+ header_=0,
1217
+ skiprows_=None):
1218
+ """Return a dataframe from a csv file stored in the datalake
1219
+
1220
+ :param q_name: Excel file to download and store in a dataframe. Include extension xls, xlsx, ods, etc.
1221
+ :param sheet_name: Excel sheet to download and store in a dataframe
1222
+ :param datalake_path: Path to download the file from the S3 datalake. Default None.
1223
+ :param index_col: Column(s) to use as the row labels of the DataFrame, either given as string name or column index.
1224
+ :param usecols: Columns to use in returning dataframe.
1225
+ :param num_records: Number of records to fetch from the source. Default None
1226
+ :param date_cols: List of date columns to parse as datetime type. Default None
1227
+ :param types: Dict with data columns as keys and data types as values. Default None
1228
+ :return df: Dataframe containing the data from the file stored in the datalake
1229
+
1230
+ >>> df = download_excel(q_name='Q', sheet_name='sheet1', datalake_path='as-is/folder')
1231
+ >>> df
1232
+ var1 var2 var3
1233
+ idx0 1 2 3
1234
+ """
1235
+ start_time = time.time()
1236
+ print(f"[DOWNLOAD_EXCEL] Starting: {q_name} | Sheet: {sheet_name}")
1237
+
1238
+ s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1239
+ file_path = os.path.join(Config.LOCAL_PATH, q_name)
1240
+
1241
+ # Build S3 key
1242
+ if datalake_path is None:
1243
+ s3_key = q_name
1244
+ else:
1245
+ s3_key = os.path.join(datalake_path, q_name)
1246
+
1247
+ print(f"[DOWNLOAD_EXCEL] S3 path: s3://{self.datalake}/{s3_key}")
1248
+
1249
+ try:
1250
+ # Download file
1251
+ download_start = time.time()
1252
+ s3_client.download_file(self.datalake, s3_key, file_path)
1253
+ download_time = time.time() - download_start
1254
+
1255
+ # Get file size
1256
+ file_size = os.path.getsize(file_path)
1257
+ speed_mbps = (file_size / (1024*1024)) / download_time if download_time > 0 else 0
1258
+ print(f"[DOWNLOAD_EXCEL] Download completed: {download_time:.2f}s | Size: {file_size / (1024*1024):.2f} MB ({speed_mbps:.2f} MB/s)")
1259
+
1260
+ # Read Excel file
1261
+ read_start = time.time()
1262
+ df = pd.read_excel(file_path,
1263
+ sheet_name=sheet_name,
1264
+ index_col=index_col,
1265
+ usecols=usecols,
1266
+ engine='openpyxl',
1267
+ nrows=num_records,
1268
+ parse_dates=date_cols,
1269
+ dtype=types,
1270
+ header=header_,
1271
+ skiprows=skiprows_)
1272
+ df = df.dropna(how='all')
1273
+
1274
+ read_time = time.time() - read_start
1275
+ print(f"[DOWNLOAD_EXCEL] Read completed: {read_time:.2f}s | Shape: {df.shape[0]:,} rows x {df.shape[1]} cols")
1276
+
1277
+ except ClientError as err:
1278
+ print(f"[DOWNLOAD_EXCEL] ERROR: Connection failed")
1279
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1280
+ raise
1281
+ except FileNotFoundError as err:
1282
+ print(f"[DOWNLOAD_EXCEL] ERROR: File or sheet not found")
1283
+ self.logger.exception(f'No excel file or sheet name found. Please check paths: {err}')
1284
+ raise
1285
+ finally:
1286
+ # Clean up temporary file
1287
+ if os.path.exists(file_path):
1288
+ try:
1289
+ os.remove(file_path)
1290
+ except:
1291
+ pass
1292
+
1293
+ total_time = time.time() - start_time
1294
+ print(f"[DOWNLOAD_EXCEL] Total time: {total_time:.2f}s (download: {download_time:.2f}s, read: {read_time:.2f}s)")
1295
+
1296
+ return df
1297
+
1298
+ def download_excel_from_bucket(self,
1299
+ datalake=None,
1300
+ datalake_path=None,
1301
+ sheet_name=0,
1302
+ index_col=None,
1303
+ usecols=None,
1304
+ num_records=None,
1305
+ date_cols=None,
1306
+ types=None,
1307
+ header_=0,
1308
+ skiprows_=None):
1309
+ """Return a dataframe from a file stored in a S3 bucket
1310
+
1311
+ :param datalake: S3 bucket name
1312
+ :param datalake_path: Path to download the file from the bucket. Do not include datalake name. Default None.
1313
+ :param sheet_name: Excel sheet to download and store in a dataframe
1314
+ :param index_col: Column(s) to use as the row labels of the DataFrame, either given as string name or column index.
1315
+ :param usecols: Columns to use in returning dataframe.
1316
+ :param num_records: Number of records to fetch from the source. Default None
1317
+ :param date_cols: List of date columns to parse as datetime type. Default None
1318
+ :param types: Dict with data columns as keys and data types as values. Default None
1319
+ :return df: Dataframe containing the data from the file stored in the datalake
1320
+
1321
+ >>> df = download_excel_from_bucket(datalake='my-bucket', datalake_path='as-is/folder/file.xlsx')
1322
+ >>> df
1323
+ var1 var2 var3
1324
+ idx0 1 2 3
1325
+ """
1326
+ start_time = time.time()
1327
+ print(f"[DOWNLOAD_EXCEL_BUCKET] Starting: {datalake_path} | Sheet: {sheet_name}")
1328
+
1329
+ s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1330
+ file_path = os.path.join(Config.LOCAL_PATH, 'object.xlsx')
1331
+
1332
+ print(f"[DOWNLOAD_EXCEL_BUCKET] S3 path: s3://{datalake}/{datalake_path}")
1333
+
1334
+ try:
1335
+ # Download file
1336
+ download_start = time.time()
1337
+ s3_client.download_file(datalake, datalake_path, file_path)
1338
+ download_time = time.time() - download_start
1339
+
1340
+ # Get file size
1341
+ file_size = os.path.getsize(file_path)
1342
+ speed_mbps = (file_size / (1024*1024)) / download_time if download_time > 0 else 0
1343
+ print(f"[DOWNLOAD_EXCEL_BUCKET] Download completed: {download_time:.2f}s | Size: {file_size / (1024*1024):.2f} MB ({speed_mbps:.2f} MB/s)")
1344
+
1345
+ # Read Excel file
1346
+ read_start = time.time()
1347
+ df = pd.read_excel(file_path,
1348
+ sheet_name=sheet_name,
1349
+ index_col=index_col,
1350
+ usecols=usecols,
1351
+ engine='openpyxl',
1352
+ nrows=num_records,
1353
+ parse_dates=date_cols,
1354
+ dtype=types,
1355
+ header=header_,
1356
+ skiprows=skiprows_)
1357
+ df = df.dropna(how='all')
1358
+
1359
+ read_time = time.time() - read_start
1360
+ print(f"[DOWNLOAD_EXCEL_BUCKET] Read completed: {read_time:.2f}s | Shape: {df.shape[0]:,} rows x {df.shape[1]} cols")
1361
+
1362
+ except ClientError as err:
1363
+ print(f"[DOWNLOAD_EXCEL_BUCKET] ERROR: Connection failed")
1364
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1365
+ raise
1366
+ except FileNotFoundError as err:
1367
+ print(f"[DOWNLOAD_EXCEL_BUCKET] ERROR: File not found")
1368
+ self.logger.exception(f'No object file found. Please check paths: {err}')
1369
+ raise
1370
+ finally:
1371
+ # Clean up temporary file
1372
+ if os.path.exists(file_path):
1373
+ try:
1374
+ os.remove(file_path)
1375
+ except OSError:
1376
+ pass
1377
+
1378
+ total_time = time.time() - start_time
1379
+ print(f"[DOWNLOAD_EXCEL_BUCKET] Total time: {total_time:.2f}s (download: {download_time:.2f}s, read: {read_time:.2f}s)")
1380
+
1381
+ return df
1382
+
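+ # Illustrative usage sketch (not part of the package): shows how the optional `types` and
+ # `date_cols` arguments of download_excel_from_bucket can be combined. The bucket name, path,
+ # and column names below are assumptions for illustration, and `io` stands for an
+ # already-constructed IO_Optimized instance.
+ # >>> df = io.download_excel_from_bucket(datalake='my-bucket',
+ # ...                                     datalake_path='as-is/folder/file.xlsx',
+ # ...                                     sheet_name='Sheet1',
+ # ...                                     date_cols=['Fecha'],
+ # ...                                     types={'Item': str, 'Location': str})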
1383
+ def download_xml(self, url_=None, header_=None, body_=None):
1384
+ """Return a response in XML format from a SOAP web service
1385
+
1386
+ :param url_: URL endpoint to access SOAP web service
1387
+ :param header_: Header in rest api configuration parameters
1388
+ :param body_: Body input parameters
1389
+ :return response: Plain text with data xml
1390
+
1391
+ address = 'http://200.200.200.200:81/service.asmx'
1392
+ headers = {'Content-Type':'text/xml;charset=UTF-8'}
1393
+ body = ""<soapenv:Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/" xmlns:tem="http://tempuri.org/">
1394
+ <soapenv:Header/>
1395
+ <soapenv:Body>
1396
+ <tem:EjecutarConsultaXML>
1397
+ <!--Optional:-->
1398
+ <tem:pvstrxmlParametros>
1399
+ <![CDATA[
1400
+ <Consulta>
1401
+ <NombreConexion>Datup_Real</NombreConexion>
1402
+ <IdCia>2</IdCia>
1403
+ <IdProveedor>Analytics</IdProveedor>
1404
+ <IdConsulta>CONSULTA_VENTAS</IdConsulta>
1405
+ <Usuario>myuser</Usuario>
1406
+ <Clave>mypassword</Clave>
1407
+ <Parametros>
1408
+ <p_periodo_ini>202105</p_periodo_ini>
1409
+ <p_periodo_fin>202105</p_periodo_fin>
1410
+ </Parametros>
1411
+ </Consulta>]]>
1412
+ </tem:pvstrxmlParametros>
1413
+ </tem:EjecutarConsultaXML>
1414
+ </soapenv:Body>
1415
+ </soapenv:Envelope>""
1416
+
1417
+ >>> response = download_xml(url_=address, header_=headers, body_=body)
1418
+ >>> response
1419
+ '<?xml version="1.0" encoding="utf-8"?><soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
1420
+ xmlns:xsd="http://www.w3.org/2001/XMLSchema"><soap:Body><EjecutarConsultaXMLResponse xmlns="http://tempuri.org/"><EjecutarConsultaXMLResult><xs:schema id="NewDataSet"
1421
+ xmlns="" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata"><xs:element name="NewDataSet" msdata:IsDataSet="true"
1422
+ msdata:UseCurrentLocale="true"><xs:complexType><xs:choice minOccurs="0" maxOccurs="unbounded"><xs:element name="Resultado"><xs:complexType><xs:sequence><xs:element
1423
+ name="Compañia" type="xs:short" minOccurs="0" /><xs:element name="Llave_x0020_Documento" type="xs:int" minOccurs="0"'
1424
+ """
1425
+ try:
1426
+ r = requests.post(url_, headers=header_, data=body_, allow_redirects=True)
1427
+ response = r.text
1428
+ except requests.exceptions.HTTPError as err:
1429
+ self.logger.exception(f'Http error: {err}')
1430
+ raise
1431
+ except requests.exceptions.ConnectionError as err:
1432
+ self.logger.exception(f'Error connecting: {err}')
1433
+ raise
1434
+ except requests.exceptions.Timeout as err:
1435
+ self.logger.exception(f'Timeout error: {err}')
1436
+ raise
1437
+ except requests.exceptions.RequestException as err:
1438
+ self.logger.exception(f'Oops: Something else: {err}')
1439
+ raise
1440
+ return response
1441
+
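+ # Illustrative sketch (not part of the package): the plain-text XML returned by download_xml
+ # can be parsed with the standard library before further processing. The 'Resultado' tag and
+ # the use of `io` as an IO_Optimized instance are assumptions for illustration.
+ # >>> import xml.etree.ElementTree as ET
+ # >>> response = io.download_xml(url_=address, header_=headers, body_=body)
+ # >>> root = ET.fromstring(response.encode('utf-8'))
+ # >>> rows = [{child.tag: child.text for child in elem} for elem in root.iter('Resultado')]
+ # >>> df = pd.DataFrame(rows)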
1442
+ def download_parquet(self, q_name, datalake_path=None, columns=None, engine='pyarrow', filters=None):
1443
+ """Return a dataframe from a parquet file stored in the datalake
1444
+
1445
+ :param q_name: File name (without extension) to download and store in a dataframe.
1446
+ :param datalake_path: Path to download the file from the S3 datalake. Default None.
1447
+ :param columns: Subset of columns to read from the Parquet file. Default None (reads all columns).
1448
+ :param engine: Engine to use for reading Parquet files. Default 'pyarrow'.
1449
+ :param filters: Filters to apply to the Parquet file rows while reading. Default None.
1450
+ :return df: DataFrame containing the data from the Parquet file stored in the datalake.
1451
+
1452
+ >>> df = download_parquet(q_name='Q', datalake_path='as-is/folder')
1453
+ >>> df
1454
+ var1 var2 var3
1455
+ idx0 1 2 3
1456
+ """
1457
+ start_time = time.time()
1458
+ print(f"[DOWNLOAD_PARQUET] Starting: {q_name}.parquet | Engine: {engine}")
1459
+
1460
+ s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1461
+ file_path = os.path.join(Config.LOCAL_PATH, q_name + '.parquet')
1462
+
1463
+ # Build S3 key
1464
+ if datalake_path is None:
1465
+ s3_key = q_name + '.parquet'
1466
+ else:
1467
+ s3_key = os.path.join(datalake_path, q_name, q_name + '.parquet')
1468
+
1469
+ print(f"[DOWNLOAD_PARQUET] S3 path: s3://{self.datalake}/{s3_key}")
1470
+
1471
+ try:
1472
+ # Download the Parquet file from S3
1473
+ download_start = time.time()
1474
+ s3_client.download_file(self.datalake, s3_key, file_path)
1475
+ download_time = time.time() - download_start
1476
+
1477
+ # Get file size
1478
+ file_size = os.path.getsize(file_path)
1479
+ speed_mbps = (file_size / (1024*1024)) / download_time if download_time > 0 else 0
1480
+ print(f"[DOWNLOAD_PARQUET] Download completed: {download_time:.2f}s | Size: {file_size / (1024*1024):.2f} MB ({speed_mbps:.2f} MB/s)")
1481
+
1482
+ # Read the Parquet file into a DataFrame
1483
+ read_start = time.time()
1484
+ df = pd.read_parquet(file_path, columns=columns, engine=engine, filters=filters)
1485
+
1486
+ read_time = time.time() - read_start
1487
+ print(f"[DOWNLOAD_PARQUET] Read completed: {read_time:.2f}s | Shape: {df.shape[0]:,} rows x {df.shape[1]} cols")
1488
+
1489
+ except ClientError as err:
1490
+ print(f"[DOWNLOAD_PARQUET] ERROR: Connection failed")
1491
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1492
+ raise
1493
+ except FileNotFoundError as err:
1494
+ print(f"[DOWNLOAD_PARQUET] ERROR: File not found")
1495
+ self.logger.exception(f'No Parquet file found. Please check paths: {err}')
1496
+ raise
1497
+ except Exception as e:
1498
+ print(f"[DOWNLOAD_PARQUET] ERROR: {e}")
1499
+ self.logger.exception(f'Failed to read the Parquet file: {e}')
1500
+ raise
1501
+ finally:
1502
+ # Clean up temporary file
1503
+ if os.path.exists(file_path):
1504
+ try:
1505
+ os.remove(file_path)
1506
+ except OSError:
1507
+ pass
1508
+
1509
+ total_time = time.time() - start_time
1510
+ print(f"[DOWNLOAD_PARQUET] Total time: {total_time:.2f}s (download: {download_time:.2f}s, read: {read_time:.2f}s)")
1511
+
1512
+ return df
1513
+
1514
+
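+ # Illustrative usage sketch (not part of the package): with the default pyarrow engine,
+ # `columns` and `filters` push projection and row filtering down to the Parquet reader.
+ # The dataset name, path, and column names below are assumptions for illustration.
+ # >>> df = io.download_parquet(q_name='Qprep',
+ # ...                          datalake_path='as-is/folder',
+ # ...                          columns=['Date', 'Item', 'Target'],
+ # ...                          filters=[('Target', '>', 0)])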
1515
+ def download_models(self, datalake_path=None):
1516
+ """Returns True as successful download of the n_backtests models trained by attup model
1517
+
1518
+ :param datalake_path: Path to download the file from the S3 datalake. Default None.
1519
+ :return: True if success, else False.
1520
+
1521
+ >>> models = download_models(datalake_path='path/to/data')
1522
+ >>> True
1523
+ """
1524
+
1525
+ s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1526
+ for i in range(self.backtests + 1):
1527
+ q_name = "model" + str(i)
1528
+ file_path = os.path.join(Config.LOCAL_PATH, q_name + '.h5')
1529
+ print(file_path)
1530
+ try:
1531
+ if datalake_path is None:
1532
+ s3_client.download_file(self.datalake, q_name + '.h5', file_path)
1533
+ else:
1534
+ s3_client.download_file(self.datalake, os.path.join(datalake_path, "models", q_name + '.h5'), file_path)
1535
+ except ClientError as err:
1536
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1537
+ except FileNotFoundError as err:
1538
+ self.logger.exception(f'No model file found. Please check paths: {err}')
1539
+ raise
1540
+ return True
1541
+
1542
+ def download_models_tft(self, datalake_path=None):
1543
+ """Returns True as successful download of the n_backtests models trained by attup model
1544
+
1545
+ :param datalake_path: Path to download the file from the S3 datalake. Default None.
1546
+ :return: True if success, else False.
1547
+
1548
+ >>> models = download_models_tft(datalake_path='path/to/data')
1549
+ >>> True
1550
+ """
1551
+
1552
+ s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1553
+ for i in range(self.backtests + 1):
1554
+ q_name = "model" + str(i)
1555
+ file_path = os.path.join(Config.LOCAL_PATH, q_name + '.ckpt')
1556
+ print(file_path)
1557
+ try:
1558
+ if datalake_path is None:
1559
+ s3_client.download_file(self.datalake, q_name + '.ckpt', file_path)
1560
+ else:
1561
+ s3_client.download_file(self.datalake, os.path.join(datalake_path, "models", q_name + '.ckpt'), file_path)
1562
+ except ClientError as err:
1563
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1564
+ except FileNotFoundError as err:
1565
+ self.logger.exception(f'No model file found. Please check paths: {err}')
1566
+ raise
1567
+ return True
1568
+
1569
+ def upload_csv(self, df, q_name, datalake_path, sep=',', encoding='utf-8', date_format='%Y-%m-%d', lineterminator=None):
1570
+ """Return a success or failure boolean attempting to upload a local file to the datalake
1571
+
1572
+ :param df: Dataframe to upload
1573
+ :param q_name: File name to save dataframe
1574
+ :param datalake_path: Path to upload the Q to S3 datalake
1575
+ :param sep: Field delimiter for the output file. Default ','
1576
+ :param date_format: Format string for datetime objects of output file. Default '%Y-%m-%d'
1577
+ :param encoding: A string representing the encoding to use in the output file. Default 'utf-8'
+ :param lineterminator: The newline character or character sequence to use in the output file. Default None
1578
+ :return: True if success, else False.
1579
+
1580
+ >>> upload_csv(df=df, q_name='Q', datalake_path='as-is/folder')
1581
+ >>> True
1582
+ """
1583
+ file_path = os.path.join(Config.LOCAL_PATH, q_name + '.csv')
1584
+ df.to_csv(file_path, sep=sep, encoding=encoding, date_format=date_format, index=False, lineterminator=lineterminator)
1585
+ s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1586
+ try:
1587
+ response = s3_client.upload_file(file_path, self.datalake, os.path.join(datalake_path, q_name, q_name + '.csv'))
1588
+ except ClientError as err:
1589
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1590
+ return False
1591
+ except FileNotFoundError as err:
1592
+ self.logger.exception(f'No CSV file found. Please check paths: {err}')
1593
+ return False
1594
+ return True
1595
+
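+ # Illustrative note (not part of the package): upload_csv writes the dataframe to
+ # s3://<self.datalake>/<datalake_path>/<q_name>/<q_name>.csv, so the example below would land
+ # at 'as-is/folder/Q/Q.csv'. Names are assumptions and `io` is an IO_Optimized instance.
+ # >>> ok = io.upload_csv(df=df, q_name='Q', datalake_path='as-is/folder', sep=';')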
1596
+ def upload_dynamodb(self, df, table_name, tenant_id, sort_col):
1597
+ """
1598
+ Return a success or failure boolean attempting to upload timeseries data to DynamoDB
1599
+
1600
+ :param df: Dataframe to upload to DynamoDB table
1601
+ :param table_name: Table name in DynamoDB table
1602
+ :param tenant_id: Partition key value with the tenant's ID to which the records belong
1603
+ :param sort_col: Sort key column, usually mapped to the date column
1604
+ :return: True if success, else False.
1605
+
1606
+ >>> upload_dynamodb(df=df, table_name='sampleTbl', tenant_id='acme', sort_col='Date')
1607
+ >>> True
1608
+ """
1609
+ dydb_client = boto3.client('dynamodb', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1610
+ dynamodb_session = Session(aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key, region_name=self.region)
1611
+ dydb = dynamodb_session.resource('dynamodb')
1612
+ try:
1613
+ dynamo_tbl = dydb.Table(table_name)
1614
+ with dynamo_tbl.batch_writer() as batch:
1615
+ for row in df.itertuples(index=False):
1616
+ record = {}
1617
+ record.update({'tenantId': md5(tenant_id.encode('utf-8')).hexdigest()})
1618
+ record.update({sort_col: row[0].strftime('%Y-%m-%d')})
1619
+ for ix, rec in enumerate(row[1:]):
1620
+ record.update({df.columns[ix + 1]: Decimal(str(rec))})
1621
+ batch.put_item(Item=record)
1622
+ except dydb_client.exceptions.ResourceNotFoundException as err:
1623
+ print(f'Table not found. Please check names: {err}')
1624
+ self.logger.exception(f'Table not found. Please check names: {err}')
1625
+ return False
1626
+ return True
1627
+
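+ # Illustrative sketch (not part of the package): upload_dynamodb expects the first dataframe
+ # column to be datetime-like (it is formatted as '%Y-%m-%d' for the sort key) and the
+ # remaining columns numeric, since values are cast to Decimal. Column names are assumptions.
+ # >>> df = pd.DataFrame({'Date': pd.to_datetime(['2021-05-01', '2021-05-02']),
+ # ...                    'Sales': [10.5, 12.0]})
+ # >>> ok = io.upload_dynamodb(df=df, table_name='sampleTbl', tenant_id='acme', sort_col='Date')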
1628
+ def upload_json(self, df, q_name=None, datalake_path=None, orient_=None, date_format_=None, date_unit_='s', compression_=None, indent_=4):
1629
+ """
1630
+ Return a success or failure response after attempting to upload a dataframe in JSON format
1631
+
1632
+ :param df: Dataframe to upload in JSON format
1633
+ :param q_name: File name to save dataframe
1634
+ :param datalake_path: Path to upload the Q to S3 datalake
1635
+ :param orient_: Expected JSON string format. Possible values split, records, index, table, columns, values
1636
+ :param date_format_: Type of date conversion. epoch = epoch milliseconds, iso = ISO8601.
1637
+ :param date_unit_: The time unit to encode to, governs timestamp and ISO8601 precision, e.g. s, ms, us, ns.
1638
+ :param compression_: A string representing the compression to use in the output file, e.g. gzip, bz2, zip, xz.
1639
+ :param indent_: Length of whitespace used to indent each record. Default 4.
1640
+ :return response: Success or failure uploading the dataframe
1641
+
1642
+ >>> upload_json(df, q_name='Qtest', orient_='columns')
1643
+ >>> True
1644
+ """
1645
+ s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1646
+ file_path = os.path.join(Config.LOCAL_PATH, q_name + '.json')
1647
+ try:
1648
+ df.to_json(file_path, orient=orient_, date_format=date_format_, date_unit=date_unit_, compression=compression_, indent=indent_)
1649
+ response = s3_client.upload_file(file_path, self.datalake, os.path.join(datalake_path, q_name + '.json'))
1650
+ except ClientError as err:
1651
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1652
+ return False
1653
+ except FileNotFoundError as err:
1654
+ self.logger.exception(f'No JSON file found. Please check paths: {err}')
1655
+ return False
1656
+ return True
1657
+
1658
+ def upload_json_file(self, message=None, json_name=None, datalake_path=None, indent_=4):
1659
+ """
1660
+ Return a success or failure response after attempting to upload a JSON file
1661
+
1662
+
1663
+ :param message: Dict type to convert to JSON and upload to datalake
1664
+ :param json_name: File name to save dataframe
1665
+ :param datalake_path: Path to upload the Q to S3 datalake
1666
+ :param indent_: Length of whitespace used to indent each record. Default 4.
1667
+ :return : Success or failure uploading the dataframe
1668
+
1669
+ >>> upload_json_file(message=resp_dict, json_name='myjson', datalake_path='/path/to/data')
1670
+ >>> True
1671
+ """
1672
+
1673
+ s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1674
+ file_path = os.path.join(Config.LOCAL_PATH, json_name + '.json')
1675
+ try:
1676
+ with open(file_path, 'w') as json_file:
1677
+ json.dump(message, json_file, indent=indent_)
1678
+ s3_client.upload_file(file_path, self.datalake, os.path.join(datalake_path, json_name + '.json'))
1679
+ except ClientError as err:
1680
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1681
+ return False
1682
+ except FileNotFoundError as err:
1683
+ self.logger.exception(f'No JSON file found. Please check paths: {err}')
1684
+ return False
1685
+ return True
1686
+
1687
+ def upload_timestream(self, df, db_name, table_name):
1688
+ """
1689
+ Return a success or failure boolean attempting to upload timeseries data to timestream database
1690
+
1691
+ :param df: Dataframe to upload to Timestream table
1692
+ :param db_name: Database name in Timestream service
1693
+ :param table_name: Table name in Timestream service
1694
+ :return response: HTTP status code of the write request. 200 means success; False is returned on failure
1695
+
1696
+ >>> upload_timestream(df=df, db_name='dbSample', table_name='tbSample')
1697
+ >>> True
1698
+ """
1699
+ ts_client = boto3.client('timestream-write',
1700
+ region_name=self.region,
1701
+ aws_access_key_id=self.access_key,
1702
+ aws_secret_access_key=self.secret_key)
1703
+ dimensions = [{'Name': 'tenantId', 'Value': '1000', 'DimensionValueType': 'VARCHAR'}]
1704
+ records = []
1705
+ for row in df.itertuples(index=False):
1706
+ for ix, rec in enumerate(row[1:]):
1707
+ records.append({
1708
+ 'Dimensions': dimensions,
1709
+ 'MeasureName': df.columns[ix + 1],
1710
+ 'MeasureValue': str(rec),
1711
+ 'MeasureValueType': 'DOUBLE',
1712
+ 'Time': str(int(pd.to_datetime(row[0]).timestamp())),
1713
+ 'TimeUnit': 'SECONDS',
1714
+ 'Version': 3
1715
+ })
1716
+ try:
1717
+ response = ts_client.write_records(DatabaseName=db_name, TableName=table_name, Records=records)
1718
+ status = response['ResponseMetadata']['HTTPStatusCode']
1719
+ print(f'Processed records: {len(records)}. WriteRecords status: {status}')
1720
+ self.logger.debug(f'Processed records: {len(records)}. WriteRecords status: {status}')
1721
+ except ts_client.exceptions.RejectedRecordsException as err:
1722
+ print(f'{err}')
1723
+ self.logger.exception(f'{err}')
1724
+ for e in err.response["RejectedRecords"]:
1725
+ print("Rejected Index " + str(e["RecordIndex"]) + ": " + e["Reason"])
1726
+ self.logger.exception("Rejected Index " + str(e["RecordIndex"]) + ": " + e["Reason"])
1727
+ return False
1728
+ except ts_client.exceptions.ValidationException as err:
1729
+ print(f"{err.response['Error']['Message']}")
1730
+ self.logger.exception(f"{err.response['Error']['Message']}")
1731
+ return False
1732
+ return status
1733
+
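+ # Illustrative sketch (not part of the package): upload_timestream likewise assumes the first
+ # dataframe column holds timestamps (converted to epoch seconds) and that the remaining
+ # columns are numeric measures written as DOUBLE records. Names below are assumptions.
+ # >>> df = pd.DataFrame({'Date': pd.to_datetime(['2021-05-01']), 'Sales': [10.5]})
+ # >>> status = io.upload_timestream(df=df, db_name='dbSample', table_name='tbSample')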
1734
+ def upload_models(self, datalake_path):
1735
+ """Return a success or failure boolean attempting to upload a tensorflow models to the datalake.
1736
+
1737
+ :param datalake_path: Path to upload the attup trained models to S3 datalake
1738
+ :return: True if success, else False.
1739
+
1740
+ >>> upload_models(datalake_path='as-is/folder')
1741
+ >>> True
1742
+ """
1743
+ s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1744
+
1745
+ for i in range(self.backtests + 1):
1746
+ q_name = "model" + str(i)
1747
+ print(q_name)
1748
+ file_path = os.path.join(Config.LOCAL_PATH, q_name + '.h5')
1749
+ try:
1750
+ response = s3_client.upload_file(file_path, self.datalake, os.path.join(datalake_path, "models", q_name + '.h5'))
1751
+ except ClientError as err:
1752
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1753
+ return False
1754
+ except FileNotFoundError as err:
1755
+ self.logger.exception(f'No model file found. Please check paths: {err}')
1756
+ return False
1757
+ return True
1758
+
1759
+ def upload_models_tft(self, q_name, datalake_path):
1760
+ """Return a success or failure boolean attempting to upload a tensorflow models to the datalake.
1761
+
1762
+ :param q_name: Checkpoint file name (without extension) to upload
+ :param datalake_path: Path to upload the attup trained models to S3 datalake
1763
+ :return: True if success, else False.
1764
+
1765
+ >>> upload_models_tft(q_name='model0', datalake_path='as-is/folder')
1766
+ >>> True
1767
+ """
1768
+ s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1769
+ print(q_name)
1770
+ file_path = os.path.join(Config.LOCAL_PATH, q_name + '.ckpt')
1771
+ try:
1772
+ response = s3_client.upload_file(file_path, self.datalake, os.path.join(datalake_path, "models", q_name + '.ckpt'))
1773
+ except ClientError as err:
1774
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1775
+ return False
1776
+ except FileNotFoundError as err:
1777
+ self.logger.exception(f'No model file found. Please check paths: {err}')
1778
+ return False
1779
+ return True
1780
+
1781
+ def upload_object(self, datalake_name=None, datalake_path='', object_name=None):
1782
+ """Return a success or failure boolean attempting to upload a local file to the datalake
1783
+
1784
+ :param datalake_name: S3 bucket name (datalake) to upload the object
1785
+ :param datalake_path: Path to upload the Q to S3 datalake
1786
+ :param object_name: Object name to upload to the S3 bucket (datalake)
1787
+ :return: True if success, else False.
1788
+
1789
+ >>> upload_object(datalake_name='datup-datalake-datup', datalake_path='path/to/data', object_name='datup.dat')
1790
+ >>> True
1791
+ """
1792
+ file_path = os.path.join(Config.LOCAL_PATH, object_name)
1793
+ s3_client = boto3.client('s3', region_name='us-east-1', aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1794
+ try:
1795
+ response = s3_client.upload_file(file_path, datalake_name, os.path.join(datalake_path, object_name))
1796
+ except ClientError as err:
1797
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1798
+ return False
1799
+ except FileNotFoundError as err:
1800
+ self.logger.exception(f'No object file found. Please check paths: {err}')
1801
+ return False
1802
+ return True
1803
+
1804
+ def upload_log(self):
1805
+ """Return a success or failure boolean attempting to upload a local file to the datalake
1806
+
1807
+ The configured log file is read from Config.LOCAL_PATH and uploaded to self.log_path in the datalake.
1808
+ :return: True if success, else False.
1809
+
1810
+ >>> upload_log()
1811
+ >>> True
1812
+ """
1813
+ file_path = os.path.join(Config.LOCAL_PATH, self.logfile)
1814
+ s3_client = boto3.client('s3', region_name='us-east-1', aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1815
+ try:
1816
+ response = s3_client.upload_file(file_path, self.datalake, os.path.join(self.log_path, self.logfile))
1817
+ except ClientError as err:
1818
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1819
+ return False
1820
+ except FileNotFoundError as err:
1821
+ self.logger.exception(f'No log file found. Please check paths: {err}')
1822
+ return False
1823
+ return True
1824
+
1825
+ def upload_parquet(self, df, q_name, datalake_path, compression='snappy', engine='pyarrow'):
1826
+ """Return a success or failure boolean attempting to upload a local parquet file to the datalake
1827
+
1828
+ :param df: Dataframe to upload
1829
+ :param q_name: File name to save dataframe
1830
+ :param datalake_path: Path to upload the file to S3 datalake
1831
+ :param compression: Compression to use in the parquet file. Default 'snappy'
1832
+ :param engine: Engine to use for writing parquet files. Default 'pyarrow'
1833
+ :return: True if success, else False.
1834
+
1835
+ >>> upload_parquet(df=df, q_name='Q', datalake_path='as-is/folder')
1836
+ >>> True
1837
+ """
1838
+ file_path = os.path.join(Config.LOCAL_PATH, q_name + '.parquet')
1839
+
1840
+ print(f'Compression: {compression}')
1841
+ print(f'Engine: {engine}')
1842
+
1843
+ # Save DataFrame as Parquet file
1844
+ try:
1845
+ df.to_parquet(file_path, engine=engine, compression=compression, index=False)
1846
+ except Exception as e:
1847
+ self.logger.exception(f'Failed to save the DataFrame as a Parquet file: {e}')
1848
+ return False
1849
+
1850
+ s3_client = boto3.client(
1851
+ 's3',
1852
+ region_name=self.region,
1853
+ aws_access_key_id=self.access_key,
1854
+ aws_secret_access_key=self.secret_key
1855
+ )
1856
+ try:
1857
+ # Upload the Parquet file to S3
1858
+ response = s3_client.upload_file(
1859
+ file_path,
1860
+ self.datalake,
1861
+ os.path.join(datalake_path, q_name, q_name + '.parquet')
1862
+ )
1863
+ except ClientError as err:
1864
+ self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
1865
+ return False
1866
+ except FileNotFoundError as err:
1867
+ self.logger.exception(f'No Parquet file found. Please check paths: {err}')
1868
+ return False
1869
+
1870
+ return True
1871
+
1872
+
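+ # Illustrative note (not part of the package): like upload_csv, upload_parquet writes to
+ # <datalake_path>/<q_name>/<q_name>.parquet in the configured datalake; 'gzip' can be passed
+ # instead of the default 'snappy' when smaller files matter more than write speed.
+ # >>> ok = io.upload_parquet(df=df, q_name='Q', datalake_path='as-is/folder', compression='gzip')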
1873
+ def copy_between_datalakes(self, q_name=None, src_datalake=None, src_path=None, dst_datalake=None, dst_path=None):
1874
+ """
1875
+ Return True when a successful copy between datalake buckets occurs
1876
+
1877
+ :param q_name: File or dataset name including the type or extension
1878
+ :param src_datalake: Source datalake's bucket name
1879
+ :param src_path: Source datalake's key path, excluding dataset name
1880
+ :param dst_datalake: Destination datalake's bucket name
1881
+ :param dst_path: Destination datalake's key path, excluding dataset name
1882
+ :return : True if success, else False.
1883
+
1884
+ >>> copy_between_datalakes(q_name='mycube', src_datalake='bucket-a', src_path='path/to/file', dst_datalake='bucket-b', dst_path='path/to/file')
1885
+ >>> True
1886
+ """
1887
+
1888
+ s3_client = boto3.resource('s3',
1889
+ region_name='us-east-1',
1890
+ aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1891
+ try:
1892
+ copy_source = {'Bucket': src_datalake, 'Key': os.path.join(src_path, q_name)}
1893
+ filename, filetype = q_name.rsplit('.', 1)
1894
+ if filetype == 'csv':
1895
+ s3_client.meta.client.copy(copy_source, dst_datalake, os.path.join(dst_path, filename, filename + '.' + filetype))
1896
+ elif filetype.lower() in ('xls', 'xlsx'):
1897
+ s3_client.meta.client.copy(copy_source, dst_datalake, os.path.join(dst_path, filename + '.' + filetype))
1898
+ else:
1899
+ self.logger.debug(f'No valid dataset type. Please check database or datalake to debug.')
1900
+ except FileNotFoundError as err:
1901
+ self.logger.exception(f'No file or datalake found. Please check paths: {err}')
1902
+ return False
1903
+ return True
1904
+
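+ # Illustrative note (not part of the package): CSV datasets are copied into a folder named
+ # after the file (dst_path/mycube/mycube.csv), while Excel files keep a flat key
+ # (dst_path/mycube.xlsx). Bucket names below are assumptions for illustration.
+ # >>> ok = io.copy_between_datalakes(q_name='mycube.csv', src_datalake='bucket-a',
+ # ...                                src_path='path/to/file', dst_datalake='bucket-b',
+ # ...                                dst_path='path/to/file')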
1905
+ def download_bucket_last_excel_file(self,
1906
+ bucket_name,
1907
+ folder,
1908
+ datalake=None,
1909
+ sheet_name=0,
1910
+ index_col=None,
1911
+ usecols=None,
1912
+ num_records=None,
1913
+ date_cols=None,
1914
+ types=None,
1915
+ header_=0,
1916
+ skiprows_=None):
1917
+ """
1918
+ Download the most recently modified Excel file from a folder in an S3 bucket,
1919
+ regardless of the file name.
1920
+ Input:
1921
+ - bucket_name: Name of the S3 bucket.
1922
+ - folder: Folder (prefix) in the bucket where the file is searched for.
1923
+ - datalake: Name of the datalake bucket the file is downloaded from.
1924
+ - sheet_name: Name or index of the sheet to load.
1925
+ - index_col: Name or index of the column to use as the index.
1926
+ - usecols: Columns to select.
1927
+ - num_records: Number of records to load.
1928
+ - date_cols: Columns to parse as dates.
1929
+ - types: Data types of the columns.
1930
+ - header_: Row to use as the header.
1931
+ - skiprows_: Number of rows to skip.
1932
+ Output:
1933
+ - df: pandas DataFrame with the data from the Excel file.
1934
+ """
1935
+ # Configure the S3 client
1936
+ s3 = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
1937
+
1938
+ # List the files under the specified prefix
1939
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder)
1940
+
1941
+ # Check whether any files were found
1942
+ if 'Contents' in response:
1943
+ # Keep only the Excel files
1944
+ archivos_excel = [obj for obj in response['Contents'] if obj['Key'].endswith('.xlsx') or obj['Key'].endswith('.xls')]
1945
+ file_list = pd.DataFrame(archivos_excel)
1946
+ print('Files found:\n', file_list[['Key', 'LastModified']])
1947
+ if archivos_excel:
1948
+ # Find the most recent file
1949
+ most_recent_file = max(archivos_excel, key=lambda x: x['LastModified'])
1950
+ archivo_s3 = most_recent_file['Key']
1951
+ print(f'Most recent file found: {archivo_s3}')
1952
+
1953
+ # Load the Excel file into a pandas DataFrame
1954
+ df = self.download_excel_from_bucket(datalake=datalake,
1955
+ datalake_path=archivo_s3,
1956
+ sheet_name=sheet_name,
1957
+ index_col=index_col,
1958
+ usecols=usecols,
1959
+ num_records=num_records,
1960
+ date_cols=date_cols,
1961
+ types=types,
1962
+ header_=header_,
1963
+ skiprows_=skiprows_)
1964
+ print(f'File loaded: {archivo_s3} \n')
1965
+ return df
1966
+ else:
1967
+ print('No Excel files were found in the specified folder.')
1968
+ else:
1969
+ print('No files were found in the specified folder.')
1970
+
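+ # Illustrative usage sketch (not part of the package): lists the Excel files under the given
+ # prefix, then loads the most recently modified one through download_excel_from_bucket, so
+ # bucket_name and datalake normally point at the same bucket. Names below are assumptions.
+ # >>> df = io.download_bucket_last_excel_file(bucket_name='my-bucket',
+ # ...                                         folder='as-is/folder/',
+ # ...                                         datalake='my-bucket',
+ # ...                                         sheet_name=0)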
1971
+ def rename_and_upload_delta_hist_file(self,
1972
+ df,
1973
+ prefix='DEMAND',
1974
+ col_date='Fecha',
1975
+ datalake_path='dev/raw/as-is/forecast/historic_data',
1976
+ sep=',',
1977
+ encoding='utf-8',
1978
+ date_format='%Y-%m-%d',
1979
+ lineterminator=None):
1980
+ """
1981
+ Rename and upload the file with the given prefix and the YYYYMM date to the datalake.
1982
+ Input:
1983
+ - df: DataFrame to upload.
1984
+ - prefix: Prefix for the file name.
1985
+ - col_date: Column name with the date.
1986
+ - datalake_path: Path in the datalake to upload the file.
1987
+ - sep: Separator for the CSV file.
1988
+ - encoding: Encoding for the CSV file.
1989
+ - date_format: Date format for the CSV file.
1990
+ - lineterminator: Line terminator for the CSV file.
1991
+ Output:
1992
+ - return: True if success, else False.
1993
+ """
1994
+ df[col_date] = pd.to_datetime(df[col_date])
1995
+ date_min = df[col_date].min()
1996
+ date_max = df[col_date].max()
1997
+
1998
+ date_min = str(date_min)[0:7].replace('-','')
1999
+ date_max = str(date_max)[0:7].replace('-','')
2000
+
2001
+ print(f'Minimum date: {date_min}')
2002
+ print(f'Maximum date: {date_max}')
2003
+
2004
+ if date_min == date_max:
2005
+ print(f'The month and year of the min and max dates match. Saving file {prefix}{date_min}.csv to the datalake path: {datalake_path}')
2006
+ self.upload_csv(df, q_name=prefix+date_min,
2007
+ datalake_path=datalake_path,
2008
+ sep=sep,
2009
+ encoding=encoding,
2010
+ date_format=date_format,
2011
+ lineterminator=lineterminator)
2012
+ else:
2013
+ print('The month and year of the min and max dates differ. Please check the data.')
2014
+ return False
2015
+ return True
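+ # Illustrative usage sketch (not part of the package): the dataframe must span a single YYYYMM
+ # period in `col_date`; the file is then uploaded as <prefix><YYYYMM>.csv through upload_csv.
+ # Column and path names below are the method defaults, shown here only for illustration.
+ # >>> ok = io.rename_and_upload_delta_hist_file(df=df,
+ # ...                                           prefix='DEMAND',
+ # ...                                           col_date='Fecha',
+ # ...                                           datalake_path='dev/raw/as-is/forecast/historic_data')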