datupapi 1.114.0__py3-none-any.whl → 1.115.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datupapi/extract/io_opt.py +2015 -0
- datupapi/inventory/src/ProcessForecast/define_periods.py +9 -2
- {datupapi-1.114.0.dist-info → datupapi-1.115.1.dist-info}/METADATA +1 -1
- {datupapi-1.114.0.dist-info → datupapi-1.115.1.dist-info}/RECORD +6 -5
- {datupapi-1.114.0.dist-info → datupapi-1.115.1.dist-info}/WHEEL +1 -1
- {datupapi-1.114.0.dist-info → datupapi-1.115.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,2015 @@
import base64
from typing import Dict, List
import boto3
import json
import os
import pandas as pd
import requests
import time
from boto3.dynamodb.conditions import Key, Attr
from boto3.s3.transfer import TransferConfig
from boto3.session import Session
from botocore.exceptions import ClientError
from botocore.config import Config as BotocoreConfig
from decimal import Decimal
from datupapi.configure.config import Config
from google.cloud import bigquery
from google.oauth2 import service_account
import google
from hashlib import md5
from hdbcli import dbapi
from sqlalchemy import create_engine, Table, Column, MetaData
from sqlalchemy import Integer, Float, String, DECIMAL
from sqlalchemy import insert, delete, exists, schema, text
from concurrent.futures import ThreadPoolExecutor  # to load to GCP in parallel
from threading import current_thread


class IO_Optimized(Config):

    def __init__(self, config_file, logfile, log_path, *args, **kwargs):
        Config.__init__(self, config_file=config_file, logfile=logfile)
        self.log_path = log_path

    def _get_s3_file_size(self, s3_client, bucket, key):
        """
        Get file size from S3 without downloading using head_object.

        :param s3_client: Boto3 S3 client
        :param bucket: S3 bucket name
        :param key: S3 object key
        :return: File size in bytes, or None if error
        """
        try:
            response = s3_client.head_object(Bucket=bucket, Key=key)
            return response['ContentLength']
        except ClientError:
            return None

    def _create_transfer_config(self, file_size_bytes, s3_client=None):
        """
        Create optimized TransferConfig based on file size for multipart transfers.
        Also configures urllib3 connection pool to match concurrency settings.

        Strategy:
        - < 50 MB: No multipart (single-part is faster)
        - 50-500 MB: Moderate multipart (16 MB chunks, 16 threads)
        - 500MB-2GB: Aggressive multipart (32 MB chunks, 32 threads)
        - > 2 GB: Ultra-aggressive multipart (64 MB chunks, 64 threads)

        :param file_size_bytes: Size of file in bytes
        :param s3_client: Boto3 S3 client (optional, for pool configuration)
        :return: TransferConfig object or None for small files
        """
        MB = 1024 * 1024
        GB = 1024 * MB

        if file_size_bytes < 50 * MB:
            # Small files: single-part is faster
            return None
        elif file_size_bytes < 500 * MB:
            # Medium files (50MB - 500MB): moderate multipart
            max_concurrency = 16
        elif file_size_bytes < 2 * GB:
            # Large files (500MB - 2GB): aggressive multipart
            max_concurrency = 32
        else:
            # Very large files (2GB+): ultra-aggressive multipart
            max_concurrency = 64

        # Configure urllib3 connection pool to match max_concurrency
        # This prevents "Connection pool is full" warnings
        if s3_client is not None:
            try:
                # Increase connection pool size for the HTTP adapter
                adapter = s3_client._client_config
                if hasattr(s3_client.meta.client, '_endpoint'):
                    http_session = s3_client.meta.client._endpoint.http_session
                    if http_session:
                        # Configure pool for both http and https
                        http_session.adapters['https://'].poolmanager.connection_pool_kw['maxsize'] = max_concurrency + 10
                        http_session.adapters['http://'].poolmanager.connection_pool_kw['maxsize'] = max_concurrency + 10
            except (AttributeError, KeyError):
                # If we can't configure the pool, continue anyway
                pass

        # Determine chunk size based on file size
        if file_size_bytes < 500 * MB:
            chunksize = 16 * MB
        elif file_size_bytes < 2 * GB:
            chunksize = 32 * MB
        else:
            chunksize = 64 * MB

        return TransferConfig(
            multipart_threshold=8 * MB,
            multipart_chunksize=chunksize,
            max_concurrency=max_concurrency,
            use_threads=True
        )

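    # Illustrative sizing sketch (hypothetical file sizes, not part of any call path):
    # per the thresholds above, an 800 MB object falls in the 500 MB-2 GB band, so
    # _create_transfer_config would return roughly
    # TransferConfig(multipart_threshold=8*MB, multipart_chunksize=32*MB,
    #                max_concurrency=32, use_threads=True),
    # while anything under 50 MB returns None and is fetched single-part.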
    def _get_optimized_read_params(self, file_size_bytes, date_cols=None):
        """
        Calculate optimized pandas read_csv parameters based on file size.

        SIMPLIFIED OPTIMIZATION: Focus on what actually improves performance.

        Strategy:
        - Keep it simple - pandas defaults are well-optimized
        - Only add parameters that provide measurable benefit
        - Avoid deprecated or counterproductive options

        :param file_size_bytes: Size of file in bytes
        :param date_cols: List of date columns (if known)
        :return: dict with optimized parameters
        """
        MB = 1024 * 1024
        GB = 1024 * MB

        # Start with minimal, proven parameters
        params = {
            'engine': 'c',  # C engine is faster than python
        }

        # For files > 100MB, use low_memory mode to process in chunks
        if file_size_bytes > 100 * MB:
            params['low_memory'] = True

        # Only add date optimization if date columns are specified
        # ISO8601 format is significantly faster for date parsing
        if date_cols and len(date_cols) > 0:
            params['cache_dates'] = True

        return params

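    # Illustrative sketch (hypothetical inputs): for a 300 MB file with
    # date_cols=['Date'], _get_optimized_read_params returns
    # {'engine': 'c', 'low_memory': True, 'cache_dates': True}; for a 20 MB file
    # with no date columns it returns just {'engine': 'c'}.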
    def get_secret(self, secret_name=None):
        """
        Return the credentials mapped to the entered secret name

        :param secret_name: Name identifying the credentials in AWS.
        :return response: Credential to authenticate against AWS resource

        >>> creds = get_secret()
        >>> creds = ...
        """
        session = boto3.session.Session()
        client = session.client(service_name='secretsmanager',
                                region_name=self.region,
                                aws_access_key_id=self.access_key,
                                aws_secret_access_key=self.secret_key)
        try:
            if secret_name is not None:
                get_secret_value_response = client.get_secret_value(SecretId=secret_name)
            else:
                get_secret_value_response = client.get_secret_value(SecretId=self.sql_database + 'secret')
        except ClientError as e:
            if e.response['Error']['Code'] == 'DecryptionFailureException':
                # Secrets Manager can't decrypt the protected secret text using the provided KMS key.
                # Deal with the exception here, and/or rethrow at your discretion.
                raise e
            elif e.response['Error']['Code'] == 'InternalServiceErrorException':
                # An error occurred on the server side.
                # Deal with the exception here, and/or rethrow at your discretion.
                raise e
            elif e.response['Error']['Code'] == 'InvalidParameterException':
                # You provided an invalid value for a parameter.
                # Deal with the exception here, and/or rethrow at your discretion.
                raise e
            elif e.response['Error']['Code'] == 'InvalidRequestException':
                # You provided a parameter value that is not valid for the current state of the resource.
                # Deal with the exception here, and/or rethrow at your discretion.
                raise e
            elif e.response['Error']['Code'] == 'ResourceNotFoundException':
                # We can't find the resource that you asked for.
                # Deal with the exception here, and/or rethrow at your discretion.
                raise e
        else:
            # Decrypts secret using the associated KMS CMK.
            # Depending on whether the secret is a string or binary, one of these fields will be populated.
            if 'SecretString' in get_secret_value_response:
                secret = get_secret_value_response['SecretString']
            else:
                decoded_binary_secret = base64.b64decode(get_secret_value_response['SecretBinary'])
            return json.loads(get_secret_value_response['SecretString'])

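    # Usage sketch (the secret name and keys are hypothetical): creds = self.get_secret('acme-mysql-secret')
    # returns the parsed SecretString as a dict, e.g. creds['username'], creds['password'];
    # the actual keys depend entirely on how the secret was stored in AWS Secrets Manager.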
    def populate_snowflake_table(self,
                                 df,
                                 dwh_account=None,
                                 dwh_name=None,
                                 dwh_user=None,
                                 dwh_passwd=None,
                                 dwh_dbname=None,
                                 dwh_schema=None,
                                 table_name=None,
                                 replace=True):
        """
        Create a table in Snowflake DWH and insert the records from a dataframe

        :param df: Dataframe storing the records to insert into the database table
        :param dwh_account: Snowflake account identifier
        :param dwh_name: Snowflake datawarehouse name
        :param dwh_user: Snowflake account username
        :param dwh_passwd: Snowflake account password
        :param db_name: Snowflake database name
        :param dwh_schema: Snowflake database schema
        :param table_name: Snowflake table name
        :param replace: If True replace table records whether exists. Otherwise append records. Default True.
        :return inserted_records: Number of records inserted

        >>> records = populate_snowflake_table(df, dwh_account='xx12345.us-east-1', dwh_name='myDwh', dwh_user='myuser', dwh_passwd='12345', dwh_dbname='mydbname', dwh_schema='myschema', table_name='mytable')
        >>> records = 1000
        """
        if self.tenant_id != '':
            tenant_table_name = (self.tenant_id + table_name)
        else:
            tenant_table_name = table_name

        url = URL(account=dwh_account, user=dwh_user, password=dwh_passwd, warehouse=dwh_name, database=dwh_dbname, schema=dwh_schema)
        try:
            engine = create_engine(url)
            conn = engine.connect()
            if replace:
                df.to_sql(tenant_table_name, con=engine, if_exists='replace', index=False, chunksize=16000)
            else:
                df.to_sql(tenant_table_name, con=engine, if_exists='append', index=False, chunksize=16000)
            inserted_records = conn.execute('select count(*) from ' + '"' + tenant_table_name + '"').fetchone()[0]
        finally:
            conn.close()
            engine.dispose()
        return inserted_records

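    # Note: URL above presumably refers to snowflake.sqlalchemy's URL helper; it is
    # not among the imports shown in this file and is assumed to be provided by the
    # runtime environment.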
    def populate_bigquery_table(self, df, project_id=None, tenant_id=None, table_name=None, write_mode='overwrite', gcp_key='datup-supplyai-dev-gcp.json'):
        """
        Create a table in BigQuery DWH and insert the records from a dataframe
        :param df: Dataframe storing the records to insert into the database table
        :param projectId: Project identifier in GCP
        :param tenantId: Tenant or customer identifier
        :param table_name: BigQuery table name
        :param write_mode: BigQuery table update method. Either overwrite or append
        :param gcp_key: BigQuery credential key
        :return: Number of records inserted

        >>> records = populate_bigquery_table(df, project_id='myproject', tenant_id='acme', table_name='mytable')
        >>> records = 1000
        """
        key = service_account.Credentials.from_service_account_file(os.path.join('/opt/ml/processing/input', gcp_key))
        client = bigquery.Client(credentials=key)

        try:
            if write_mode == 'overwrite':
                write_mode_ = 'WRITE_TRUNCATE'
            elif write_mode == 'append':
                write_mode_ = 'WRITE_APPEND'
            else:
                self.logger.exception(f'No valid BigQuery write mode. Please check valid types: overwrite or append')
            table_id = project_id + '.' + tenant_id + '.' + table_name
            job_config = bigquery.LoadJobConfig(autodetect=False,
                                                source_format=bigquery.SourceFormat.CSV,
                                                allow_quoted_newlines=True,
                                                write_disposition=write_mode_)
            #client.delete_table(table_id, not_found_ok=True)
            load_job = client.load_table_from_dataframe(dataframe=df, destination=table_id, job_config=job_config)
            load_job.result()
            destination_table = client.get_table(table_id)
        except google.api_core.exceptions.NotFound as err:
            raise
        return destination_table.num_rows

    def populate_bigquery_table_with_schema(self, df, project_id=None, tenant_id=None, table_name=None, write_mode='overwrite', gcp_key='datup-supplyai-dev-gcp.json'):
        """
        Create a table in BigQuery DWH and insert the records from a dataframe
        :param df: Dataframe storing the records to insert into the database table
        :param projectId: Project identifier in GCP
        :param tenantId: Tenant or customer identifier
        :param table_name: BigQuery table name
        :return: Number of records inserted

        >>> records = populate_bigquery_table_with_schema(df, project_id='myproject', tenant_id='acme', table_name='mytable')
        >>> records = 1000
        """
        key = service_account.Credentials.from_service_account_file(os.path.join('/opt/ml/processing/input', gcp_key))
        client = bigquery.Client(credentials=key)

        try:
            if write_mode == 'overwrite':
                write_mode_ = 'WRITE_TRUNCATE'
            elif write_mode == 'append':
                write_mode_ = 'WRITE_APPEND'
            else:
                self.logger.exception(f'No valid BigQuery write mode. Please check valid types: overwrite or append')

            # Build schema dynamically
            df_schema = []
            date_cols = list(df.select_dtypes(include=['datetime64']).columns)
            string_cols = list(df.select_dtypes(include=['object']).columns)
            integer_cols = list(df.select_dtypes(include=['int64']).columns)
            float_cols = list(df.select_dtypes(include=['float64']).columns)
            [df_schema.append(bigquery.SchemaField(col, bigquery.enums.SqlTypeNames.DATE)) for col in date_cols]
            [df_schema.append(bigquery.SchemaField(col, bigquery.enums.SqlTypeNames.INT64)) for col in integer_cols]
            [df_schema.append(bigquery.SchemaField(col, bigquery.enums.SqlTypeNames.FLOAT64)) for col in float_cols]
            [df_schema.append(bigquery.SchemaField(col, bigquery.enums.SqlTypeNames.STRING)) for col in string_cols]
            #Load pandas dataframe into BigQuery table
            table_id = project_id + '.' + tenant_id + '.' + table_name
            job_config = bigquery.LoadJobConfig(autodetect=False,
                                                schema=df_schema,
                                                write_disposition=write_mode_)
            load_job = client.load_table_from_dataframe(dataframe=df, destination=table_id, job_config=job_config)
            load_job.result()
            destination_table = client.get_table(table_id)
        except google.api_core.exceptions.NotFound as err:
            raise
        return destination_table.num_rows

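    # Illustrative sketch (hypothetical dataframe): for df with columns
    # {'Fecha': datetime64, 'Qty': int64, 'Price': float64, 'Item': object},
    # the appends above build SchemaField('Fecha', DATE), SchemaField('Qty', INT64),
    # SchemaField('Price', FLOAT64) and SchemaField('Item', STRING), in that order,
    # before the load job runs.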
    def get_secret_for_bigquery(self,secretname=None):
        """
        Get the secret from AWS and return a JSON dict with the keys needed
        to run a query in Google BigQuery
        :param secretname: Name of the AWS secret
        """
        json = {
            "type":self.get_secret(secret_name=secretname)['type'],
            "project_id":self.get_secret(secret_name=secretname)['project_id'],
            "private_key_id":self.get_secret(secret_name=secretname)['private_key_id'],
            "private_key":self.get_secret(secret_name=secretname)['private_key'],
            "client_email":self.get_secret(secret_name=secretname)['client_email'],
            "client_id":self.get_secret(secret_name=secretname)['client_id'],
            "auth_uri":self.get_secret(secret_name=secretname)['auth_uri'],
            "token_uri":self.get_secret(secret_name=secretname)['token_uri'],
            "auth_provider_x509_cert_url":self.get_secret(secret_name=secretname)['auth_provider_x509_cert_url'],
            "client_x509_cert_url":self.get_secret(secret_name=secretname)['client_x509_cert_url'],
            "universe_domain":self.get_secret(secret_name=secretname)['universe_domain']
        }

        json['private_key']=json['private_key'].replace("\\n", "\n")

        return json

    def describe_bigquery_table(self, aws_secret=None, gcp_key=None) -> Dict[str, List[str]]:
        """
        Describe a BigQuery project, listing its datasets and the tables they contain.
        :return: Project, datasets and tables.

        >>> description = describe()
        >>> description = ...
        """
        try:
            if aws_secret==None:
                print("Using json file")
                key = service_account.Credentials.from_service_account_file(os.path.join('/opt/ml/processing/input', gcp_key))
            else:
                print("Using aws secret")
                key = service_account.Credentials.from_service_account_info(self.get_secret_for_bigquery(aws_secret))

            client = bigquery.Client(credentials=key)
            print('Proyecto: ', client.project)
            datasets = client.list_datasets()
            table_client: List[str] = []
            component_client: Dict[str, List[str]] = {}
            for dataset in datasets:
                print('\nDataset: ',dataset.reference ,'\nTablas:')
                tables = client.list_tables(dataset.reference)
                for table in tables:
                    print(' ',table.table_id)
                    table_client.append(table.table_id)
                component_client[dataset.reference] = table_client

            client.close()
        except Exception as e:
            print(f"Falla la consulta en base de datos big query: {e}")
            raise
        else:
            return component_client

    def download_bigquery_table(self,
                                project_id=None,
                                tenant_id=None,
                                table_name=None,
                                aws_secret=None,
                                gcp_key=None,
                                sqlQuery=None):
        """
        Download a query from a data set in BigQuery.

        :param projectId: Project identifier in GCP
        :param tenantId: Tenant or customer identifier
        :param table_name: BigQuery table name
        :param aws_secret: Name of the AWS secret
        :param query: SQL query.
        :return: Dataframe from query.

        >>> records = populate_dbtable(df, hostname='202.10.0.1', db_user='johndoe', db_passwd='123456', db_name='dbo.TheDataBase')
        >>> records = 1000
        """

        try:
            if aws_secret==None: #If gcp key is read from json file
                print("Using json file")
                key = service_account.Credentials.from_service_account_file(os.path.join('/opt/ml/processing/input', gcp_key))
            else: #If aws_secret has a value
                print("Using aws secret")
                key = service_account.Credentials.from_service_account_info(self.get_secret_for_bigquery(aws_secret))

        except TypeError:
            print("Please, use a valid aws_secret or json file")

        client = bigquery.Client(credentials=key)
        try:
            sql = sqlQuery
            df = client.query(sql).to_dataframe()
            print(f"¡Historical forecast download success from date!")
        except Exception as e:
            raise f"Falla la consulta en base de datos big query: {e}"
        else:
            return df

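    # Usage sketch (hypothetical project, dataset and query; illustrative only):
    # df = self.download_bigquery_table(project_id='myproject', tenant_id='acme',
    #                                   table_name='forecast', aws_secret='acme-gcp-secret',
    #                                   sqlQuery='SELECT * FROM myproject.acme.forecast LIMIT 10')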
    def populate_dbtable(self,
                         df,
                         hostname=None,
                         db_user=None,
                         db_passwd=None,
                         db_name=None,
                         port='3306',
                         table_name=None,
                         db_type='mysql',
                         replace=True):
        """
        Create a table in a MySQL database and insert the records from a dataframe

        :param df: Dataframe storing the records to insert into the database table
        :param hostname: Public IP address or hostname of the remote database server
        :param db_user: Username of the database
        :param db_passwd: Password of the database
        :param db_name: Name of the target database
        :param port: TCP port number of the database (usually 3306)
        :param table_name: Name of target table
        :param db_type: Name of database type. Choose from mysql, mssql. Default mysql.
        :param replace: If True replace table records whether exists. Otherwise append records. Default True.
        :return inserted_records: Number of records inserted

        >>> records = populate_dbtable(df, hostname='202.10.0.1', db_user='johndoe', db_passwd='123456', db_name='dbo.TheDataBase')
        >>> records = 1000
        """
        if db_type == 'mysql':
            db_api = 'mysql+mysqlconnector://'
        elif db_type == 'mysql_legacy':
            db_api = 'mysql+pymysql://'
        elif db_type == 'mssql':
            db_api = 'mssql+pymssql://'
        else:
            self.logger.exception(f'No valid database type. Please check valid types: mysql, mssql')

        try:
            engine = create_engine(db_api + db_user + ':' + db_passwd + '@' + hostname + ':' + str(port) + '/' + db_name)
            if replace:
                df.to_sql(table_name, con=engine, if_exists='replace', index=False)
            else:
                df.to_sql(table_name, con=engine, if_exists='append', index=False)
            #inserted_records = engine.execute('SELECT COUNT(*) FROM ' + table_name).fetchall()[0][0]
            inserted_records = df.shape[0]
        except ConnectionRefusedError as err:
            logger.exception(f'Refused connection to the database. Please check parameters: {err}')
            raise
        return inserted_records

    def populate_dbtable_threads(self,
                                 df,
                                 hostname=None,
                                 db_user=None,
                                 db_passwd=None,
                                 db_name=None,
                                 port='3306',
                                 table_name=None,
                                 db_type='mysql',
                                 replace = True,
                                 chunk_size=500,
                                 batch_size=10000,
                                 threads=2):
        """
        Create a table in a MySQL database and insert the records from a dataframe

        :param df: Dataframe storing the records to insert into the database table
        :param hostname: Public IP address or hostname of the remote database server
        :param db_user: Username of the database
        :param db_passwd: Password of the database
        :param db_name: Name of the target database
        :param port: TCP port number of the database (usually 3306)
        :param table_name: Name of target table
        :param db_type: Name of database type. Choose from mysql, mssql. Default mysql.
        :param replace: If True replace table records whether exists. Otherwise append records. Default True.
        :param chunk_size: Number of records to insert. Default 500.
        :param batch_size: Number of rows per batch. Default 10000.
        :param threads: Number of threads to use for parallel execution. Default 2.
        :return inserted_records: Number of records inserted

        >>> records = populate_dbtable(df, hostname='202.10.0.1', db_user='johndoe', db_passwd='123456', db_name='dbo.TheDataBase')
        >>> records = 1000
        """
        try:
            if db_type == 'mysql':
                db_api = 'mysql+mysqlconnector://'
            elif db_type == 'mysql_legacy':
                db_api = 'mysql+pymysql://'
            elif db_type == 'mssql':
                db_api = 'mssql+pymssql://'
            else:
                raise ValueError(f"No valid database type. Please check valid types: mysql, mssql")

            # Initial validation
            if df.empty:
                print(f"[WARNING] El DataFrame para la tabla {table_name} está vacío. Saltando...")
                return 0
            if not table_name:
                print("[ERROR] No se especificó un nombre de tabla. Saltando...")
                return 0

            total_inserted_records = 0

            # Build the batches and the engine
            total_rows = len(df)
            batches = [df.iloc[start:start + batch_size] for start in range(0, total_rows, batch_size)]
            engine = create_engine(f"{db_api}{db_user}:{db_passwd}@{hostname}:{port}/{db_name}")

            # If replace, drop the previous records now and leave the new table structure created.
            # This is done outside the threads so that parallel access to the table causes no problems.
            action = 'replace' if replace else 'append'

            if replace:
                print(f"[INFO] Reemplazando la tabla {table_name} antes de iniciar la carga.")
                df.iloc[0:0].to_sql(table_name, con=engine, if_exists='replace', index=False)
                action = 'append'  # Keep the threads from replacing the table again

            # Inner function to process each batch.
            # If replace is True, the table is replaced up front and every other batch must append to the first one.
            def process_batch(batch, start_idx):
                try:
                    # Get the row range
                    start_line = start_idx
                    end_line = start_idx + len(batch) - 1
                    thread_name = current_thread().name
                    print(f"[INFO] [{thread_name}] Procesando batch de líneas {start_line}-{end_line} en la tabla {table_name} con acción {action}.")
                    batch.to_sql(table_name, con=engine, if_exists='append', index=False, chunksize=chunk_size)
                    return len(batch)
                except Exception as err:
                    print(f"[ERROR] Error al cargar batch: {err}")
                    raise

            # Run the load in parallel
            print(f"[INFO] Iniciando carga para la tabla {table_name}. Total de filas: {total_rows}")
            with ThreadPoolExecutor(max_workers=threads) as executor:
                results = executor.map(
                    lambda idx_batch: process_batch(batches[idx_batch], start_idx=(idx_batch * batch_size)),
                    range(len(batches)))
                total_inserted_records = sum(results)

            print(f"[INFO] Carga completa para la tabla {table_name}. Total de registros insertados: {total_inserted_records}")
            return total_inserted_records

        except Exception as e:
            print(f"[ERROR] Error durante la carga de la tabla {table_name}: {e}")
            return 0

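    # Batching sketch (hypothetical numbers): with a 25,000-row dataframe and the
    # defaults batch_size=10000, threads=2, the list comprehension above produces
    # three batches (10,000 + 10,000 + 5,000 rows); the two worker threads append
    # them concurrently, each to_sql call writing in chunks of chunk_size=500 rows.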
    def download_dbtable(self, hostname=None, db_user=None, db_passwd=None, db_name=None, port='3306', table_name=None, schema=None, db_type='mysql', query=None):
        """Return a dataframe containing the data extracted from MSSQL database's table supporting PyODBC connector

        :param hostname: Public IP address or hostname of the remote database server
        :param db_user: Username of the database
        :param db_passwd: Password of the database
        :param db_name: Name of the target database
        :param port: TCP port number of the database. Default 3306
        :param table_name: Name of target table
        :param schema: Name of database schema
        :param db_type: Name of database type. Choose from mysql, mssql, postgres, mysql_legacy. Default mysql.
        :param query: SQL statement to use as query.
        :return df: Dataframe containing the data from database's table

        >>> df = download_dbtable(hostname='202.10.0.1', db_user='johndoe', db_passwd='123456', db_name='dbo.TheDataBase', query='SELECT * FROM table')
        >>> df
                var1    var2    var3
        idx0       1       2       3
        """
        if db_type == 'mysql':
            db_api = 'mysql+mysqlconnector://'
        elif db_type == 'mysql_legacy':
            db_api = 'mysql+pymysql://'
        elif db_type == 'mssql':
            db_api = 'mssql+pymssql://'
        elif db_type == 'postgres':
            db_api = 'postgresql://'
        elif db_type == 'sap_hana':
            db_api = 'hana+hdbcli://'
        else:
            self.logger.exception(f'No valid database type. Please check valid types: mysql, mssql')

        try:
            if db_type == 'sap_hana':
                engine = create_engine(db_api + db_user + ':' + db_passwd + '@' + hostname + ':' + str(port) + '?currentSchema=' + schema)
            else:
                engine = create_engine(db_api + db_user + ':' + db_passwd + '@' + hostname + ':' + str(port) + '/' + db_name)
            connection = engine.connect()
            stmt = text(query)
            df = pd.read_sql_query(stmt, connection)
        except ConnectionRefusedError as err:
            logger.exception(f'Refused connection to the database. Please check parameters: {err}')
            raise
        return df

    def download_rdstable(self, rds_arn=None, secret_arn=None, database_name=None, sql_query=None, query_params=None):
        """
        Return query results to RDS database

        :param rds_arn: Database instance or cluster's ARN
        :param secret_arn: Secret Manager resource ARN
        :param database_name: Database name to query on instance or cluster
        :param sql_query: Query string on SQL syntax
        :param query_params: List of dictionary values to put into the query string
        :return response: Records queried from the RDS database

        >>> response = download_rdstable(rds_arn='arn:rds:mycluster', \
                                         secret_arn='arn:secret:mysecret', \
                                         database_name='mydb', \
                                         sql_query=[{'name': 'paramId', 'value': {'stringValue': 'myvalue'}}], \
                                         query_params=None)
        >>> response = [{'date': '2021-06-07'}, {'name': 'John Doe'}, {'salary': 1000}]
        """
        client = boto3.client('rds-data',
                              region_name='us-east-1',
                              aws_access_key_id=self.access_key,
                              aws_secret_access_key=self.secret_key)
        try:
            # Query project table
            response = client.execute_statement(parameters=query_params,
                                                resourceArn=rds_arn,
                                                secretArn=secret_arn,
                                                database=database_name,
                                                sql=sql_query)
        except client.exceptions.BadRequestException as err:
            print(f'Incorrect request. Please check query syntax and parameters: {err}')
            return False
        return response['records']

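    # Note (illustrative reading of the call above): execute_statement receives the
    # SQL string through sql=sql_query and the parameter list through
    # parameters=query_params, so in practice query_params is the list of
    # {'name': ..., 'value': {'stringValue': ...}} dictionaries and sql_query is the
    # SQL text, even though the docstring example shows them the other way around.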
    def download_csv(self,
                     q_name,
                     datalake_path=None,
                     sep=',',
                     index_col=None,
                     usecols=None,
                     num_records=None,
                     dayfirst=False,
                     compression='infer',
                     encoding='utf-8',
                     date_cols=None,
                     types=None,
                     thousands=None,
                     decimal='.',
                     low_memory=True,
                     use_multipart=True):
        """Return a dataframe from a csv file stored in the datalake

        OPTIMIZED VERSION with S3 Multipart Transfer support for faster downloads.

        :param q_name: Plain file (.csv) to download and stored in a dataframe
        :param datalake_path: Path to download the file from the S3 datalake. Default None.
        :param sep: Field delimiter of the downloaded file. Default ','
        :param index_col: Column(s) to use as the row labels of the DataFrame, either given as string name or column index.
        :param usecols: Columns to use in returning dataframe.
        :param num_records: Number of records to fetch from the source. Default None
        :param dayfirst: DD/MM format dates, international and European format. Default False
        :param compression: For on-the-fly decompression of on-disk data. Default 'infer'
        :param encoding: Encoding to use for UTF when reading/writing. Default 'utf-8'
        :param date_cols: List of date columns to parse as datetime type. Default None
        :param types: Dict with data columns as keys and data types as values. Default None
        :param thousands: Thousands separator
        :param decimal: Decimal separator. Default '.'
        :param low_memory: Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. Default True
        :param use_multipart: Enable S3 multipart transfer for files >50MB. Default True
        :return df: Dataframe containing the data from the file stored in the datalake

        >>> df = download_csv(q_name='Q', datalake_path='as-is/folder')
        >>> df
                var1    var2    var3
        idx0       1       2       3
        """
        start_time = time.time()
        print(f"[DOWNLOAD_CSV] Starting: {q_name}.csv")

        # Configure boto3 client with larger connection pool for multipart transfers
        botocore_config = BotocoreConfig(
            max_pool_connections=100,  # Increased from default 10 to support high concurrency
            retries={'max_attempts': 3, 'mode': 'adaptive'}
        )
        s3_client = boto3.client(
            's3',
            region_name=self.region,
            aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key,
            config=botocore_config
        )
        file_path = os.path.join(Config.LOCAL_PATH, q_name + '.csv')

        # Build S3 key
        if datalake_path is None:
            s3_key = q_name + '.csv'
        else:
            s3_key = os.path.join(datalake_path, q_name, q_name + '.csv')

        print(f"[DOWNLOAD_CSV] S3 path: s3://{self.datalake}/{s3_key}")

        try:
            # PHASE 1 OPTIMIZATION: Get file size and create transfer config
            file_size = self._get_s3_file_size(s3_client, self.datalake, s3_key)

            transfer_config = None
            if use_multipart and file_size and file_size > 50 * 1024 * 1024:  # > 50MB
                transfer_config = self._create_transfer_config(file_size, s3_client)
                print(f"[DOWNLOAD_CSV] Using multipart transfer | Size: {file_size / (1024*1024):.2f} MB")
            elif file_size:
                print(f"[DOWNLOAD_CSV] Using single-part transfer | Size: {file_size / (1024*1024):.2f} MB")

            # Download file with optimized config
            download_start = time.time()
            if transfer_config:
                s3_client.download_file(
                    Bucket=self.datalake,
                    Key=s3_key,
                    Filename=file_path,
                    Config=transfer_config
                )
            else:
                s3_client.download_file(self.datalake, s3_key, file_path)

            download_time = time.time() - download_start

            # Get file size if not already obtained
            if not file_size:
                file_size = os.path.getsize(file_path)

            speed_mbps = (file_size / (1024*1024)) / download_time if download_time > 0 else 0
            print(f"[DOWNLOAD_CSV] Download completed: {download_time:.2f}s | Size: {file_size / (1024*1024):.2f} MB ({speed_mbps:.2f} MB/s)")

            # PHASE 2 OPTIMIZATION: Get optimized read parameters
            read_params = self._get_optimized_read_params(file_size, date_cols)

            # Read CSV with optimizations
            read_start = time.time()

            # Build read arguments, user params override optimizations
            read_kwargs = {
                'filepath_or_buffer': file_path,
                'sep': sep,
                'index_col': index_col,
                'usecols': usecols,
                'nrows': num_records,
                'dayfirst': dayfirst,
                'compression': compression,
                'encoding': encoding,
                'parse_dates': date_cols,
                'thousands': thousands,
                'decimal': decimal,
                'dtype': types
            }

            # Add optimized params (user's low_memory takes precedence)
            if low_memory is True:
                read_kwargs['low_memory'] = read_params.get('low_memory', True)
            else:
                read_kwargs['low_memory'] = low_memory

            # Add other optimization params that don't conflict with user params
            for key, value in read_params.items():
                if key not in ['low_memory', '_suggest_iterator'] and key not in read_kwargs:
                    read_kwargs[key] = value

            df = pd.read_csv(**read_kwargs)

            read_time = time.time() - read_start
            print(f"[DOWNLOAD_CSV] Read completed: {read_time:.2f}s | Shape: {df.shape[0]:,} rows x {df.shape[1]} cols")

        except ClientError as err:
            print(f"[DOWNLOAD_CSV] ERROR: Connection failed")
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
            raise
        except FileNotFoundError as err:
            print(f"[DOWNLOAD_CSV] ERROR: File not found")
            self.logger.exception(f'No csv file found. Please check paths: {err}')
            raise
        finally:
            # Clean up temporary file
            if os.path.exists(file_path):
                try:
                    os.remove(file_path)
                except:
                    pass

        total_time = time.time() - start_time
        print(f"[DOWNLOAD_CSV] Total time: {total_time:.2f}s (download: {download_time:.2f}s, read: {read_time:.2f}s)")

        return df

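    # Usage sketch (hypothetical names and sizes, illustrative only): with a datalake
    # key of as-is/Q/Q.csv around 200 MB, a call such as
    # io.download_csv(q_name='Q', datalake_path='as-is', date_cols=['Date'])
    # takes the multipart branch (16 MB chunks, 16 threads) and then reads the
    # temporary file with the merged read_csv parameters built above.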
    def download_json_file(self, json_name=None, datalake_path=None):
        """
        Return a JSON file downloaded from the datalake

        :param json_name: File name to save dataframe
        :param datalake_path: Path to upload the Q to S3 datalake
        :return response: JSON file contents

        >>>
        >>>
        """
        s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        file_path = os.path.join(Config.LOCAL_PATH, json_name + '.json')
        try:
            if datalake_path is None:
                s3_client.download_file(self.datalake, json_name + '.csv', file_path)
            else:
                s3_client.download_file(self.datalake, os.path.join(datalake_path, json_name + '.json'), file_path)
            with open(file_path, 'r') as json_file:
                response = json.load(json_file)
        except ClientError as err:
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
        except FileNotFoundError as err:
            self.logger.exception(f'No object file found. Please check paths: {err}')
            raise
        return response

    def download_csv_from_bucket(self,
                                 datalake=None,
                                 datalake_path=None,
                                 sep=',',
                                 index_col=None,
                                 usecols=None,
                                 num_records=None,
                                 dayfirst=False,
                                 compression='infer',
                                 encoding='utf-8',
                                 date_cols=None,
                                 types=None,
                                 thousands=None,
                                 decimal='.',
                                 low_memory=True,
                                 use_multipart=True):
        """Return a dataframe from a file stored in a S3 bucket

        OPTIMIZED VERSION with S3 Multipart Transfer support for faster downloads.

        :param datalake: S3 bucket name
        :param datalake_path: Path to download the file from the bucket. Do not include datalake name. Default None.
        :param sep: Field delimiter of the downloaded file. Default ','
        :param index_col: Column(s) to use as the row labels of the DataFrame, either given as string name or column index.
        :param usecols: Columns to use in returning dataframe.
        :param num_records: Number of records to fetch from the source. Default None
        :param dayfirst: DD/MM format dates, international and European format. Default False
        :param compression: For on-the-fly decompression of on-disk data. Default 'infer'
        :param encoding: Encoding to use for UTF when reading/writing. Default 'utf-8'
        :param low_memory: Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. Default True
        :param date_cols: List of date columns to parse as datetime type. Default None
        :param types: Dict with data columns as keys and data types as values. Default None
        :param thousands: Thousands separator
        :param decimal: Decimal separator. Default '.'
        :param use_multipart: Enable S3 multipart transfer for files >50MB. Default True
        :return df: Dataframe containing the data from the file stored in the bucket

        >>> df = download_csv_from_bucket(datalake='my-bucket', datalake_path='as-is/folder/file.csv')
        >>> df
                var1    var2    var3
        idx0       1       2       3
        """
        start_time = time.time()
        print(f"[DOWNLOAD_CSV_BUCKET] Starting: {datalake_path}")

        # Configure boto3 client with larger connection pool for multipart transfers
        botocore_config = BotocoreConfig(
            max_pool_connections=100,  # Increased from default 10 to support high concurrency
            retries={'max_attempts': 3, 'mode': 'adaptive'}
        )
        s3_client = boto3.client(
            's3',
            region_name=self.region,
            aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key,
            config=botocore_config
        )
        file_path = os.path.join(Config.LOCAL_PATH, 'object.csv')

        print(f"[DOWNLOAD_CSV_BUCKET] S3 path: s3://{datalake}/{datalake_path}")

        try:
            # PHASE 1 OPTIMIZATION: Get file size and create transfer config
            file_size = self._get_s3_file_size(s3_client, datalake, datalake_path)

            transfer_config = None
            if use_multipart and file_size and file_size > 50 * 1024 * 1024:  # > 50MB
                transfer_config = self._create_transfer_config(file_size, s3_client)
                print(f"[DOWNLOAD_CSV_BUCKET] Using multipart transfer | Size: {file_size / (1024*1024):.2f} MB")
            elif file_size:
                print(f"[DOWNLOAD_CSV_BUCKET] Using single-part transfer | Size: {file_size / (1024*1024):.2f} MB")

            # Download file with optimized config
            download_start = time.time()
            if transfer_config:
                s3_client.download_file(
                    Bucket=datalake,
                    Key=datalake_path,
                    Filename=file_path,
                    Config=transfer_config
                )
            else:
                s3_client.download_file(datalake, datalake_path, file_path)

            download_time = time.time() - download_start

            # Get file size if not already obtained
            if not file_size:
                file_size = os.path.getsize(file_path)

            speed_mbps = (file_size / (1024*1024)) / download_time if download_time > 0 else 0
            print(f"[DOWNLOAD_CSV_BUCKET] Download completed: {download_time:.2f}s | Size: {file_size / (1024*1024):.2f} MB ({speed_mbps:.2f} MB/s)")

            # PHASE 2 OPTIMIZATION: Get optimized read parameters
            read_params = self._get_optimized_read_params(file_size, date_cols)

            # Read CSV with optimizations
            read_start = time.time()

            # Build read arguments, user params override optimizations
            read_kwargs = {
                'filepath_or_buffer': file_path,
                'sep': sep,
                'index_col': index_col,
                'usecols': usecols,
                'nrows': num_records,
                'dayfirst': dayfirst,
                'compression': compression,
                'encoding': encoding,
                'parse_dates': date_cols,
                'thousands': thousands,
                'decimal': decimal,
                'dtype': types
            }

            # Add optimized params (user's low_memory takes precedence)
            if low_memory is True:
                read_kwargs['low_memory'] = read_params.get('low_memory', True)
            else:
                read_kwargs['low_memory'] = low_memory

            # Add other optimization params that don't conflict with user params
            for key, value in read_params.items():
                if key not in ['low_memory', '_suggest_iterator'] and key not in read_kwargs:
                    read_kwargs[key] = value

            df = pd.read_csv(**read_kwargs)

            read_time = time.time() - read_start
            print(f"[DOWNLOAD_CSV_BUCKET] Read completed: {read_time:.2f}s | Shape: {df.shape[0]:,} rows x {df.shape[1]} cols")

        except ClientError as err:
            print(f"[DOWNLOAD_CSV_BUCKET] ERROR: Connection failed")
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
            raise
        except FileNotFoundError as err:
            print(f"[DOWNLOAD_CSV_BUCKET] ERROR: File not found")
            self.logger.exception(f'No object file found. Please check paths: {err}')
            raise
        finally:
            # Clean up temporary file
            if os.path.exists(file_path):
                try:
                    os.remove(file_path)
                except:
                    pass

        total_time = time.time() - start_time
        print(f"[DOWNLOAD_CSV_BUCKET] Total time: {total_time:.2f}s (download: {download_time:.2f}s, read: {read_time:.2f}s)")

        return df

    def download_object_csv(self,
                            datalake_path=None,
                            sep=',',
                            index_col=None,
                            usecols=None,
                            num_records=None,
                            dayfirst=False,
                            compression='infer',
                            encoding='utf-8',
                            date_cols=None,
                            types=None,
                            thousands=None,
                            decimal='.',
                            low_memory=True):
        """Return a dataframe from a file stored in the datalake

        :param datalake_path: Path to download the file from the S3 datalake. Do not include datalake name. Default None.
        :param sep: Field delimiter of the downloaded file. Default ','
        :param index_col: Column(s) to use as the row labels of the DataFrame, either given as string name or column index.
        :param usecols: Columns to use in returning dataframe.
        :param num_records: Number of records to fetch from the source. Default None
        :param dayfirst: DD/MM format dates, international and European format. Default False
        :param compression: For on-the-fly decompression of on-disk data. Default 'infer'
        :param encoding: Encoding to use for UTF when reading/writing. Default 'utf-8'
        :param low_memory: Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. Default True
        :param date_cols: List of date columns to parse as datetime type. Default None
        :param types: Dict with data columns as keys and data types as values. Default None
        :param thousands: Thousands separator
        :param decimal: Decimal separator. Default '.'
        :return df: Dataframe containing the data from the file stored in the datalake

        >>> df = download_object_csv(datalake_path='as-is/folder/file.txt')
        >>> df
                var1    var2    var3
        idx0       1       2       3
        """
        s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        file_path = os.path.join(Config.LOCAL_PATH, 'object.dat')
        try:
            s3_client.download_file(self.datalake, os.path.join(datalake_path), file_path)
            df = pd.read_csv(file_path,
                             sep=sep,
                             index_col=index_col,
                             usecols=usecols,
                             nrows=num_records,
                             dayfirst=dayfirst,
                             compression=compression,
                             encoding=encoding,
                             low_memory=low_memory,
                             thousands=thousands,
                             parse_dates=date_cols,
                             decimal=decimal,
                             dtype=types)
        except ClientError as err:
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
        except FileNotFoundError as err:
            self.logger.exception(f'No object file found. Please check paths: {err}')
            raise
        return df

    def download_txt(self,
                     q_name,
                     datalake_path=None,
                     sep='\t',
                     index_col=None,
                     usecols=None,
                     num_records=None,
                     dayfirst=False,
                     compression='infer',
                     encoding='utf-8',
                     date_cols=None,
                     types=None,
                     thousands=None,
                     low_memory=True,
                     decimal='.'):
        """Return a dataframe from a csv file stored in the datalake

        :param q_name: Plain file (.txt) to download and stored in a dataframe
        :param datalake_path: Path to download the file from the S3 datalake. Default None.
        :param sep: Field delimiter of the downloaded file. Default '\t'
        :param index_col: Column(s) to use as the row labels of the DataFrame, either given as string name or column index.
        :param usecols: Columns to use in returning dataframe.
        :param num_records: Number of records to fetch from the source. Default None
        :param dayfirst: DD/MM format dates, international and European format. Default False
        :param compression: For on-the-fly decompression of on-disk data. Default 'infer'
        :param encoding: Encoding to use for UTF when reading/writing. Default 'utf-8'
        :param date_cols: List of date columns to parse as datetime type. Default None
        :param types: Dict with data columns as keys and data types as values. Default None
        :param thousands: Thousands separator.
        :param decimal: Decimal separator. Default '.'
        :param low_memory: Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. Default True
        :return df: Dataframe containing the data from the file stored in the datalake

        >>> df = download_txt(q_name='Q', datalake_path='as-is/folder')
        >>> df
                var1    var2    var3
        idx0       1       2       3
        """
        s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        file_path = os.path.join(Config.LOCAL_PATH, q_name + '.txt')
        try:
            if datalake_path is None:
                s3_client.download_file(self.datalake, q_name + '.txt', file_path)
            else:
                s3_client.download_file(self.datalake, os.path.join(datalake_path, q_name + '.txt'), file_path)

            df = pd.read_csv(file_path,
                             sep=sep,
                             index_col=index_col,
                             usecols=usecols,
                             nrows=num_records,
                             dayfirst=dayfirst,
                             compression=compression,
                             encoding=encoding,
                             low_memory=low_memory,
                             parse_dates=date_cols,
                             thousands=thousands,
                             decimal=decimal,
                             dtype=types)
        except ClientError as err:
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
        except FileNotFoundError as err:
            self.logger.exception(f'No csv file found. Please check paths: {err}')
            raise
        return df

    def download_all_objects_csv(self,
                                 datalake_path=None,
                                 sep=',',
                                 index_col=None,
                                 num_records=None,
                                 dayfirst=False,
                                 compression='infer',
                                 encoding='utf-8',
                                 low_memory=True,
                                 date_cols=None,
                                 types=None,
                                 thousands=None,
                                 decimal='.'):
        """Return a dataframe concatenating all csv objects stored under a path in the datalake

        :param datalake_path: Prefix to download the files from the S3 datalake. Do not include datalake name. Default None.
        :param sep: Field delimiter of the downloaded files. Default ','
        :param index_col: Column(s) to use as the row labels of the DataFrame, either given as string name or column index.
        :param num_records: Number of records to fetch from each source file. Default None
        :param dayfirst: DD/MM format dates, international and European format. Default False
        :param compression: For on-the-fly decompression of on-disk data. Default 'infer'
        :param encoding: Encoding to use for UTF when reading/writing. Default 'utf-8'
        :param low_memory: Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. Default True
        :param date_cols: List of date columns to parse as datetime type. Default None
        :param types: Dict with data columns as keys and data types as values. Default None
        :param thousands: Thousands separator
        :param decimal: Decimal separator. Default '.'
        :return df: Dataframe containing the concatenated data from the files stored in the datalake

        >>> df = download_all_objects_csv(datalake_path='as-is/folder/file')
        >>> df
              var1  var2  var3
        idx0     1     2     3
        """
        s3_resource = boto3.resource('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)

        try:
            df = pd.DataFrame()
            datalake = s3_resource.Bucket(self.datalake)
            objects = datalake.objects.filter(Prefix=datalake_path)
            for obj in objects:
                path, filename = os.path.split(obj.key)
                if filename != '_SUCCESS' and filename != '_CHECK':
                    datalake.download_file(obj.key, os.path.join('/tmp', filename))
                    df_tmp = pd.read_csv(os.path.join('/tmp', filename),
                                         sep=sep,
                                         index_col=index_col,
                                         nrows=num_records,
                                         dayfirst=dayfirst,
                                         compression=compression,
                                         encoding=encoding,
                                         low_memory=low_memory,
                                         parse_dates=date_cols,
                                         thousands=thousands,
                                         decimal=decimal,
                                         dtype=types)
                    df = pd.concat([df, df_tmp], axis='rows').drop_duplicates()
        except ClientError as err:
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
        except FileNotFoundError as err:
            self.logger.exception(f'No object file found. Please check paths: {err}')
            raise
        return df

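    # Usage sketch for download_all_objects_csv (illustrative only: the config file
    # name, the 'as-is/sales/' prefix and the column names below are hypothetical,
    # not part of this module):
    #
    #   io = IO_Optimized(config_file='config.yml', logfile='io.log', log_path='logs/')
    #   df = io.download_all_objects_csv(datalake_path='as-is/sales/',
    #                                    date_cols=['Fecha'], types={'Item': str})
    #
    # Every non-marker object under the prefix is read and concatenated, so the
    # files are expected to share the same delimiter and schema.
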
    def download_dynamodb(self, table_name, tenant_id):
        """
        Return the records fetched from DynamoDB

        :param table_name: Table name in DynamoDB table
        :param tenant_id: Partition column mapping tenant's ID to whom belongs the records
        :return items: List of records fetched from DynamoDB
        >>> df = download_dynamodb(table_name='sampleTbl', tenant_id='1234')
        >>> df =
              tenantId  Date        Attr
        idx0  A121      2020-12-01  3
        """
        dydb_client = boto3.client('dynamodb', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        dynamodb_session = Session(aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key, region_name=self.region)
        dydb = dynamodb_session.resource('dynamodb')
        try:
            dynamo_tbl = dydb.Table(table_name)
            response = dynamo_tbl.query(
                KeyConditionExpression=Key('tenantId').eq(md5(tenant_id.encode('utf-8')).hexdigest()) &
                                       Key('Fecha').between('2010-01-01', '2025-12-31')
            )
            items = response['Items']
        except dydb_client.exceptions.ResourceNotFoundException as err:
            print(f'Table not found. Please check names: {err}')
            return False
        return items

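    # Note on download_dynamodb: the query is keyed on the md5 of tenant_id and the
    # hard-coded 'Fecha' range above, and a single DynamoDB Query call returns at
    # most 1 MB of data. A hedged sketch of draining the remaining pages (the
    # variable names below are illustrative, not part of this module):
    #
    #   resp = dynamo_tbl.query(KeyConditionExpression=cond)
    #   items = resp['Items']
    #   while 'LastEvaluatedKey' in resp:
    #       resp = dynamo_tbl.query(KeyConditionExpression=cond,
    #                               ExclusiveStartKey=resp['LastEvaluatedKey'])
    #       items.extend(resp['Items'])
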
    def download_excel(self,
                       q_name,
                       sheet_name,
                       datalake_path=None,
                       index_col=None,
                       usecols=None,
                       num_records=None,
                       date_cols=None,
                       types=None,
                       header_=0,
                       skiprows_=None):
        """Return a dataframe from an Excel file stored in the datalake

        :param q_name: Excel file to download and store in a dataframe. Include extension xls, xlsx, ods, etc.
        :param sheet_name: Excel sheet to download and store in a dataframe
        :param datalake_path: Path to download the file from the S3 datalake. Default None.
        :param index_col: Column(s) to use as the row labels of the DataFrame, either given as string name or column index.
        :param usecols: Columns to use in returning dataframe.
        :param num_records: Number of records to fetch from the source. Default None
        :param date_cols: List of date columns to parse as datetime type. Default None
        :param types: Dict with data columns as keys and data types as values. Default None
        :return df: Dataframe containing the data from the file stored in the datalake

        >>> df = download_excel(q_name='Q', sheet_name='sheet1', datalake_path='as-is/folder')
        >>> df
              var1  var2  var3
        idx0     1     2     3
        """
        start_time = time.time()
        print(f"[DOWNLOAD_EXCEL] Starting: {q_name} | Sheet: {sheet_name}")

        s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        file_path = os.path.join(Config.LOCAL_PATH, q_name)

        # Build S3 key
        if datalake_path is None:
            s3_key = q_name
        else:
            s3_key = os.path.join(datalake_path, q_name)

        print(f"[DOWNLOAD_EXCEL] S3 path: s3://{self.datalake}/{s3_key}")

        try:
            # Download file
            download_start = time.time()
            s3_client.download_file(self.datalake, s3_key, file_path)
            download_time = time.time() - download_start

            # Get file size
            file_size = os.path.getsize(file_path)
            speed_mbps = (file_size / (1024*1024)) / download_time if download_time > 0 else 0
            print(f"[DOWNLOAD_EXCEL] Download completed: {download_time:.2f}s | Size: {file_size / (1024*1024):.2f} MB ({speed_mbps:.2f} MB/s)")

            # Read Excel file
            read_start = time.time()
            df = pd.read_excel(file_path,
                               sheet_name=sheet_name,
                               index_col=index_col,
                               usecols=usecols,
                               engine='openpyxl',
                               nrows=num_records,
                               parse_dates=date_cols,
                               dtype=types,
                               header=header_,
                               skiprows=skiprows_)
            df = df.dropna(how='all')

            read_time = time.time() - read_start
            print(f"[DOWNLOAD_EXCEL] Read completed: {read_time:.2f}s | Shape: {df.shape[0]:,} rows x {df.shape[1]} cols")

        except ClientError as err:
            print("[DOWNLOAD_EXCEL] ERROR: Connection failed")
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
            raise
        except FileNotFoundError as err:
            print("[DOWNLOAD_EXCEL] ERROR: File or sheet not found")
            self.logger.exception(f'No excel file or sheet name found. Please check paths: {err}')
            raise
        finally:
            # Clean up temporary file
            if os.path.exists(file_path):
                try:
                    os.remove(file_path)
                except OSError:
                    pass

        total_time = time.time() - start_time
        print(f"[DOWNLOAD_EXCEL] Total time: {total_time:.2f}s (download: {download_time:.2f}s, read: {read_time:.2f}s)")

        return df

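    # Usage sketch for download_excel (the file and path below are hypothetical):
    #
    #   df = io.download_excel(q_name='sales.xlsx', sheet_name='Sheet1',
    #                          datalake_path='as-is/folder',
    #                          date_cols=['Fecha'], skiprows_=2)
    #
    # The workbook is staged under Config.LOCAL_PATH, parsed with openpyxl and
    # removed afterwards, so xlsx-compatible files are expected here.
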
    def download_excel_from_bucket(self,
                                   datalake=None,
                                   datalake_path=None,
                                   sheet_name=0,
                                   index_col=None,
                                   usecols=None,
                                   num_records=None,
                                   date_cols=None,
                                   types=None,
                                   header_=0,
                                   skiprows_=None):
        """Return a dataframe from an Excel file stored in a S3 bucket

        :param datalake: S3 bucket name
        :param datalake_path: Path to download the file from the bucket. Do not include datalake name. Default None.
        :param sheet_name: Excel sheet to download and store in a dataframe
        :param index_col: Column(s) to use as the row labels of the DataFrame, either given as string name or column index.
        :param usecols: Columns to use in returning dataframe.
        :param num_records: Number of records to fetch from the source. Default None
        :param date_cols: List of date columns to parse as datetime type. Default None
        :param types: Dict with data columns as keys and data types as values. Default None
        :return df: Dataframe containing the data from the file stored in the datalake

        >>> df = download_excel_from_bucket(datalake='my-bucket', datalake_path='as-is/folder/file.xlsx')
        >>> df
              var1  var2  var3
        idx0     1     2     3
        """
        start_time = time.time()
        print(f"[DOWNLOAD_EXCEL_BUCKET] Starting: {datalake_path} | Sheet: {sheet_name}")

        s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        file_path = os.path.join(Config.LOCAL_PATH, 'object.xlsx')

        print(f"[DOWNLOAD_EXCEL_BUCKET] S3 path: s3://{datalake}/{datalake_path}")

        try:
            # Download file
            download_start = time.time()
            s3_client.download_file(datalake, datalake_path, file_path)
            download_time = time.time() - download_start

            # Get file size
            file_size = os.path.getsize(file_path)
            speed_mbps = (file_size / (1024*1024)) / download_time if download_time > 0 else 0
            print(f"[DOWNLOAD_EXCEL_BUCKET] Download completed: {download_time:.2f}s | Size: {file_size / (1024*1024):.2f} MB ({speed_mbps:.2f} MB/s)")

            # Read Excel file
            read_start = time.time()
            df = pd.read_excel(file_path,
                               sheet_name=sheet_name,
                               index_col=index_col,
                               usecols=usecols,
                               engine='openpyxl',
                               nrows=num_records,
                               parse_dates=date_cols,
                               dtype=types,
                               header=header_,
                               skiprows=skiprows_)
            df = df.dropna(how='all')

            read_time = time.time() - read_start
            print(f"[DOWNLOAD_EXCEL_BUCKET] Read completed: {read_time:.2f}s | Shape: {df.shape[0]:,} rows x {df.shape[1]} cols")

        except ClientError as err:
            print("[DOWNLOAD_EXCEL_BUCKET] ERROR: Connection failed")
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
            raise
        except FileNotFoundError as err:
            print("[DOWNLOAD_EXCEL_BUCKET] ERROR: File not found")
            self.logger.exception(f'No object file found. Please check paths: {err}')
            raise
        finally:
            # Clean up temporary file
            if os.path.exists(file_path):
                try:
                    os.remove(file_path)
                except OSError:
                    pass

        total_time = time.time() - start_time
        print(f"[DOWNLOAD_EXCEL_BUCKET] Total time: {total_time:.2f}s (download: {download_time:.2f}s, read: {read_time:.2f}s)")

        return df

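    # Usage sketch for download_excel_from_bucket, which reads from an arbitrary
    # bucket instead of self.datalake (bucket and key below are hypothetical):
    #
    #   df = io.download_excel_from_bucket(datalake='my-bucket',
    #                                      datalake_path='as-is/folder/file.xlsx',
    #                                      sheet_name='Sheet1')
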
    def download_xml(self, url_=None, header_=None, body_=None):
        """Return a response in XML format from a SOAP web service

        :param url_: URL endpoint to access the SOAP web service
        :param header_: HTTP headers for the SOAP request
        :param body_: SOAP envelope body of the request
        :return response: Plain text with the XML data

        address = 'http://200.200.200.200:81/service.asmx'
        headers = {'Content-Type':'text/xml;charset=UTF-8'}
        body = '''<soapenv:Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/" xmlns:tem="http://tempuri.org/">
                    <soapenv:Header/>
                    <soapenv:Body>
                        <tem:EjecutarConsultaXML>
                            <!--Optional:-->
                            <tem:pvstrxmlParametros>
                                <![CDATA[
                                <Consulta>
                                    <NombreConexion>Datup_Real</NombreConexion>
                                    <IdCia>2</IdCia>
                                    <IdProveedor>Analytics</IdProveedor>
                                    <IdConsulta>CONSULTA_VENTAS</IdConsulta>
                                    <Usuario>myuser</Usuario>
                                    <Clave>mypassword</Clave>
                                    <Parametros>
                                        <p_periodo_ini>202105</p_periodo_ini>
                                        <p_periodo_fin>202105</p_periodo_fin>
                                    </Parametros>
                                </Consulta>]]>
                            </tem:pvstrxmlParametros>
                        </tem:EjecutarConsultaXML>
                    </soapenv:Body>
                </soapenv:Envelope>'''

        >>> response = download_xml(url_=address, header_=headers, body_=body)
        >>> response =
        '<?xml version="1.0" encoding="utf-8"?><soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xmlns:xsd="http://www.w3.org/2001/XMLSchema"><soap:Body><EjecutarConsultaXMLResponse xmlns="http://tempuri.org/"><EjecutarConsultaXMLResult><xs:schema id="NewDataSet"
        xmlns="" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata"><xs:element name="NewDataSet" msdata:IsDataSet="true"
        msdata:UseCurrentLocale="true"><xs:complexType><xs:choice minOccurs="0" maxOccurs="unbounded"><xs:element name="Resultado"><xs:complexType><xs:sequence><xs:element
        name="Compañia" type="xs:short" minOccurs="0" /><xs:element name="Llave_x0020_Documento" type="xs:int" minOccurs="0"'
        """
        try:
            r = requests.post(url_, headers=header_, data=body_, allow_redirects=True)
            response = r.text
        except requests.exceptions.HTTPError as err:
            self.logger.exception(f'Http error: {err}')
            raise
        except requests.exceptions.ConnectionError as err:
            self.logger.exception(f'Error connecting: {err}')
            raise
        except requests.exceptions.Timeout as err:
            self.logger.exception(f'Timeout error: {err}')
            raise
        except requests.exceptions.RequestException as err:
            self.logger.exception(f'Unexpected request error: {err}')
            raise
        return response

    def download_parquet(self, q_name, datalake_path=None, columns=None, engine='pyarrow', filters=None):
        """Return a dataframe from a parquet file stored in the datalake

        :param q_name: File name (without extension) to download and store in a dataframe.
        :param datalake_path: Path to download the file from the S3 datalake. Default None.
        :param columns: Subset of columns to read from the Parquet file. Default None (reads all columns).
        :param engine: Engine to use for reading Parquet files. Default 'pyarrow'.
        :param filters: Filters to apply to the Parquet file rows while reading. Default None.
        :return df: DataFrame containing the data from the Parquet file stored in the datalake.

        >>> df = download_parquet(q_name='Q', datalake_path='as-is/folder')
        >>> df
              var1  var2  var3
        idx0     1     2     3
        """
        start_time = time.time()
        print(f"[DOWNLOAD_PARQUET] Starting: {q_name}.parquet | Engine: {engine}")

        s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        file_path = os.path.join(Config.LOCAL_PATH, q_name + '.parquet')

        # Build S3 key
        if datalake_path is None:
            s3_key = q_name + '.parquet'
        else:
            s3_key = os.path.join(datalake_path, q_name, q_name + '.parquet')

        print(f"[DOWNLOAD_PARQUET] S3 path: s3://{self.datalake}/{s3_key}")

        try:
            # Download the Parquet file from S3
            download_start = time.time()
            s3_client.download_file(self.datalake, s3_key, file_path)
            download_time = time.time() - download_start

            # Get file size
            file_size = os.path.getsize(file_path)
            speed_mbps = (file_size / (1024*1024)) / download_time if download_time > 0 else 0
            print(f"[DOWNLOAD_PARQUET] Download completed: {download_time:.2f}s | Size: {file_size / (1024*1024):.2f} MB ({speed_mbps:.2f} MB/s)")

            # Read the Parquet file into a DataFrame
            read_start = time.time()
            df = pd.read_parquet(file_path, columns=columns, engine=engine, filters=filters)

            read_time = time.time() - read_start
            print(f"[DOWNLOAD_PARQUET] Read completed: {read_time:.2f}s | Shape: {df.shape[0]:,} rows x {df.shape[1]} cols")

        except ClientError as err:
            print("[DOWNLOAD_PARQUET] ERROR: Connection failed")
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
            raise
        except FileNotFoundError as err:
            print("[DOWNLOAD_PARQUET] ERROR: File not found")
            self.logger.exception(f'No Parquet file found. Please check paths: {err}')
            raise
        except Exception as e:
            print(f"[DOWNLOAD_PARQUET] ERROR: {e}")
            self.logger.exception(f'Failed to read the Parquet file: {e}')
            raise
        finally:
            # Clean up temporary file
            if os.path.exists(file_path):
                try:
                    os.remove(file_path)
                except OSError:
                    pass

        total_time = time.time() - start_time
        print(f"[DOWNLOAD_PARQUET] Total time: {total_time:.2f}s (download: {download_time:.2f}s, read: {read_time:.2f}s)")

        return df

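    # Usage sketch for download_parquet. With the default pyarrow engine, `columns`
    # and `filters` are pushed down to the reader, so only the requested data is
    # loaded into memory (column names and filter values below are hypothetical):
    #
    #   df = io.download_parquet(q_name='Qfcst', datalake_path='output/forecast',
    #                            columns=['Date', 'Item', 'Target'],
    #                            filters=[('Date', '>=', '2024-01-01')])
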
    def download_models(self, datalake_path=None):
        """Return True after successfully downloading the n_backtests models trained by the attup model

        :param datalake_path: Path to download the files from the S3 datalake. Default None.
        :return: True if success, else False.

        >>> models = download_models(datalake_path='path/to/data')
        >>> True
        """
        s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        for i in range(self.backtests + 1):
            q_name = "model" + str(i)
            file_path = os.path.join(Config.LOCAL_PATH, q_name + '.h5')
            print(file_path)
            try:
                if datalake_path is None:
                    s3_client.download_file(self.datalake, q_name + '.h5', file_path)
                else:
                    s3_client.download_file(self.datalake, os.path.join(datalake_path, "models", q_name + '.h5'), file_path)
            except ClientError as err:
                self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
            except FileNotFoundError as err:
                self.logger.exception(f'No model file found. Please check paths: {err}')
                raise
        return True

    def download_models_tft(self, datalake_path=None):
        """Return True after successfully downloading the n_backtests models trained by the attup model

        :param datalake_path: Path to download the files from the S3 datalake. Default None.
        :return: True if success, else False.

        >>> models = download_models_tft(datalake_path='path/to/data')
        >>> True
        """
        s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        for i in range(self.backtests + 1):
            q_name = "model" + str(i)
            file_path = os.path.join(Config.LOCAL_PATH, q_name + '.ckpt')
            print(file_path)
            try:
                if datalake_path is None:
                    s3_client.download_file(self.datalake, q_name + '.ckpt', file_path)
                else:
                    s3_client.download_file(self.datalake, os.path.join(datalake_path, "models", q_name + '.ckpt'), file_path)
            except ClientError as err:
                self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
            except FileNotFoundError as err:
                self.logger.exception(f'No model file found. Please check paths: {err}')
                raise
        return True

    def upload_csv(self, df, q_name, datalake_path, sep=',', encoding='utf-8', date_format='%Y-%m-%d', lineterminator=None):
        """Return a success or failure boolean attempting to upload a local file to the datalake

        :param df: Dataframe to upload
        :param q_name: File name to save dataframe
        :param datalake_path: Path to upload the Q to S3 datalake
        :param sep: Field delimiter for the output file. Default ','
        :param date_format: Format string for datetime objects of output file. Default '%Y-%m-%d'
        :param encoding: A string representing the encoding to use in the output file. Default 'utf-8'
        :param lineterminator: Line terminator for the output file. Default None
        :return: True if success, else False.

        >>> upload_csv(df=df, q_name='Q', datalake_path='as-is/folder')
        >>> True
        """
        file_path = os.path.join(Config.LOCAL_PATH, q_name + '.csv')
        df.to_csv(file_path, sep=sep, encoding=encoding, date_format=date_format, index=False, lineterminator=lineterminator)
        s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        try:
            response = s3_client.upload_file(file_path, self.datalake, os.path.join(datalake_path, q_name, q_name + '.csv'))
        except ClientError as err:
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
            return False
        except FileNotFoundError as err:
            self.logger.exception(f'No csv file found. Please check paths: {err}')
            return False
        return True

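    # Usage sketch for upload_csv (the path below is hypothetical). Note that the
    # object is written to '{datalake_path}/{q_name}/{q_name}.csv' in self.datalake:
    #
    #   ok = io.upload_csv(df, q_name='Qprep', datalake_path='output/prepare')
    #   # -> s3://<datalake>/output/prepare/Qprep/Qprep.csv
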
    def upload_dynamodb(self, df, table_name, tenant_id, sort_col):
        """
        Return a success or failure boolean attempting to upload timeseries data to DynamoDB

        :param df: Dataframe to upload to DynamoDB table
        :param table_name: Table name in DynamoDB table
        :param tenant_id: Partition column mapping tenant's ID to whom belongs the records
        :param sort_col: Sorting column mapping usually to date column
        :return: True if success, else False.

        >>> upload_dynamodb(df=df, table_name='sampleTbl', tenant_id='acme', sort_col='Date')
        >>> True
        """
        dydb_client = boto3.client('dynamodb', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        dynamodb_session = Session(aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key, region_name=self.region)
        dydb = dynamodb_session.resource('dynamodb')
        try:
            dynamo_tbl = dydb.Table(table_name)
            with dynamo_tbl.batch_writer() as batch:
                for row in df.itertuples(index=False):
                    record = {}
                    record.update({'tenantId': md5(tenant_id.encode('utf-8')).hexdigest()})
                    record.update({sort_col: row[0].strftime('%Y-%m-%d')})
                    for ix, rec in enumerate(row[1:]):
                        record.update({df.columns[ix + 1]: Decimal(str(rec))})
                    batch.put_item(Item=record)
        except dydb_client.exceptions.ResourceNotFoundException as err:
            print(f'Table not found. Please check names: {err}')
            return False
        return True

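    # Usage note for upload_dynamodb: the first dataframe column is treated as the
    # sort key and must be datetime-like (it is written with strftime), while the
    # remaining columns are cast to Decimal, so they must be numeric. Hypothetical
    # call:
    #
    #   ok = io.upload_dynamodb(df, table_name='tblForecast',
    #                           tenant_id='acme', sort_col='Date')
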
    def upload_json(self, df, q_name=None, datalake_path=None, orient_=None, date_format_=None, date_unit_='s', compression_=None, indent_=4):
        """
        Return a success or failure response after attempting to upload a dataframe in JSON format

        :param df: Dataframe to upload in JSON format
        :param q_name: File name to save dataframe
        :param datalake_path: Path to upload the Q to S3 datalake
        :param orient_: Expected JSON string format. Possible values split, records, index, table, columns, values
        :param date_format_: Type of date conversion. epoch = epoch milliseconds, iso = ISO8601.
        :param date_unit_: The time unit to encode to, governs timestamp and ISO8601 precision, e.g. s, ms, us, ns.
        :param compression_: A string representing the compression to use in the output file, e.g. gzip, bz2, zip, xz.
        :param indent_: Length of whitespace used to indent each record. Default 4.
        :return response: Success or failure uploading the dataframe

        >>> upload_json(df, q_name='Qtest', orient_='columns')
        >>> True
        """
        s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        file_path = os.path.join(Config.LOCAL_PATH, q_name + '.json')
        try:
            df.to_json(file_path, orient=orient_, date_format=date_format_, date_unit=date_unit_, compression=compression_, indent=indent_)
            response = s3_client.upload_file(file_path, self.datalake, os.path.join(datalake_path, q_name + '.json'))
        except ClientError as err:
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
            return False
        except FileNotFoundError as err:
            self.logger.exception(f'No JSON file found. Please check paths: {err}')
            return False
        return True

    def upload_json_file(self, message=None, json_name=None, datalake_path=None, indent_=4):
        """
        Return a success or failure response after attempting to upload a JSON file

        :param message: Dict type to convert to JSON and upload to datalake
        :param json_name: File name to save the JSON file
        :param datalake_path: Path to upload the Q to S3 datalake
        :param indent_: Length of whitespace used to indent each record. Default 4.
        :return: True if success, else False.

        >>> upload_json_file(message=resp_dict, json_name='myjson', datalake_path='/path/to/data')
        >>> True
        """
        s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        file_path = os.path.join(Config.LOCAL_PATH, json_name + '.json')
        try:
            with open(file_path, 'w') as json_file:
                json.dump(message, json_file, indent=indent_)
            s3_client.upload_file(file_path, self.datalake, os.path.join(datalake_path, json_name + '.json'))
        except ClientError as err:
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
            return False
        except FileNotFoundError as err:
            self.logger.exception(f'No JSON file found. Please check paths: {err}')
            return False
        return True

    def upload_timestream(self, df, db_name, table_name):
        """
        Return a success or failure boolean attempting to upload timeseries data to timestream database

        :param df: Dataframe to upload to Timestream table
        :param db_name: Database name in Timestream service
        :param table_name: Table name in Timestream service
        :return response: HTTP status code response. 200 means success, False is returned on failure

        >>> upload_timestream(df=df, db_name='dbSample', table_name='tbSample')
        >>> True
        """
        ts_client = boto3.client('timestream-write',
                                 region_name=self.region,
                                 aws_access_key_id=self.access_key,
                                 aws_secret_access_key=self.secret_key)
        dimensions = [{'Name': 'tenantId', 'Value': '1000', 'DimensionValueType': 'VARCHAR'}]
        records = []
        for row in df.itertuples(index=False):
            for ix, rec in enumerate(row[1:]):
                records.append({
                    'Dimensions': dimensions,
                    'MeasureName': df.columns[ix + 1],
                    'MeasureValue': str(rec),
                    'MeasureValueType': 'DOUBLE',
                    'Time': str(int(pd.to_datetime(row[0]).timestamp())),
                    'TimeUnit': 'SECONDS',
                    'Version': 3
                })
        try:
            response = ts_client.write_records(DatabaseName=db_name, TableName=table_name, Records=records)
            status = response['ResponseMetadata']['HTTPStatusCode']
            print(f'Processed records: {len(records)}. WriteRecords status: {status}')
            self.logger.info(f'Processed records: {len(records)}. WriteRecords status: {status}')
        except ts_client.exceptions.RejectedRecordsException as err:
            print(f'{err}')
            self.logger.exception(f'{err}')
            for e in err.response["RejectedRecords"]:
                print("Rejected Index " + str(e["RecordIndex"]) + ": " + e["Reason"])
                self.logger.exception("Rejected Index " + str(e["RecordIndex"]) + ": " + e["Reason"])
            return False
        except ts_client.exceptions.ValidationException as err:
            print(f"{err.response['Error']['Message']}")
            self.logger.exception(f"{err.response['Error']['Message']}")
            return False
        return status

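    # Usage note for upload_timestream: AWS Timestream rejects WriteRecords calls
    # with more than 100 records, so a large dataframe would need to be written in
    # chunks; a hedged sketch (the chunking loop is illustrative, not part of this
    # module):
    #
    #   for start in range(0, len(records), 100):
    #       ts_client.write_records(DatabaseName=db_name, TableName=table_name,
    #                               Records=records[start:start + 100])
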
    def upload_models(self, datalake_path):
        """Return a success or failure boolean attempting to upload the TensorFlow models to the datalake.

        :param datalake_path: Path to upload the attup trained models to S3 datalake
        :return: True if success, else False.

        >>> upload_models(datalake_path='as-is/folder')
        >>> True
        """
        s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)

        for i in range(self.backtests + 1):
            q_name = "model" + str(i)
            print(q_name)
            file_path = os.path.join(Config.LOCAL_PATH, q_name + '.h5')
            try:
                response = s3_client.upload_file(file_path, self.datalake, os.path.join(datalake_path, "models", q_name + '.h5'))
            except ClientError as err:
                self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
                return False
            except FileNotFoundError as err:
                self.logger.exception(f'No model file found. Please check paths: {err}')
                return False
        return True

    def upload_models_tft(self, q_name, datalake_path):
        """Return a success or failure boolean attempting to upload the TensorFlow models to the datalake.

        :param q_name: Model file name to upload, without the .ckpt extension
        :param datalake_path: Path to upload the attup trained models to S3 datalake
        :return: True if success, else False.

        >>> upload_models_tft(q_name='model0', datalake_path='as-is/folder')
        >>> True
        """
        s3_client = boto3.client('s3', region_name=self.region, aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        print(q_name)
        file_path = os.path.join(Config.LOCAL_PATH, q_name + '.ckpt')
        try:
            response = s3_client.upload_file(file_path, self.datalake, os.path.join(datalake_path, "models", q_name + '.ckpt'))
        except ClientError as err:
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
            return False
        except FileNotFoundError as err:
            self.logger.exception(f'No model file found. Please check paths: {err}')
            return False
        return True

    def upload_object(self, datalake_name=None, datalake_path='', object_name=None):
        """Return a success or failure boolean attempting to upload a local file to the datalake

        :param datalake_name: S3 bucket name (datalake) to upload the object
        :param datalake_path: Path to upload the Q to S3 datalake
        :param object_name: Object name to upload to the S3 bucket (datalake)
        :return: True if success, else False.

        >>> upload_object(datalake_name='datup-datalake-datup', datalake_path='path/to/data', object_name='datup.dat')
        >>> True
        """
        file_path = os.path.join(Config.LOCAL_PATH, object_name)
        s3_client = boto3.client('s3', region_name='us-east-1', aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        try:
            response = s3_client.upload_file(file_path, datalake_name, os.path.join(datalake_path, object_name))
        except ClientError as err:
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
            return False
        except FileNotFoundError as err:
            self.logger.exception(f'No object file found. Please check paths: {err}')
            return False
        return True

    def upload_log(self):
        """Return a success or failure boolean attempting to upload the local log file to the datalake

        The log is uploaded to the path given by self.log_path.
        :return: True if success, else False.

        >>> upload_log()
        >>> True
        """
        file_path = os.path.join(Config.LOCAL_PATH, self.logfile)
        s3_client = boto3.client('s3', region_name='us-east-1', aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        try:
            response = s3_client.upload_file(file_path, self.datalake, os.path.join(self.log_path, self.logfile))
        except ClientError as err:
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
            return False
        except FileNotFoundError as err:
            self.logger.exception(f'No log file found. Please check paths: {err}')
            return False
        return True

    def upload_parquet(self, df, q_name, datalake_path, compression='snappy', engine='pyarrow'):
        """Return a success or failure boolean attempting to upload a local parquet file to the datalake

        :param df: Dataframe to upload
        :param q_name: File name to save dataframe
        :param datalake_path: Path to upload the file to S3 datalake
        :param compression: Compression to use in the parquet file. Default 'snappy'
        :param engine: Engine to use for writing parquet files. Default 'pyarrow'
        :return: True if success, else False.

        >>> upload_parquet(df=df, q_name='Q', datalake_path='as-is/folder')
        >>> True
        """
        file_path = os.path.join(Config.LOCAL_PATH, q_name + '.parquet')

        print(f'Compression: {compression}')
        print(f'Engine: {engine}')

        # Save DataFrame as Parquet file
        try:
            df.to_parquet(file_path, engine=engine, compression=compression, index=False)
        except Exception as e:
            self.logger.exception(f'Failed to save the DataFrame as a Parquet file: {e}')
            return False

        s3_client = boto3.client(
            's3',
            region_name=self.region,
            aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key
        )
        try:
            # Upload the Parquet file to S3
            response = s3_client.upload_file(
                file_path,
                self.datalake,
                os.path.join(datalake_path, q_name, q_name + '.parquet')
            )
        except ClientError as err:
            self.logger.exception(f'No connection to the datalake. Please check the paths: {err}')
            return False
        except FileNotFoundError as err:
            self.logger.exception(f'No Parquet file found. Please check paths: {err}')
            return False

        return True

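    # Usage sketch for upload_parquet (the path below is hypothetical). The key
    # layout '{datalake_path}/{q_name}/{q_name}.parquet' mirrors what
    # download_parquet expects, so the two methods can round-trip a dataframe:
    #
    #   ok = io.upload_parquet(df, q_name='Qfcst', datalake_path='output/forecast')
    #   df2 = io.download_parquet(q_name='Qfcst', datalake_path='output/forecast')
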
    def copy_between_datalakes(self, q_name=None, src_datalake=None, src_path=None, dst_datalake=None, dst_path=None):
        """
        Return True whether successful copy between datalake buckets occurs

        :param q_name: File or dataset name including the type or extension
        :param src_datalake: Source datalake's bucket name
        :param src_path: Source datalake's key path, excluding dataset name
        :param dst_datalake: Destination datalake's bucket name
        :param dst_path: Destination datalake's key path, excluding dataset name
        :return: True if success, else False.

        >>> copy_between_datalakes(q_name='mycube', src_datalake='bucket-a', src_path='path/to/file', dst_datalake='bucket-b', dst_path='path/to/file')
        >>> True
        """
        s3_client = boto3.resource('s3',
                                   region_name='us-east-1',
                                   aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
        try:
            copy_source = {'Bucket': src_datalake, 'Key': os.path.join(src_path, q_name)}
            # Split on the last dot so dataset names containing dots keep their full stem
            filename, filetype = q_name.rsplit('.', 1)
            if filetype == 'csv':
                s3_client.meta.client.copy(copy_source, dst_datalake, os.path.join(dst_path, filename, filename + '.' + filetype))
            elif filetype.lower() in ('xls', 'xlsx'):
                s3_client.meta.client.copy(copy_source, dst_datalake, os.path.join(dst_path, filename + '.' + filetype))
            else:
                self.logger.debug(f'No valid dataset type. Please check database or datalake to debug.')
        except FileNotFoundError as err:
            self.logger.exception(f'No file or datalake found. Please check paths: {err}')
            return False
        return True

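    # Usage sketch for copy_between_datalakes (bucket names and paths below are
    # hypothetical). CSV objects are copied into a '<name>/<name>.csv' folder on
    # the destination, while Excel objects keep a flat key:
    #
    #   io.copy_between_datalakes(q_name='demand.csv', src_datalake='bucket-a',
    #                             src_path='incoming', dst_datalake='bucket-b',
    #                             dst_path='as-is/demand')
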
    def download_bucket_last_excel_file(self,
                                        bucket_name,
                                        folder,
                                        datalake=None,
                                        sheet_name=0,
                                        index_col=None,
                                        usecols=None,
                                        num_records=None,
                                        date_cols=None,
                                        types=None,
                                        header_=0,
                                        skiprows_=None):
        """
        Download the most recent (last modified) Excel file from a folder in an
        S3 bucket, regardless of the file name.
        Input:
            - bucket_name: S3 bucket name.
            - folder: Folder in the bucket where the file is searched for.
            - datalake: Datalake name where the file is searched for.
            - sheet_name: Name or index of the sheet to load.
            - index_col: Name or index of the column to use as index.
            - usecols: Columns to select.
            - num_records: Number of records to load.
            - date_cols: Columns to parse as dates.
            - types: Data types of the columns.
            - header_: Row to use as header.
            - skiprows_: Number of rows to skip.
        Output:
            - df: Pandas DataFrame with the data from the Excel file.
        """
        # Set up the S3 client (uses the default boto3 credential chain)
        s3 = boto3.client('s3')

        # List the files in the specified folder
        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder)

        # Check whether any files were found
        if 'Contents' in response:
            # Filter the Excel files
            archivos_excel = [obj for obj in response['Contents'] if obj['Key'].endswith('.xlsx') or obj['Key'].endswith('.xls')]
            file_list = pd.DataFrame(archivos_excel)
            print('Files found:\n', file_list[['Key', 'LastModified']])
            if archivos_excel:
                # Find the most recent file
                most_recent_file = max(archivos_excel, key=lambda x: x['LastModified'])
                archivo_s3 = most_recent_file['Key']
                print(f'Most recent file found: {archivo_s3}')

                # Load the Excel file into a pandas DataFrame
                df = self.download_excel_from_bucket(datalake=datalake,
                                                     datalake_path=archivo_s3,
                                                     sheet_name=sheet_name,
                                                     index_col=index_col,
                                                     usecols=usecols,
                                                     num_records=num_records,
                                                     date_cols=date_cols,
                                                     types=types,
                                                     header_=header_,
                                                     skiprows_=skiprows_)
                print(f'File loaded: {archivo_s3} \n')
                return df
            else:
                print('No Excel files were found in the specified folder.')
                return None
        else:
            print('No files were found in the specified folder.')
            return None

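    # Usage sketch for download_bucket_last_excel_file (bucket and folder below are
    # hypothetical). Note that bucket_name is only used to list the candidates; the
    # actual download is delegated to download_excel_from_bucket with `datalake`:
    #
    #   df = io.download_bucket_last_excel_file(bucket_name='my-bucket',
    #                                           folder='as-is/uploads/',
    #                                           datalake='my-bucket',
    #                                           sheet_name='Sheet1')
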
    def rename_and_upload_delta_hist_file(self,
                                          df,
                                          prefix='DEMAND',
                                          col_date='Fecha',
                                          datalake_path='dev/raw/as-is/forecast/historic_data',
                                          sep=',',
                                          encoding='utf-8',
                                          date_format='%Y-%m-%d',
                                          lineterminator=None):
        """
        Rename and upload the file with the prefix and YYYYMM date to the datalake.
        Input:
            - df: DataFrame to upload.
            - prefix: Prefix for the file name.
            - col_date: Column name with the date.
            - datalake_path: Path in the datalake to upload the file.
            - sep: Separator for the CSV file.
            - encoding: Encoding for the CSV file.
            - date_format: Date format for the CSV file.
            - lineterminator: Line terminator for the CSV file.
        Output:
            - return: True if success, else False.
        """
        df[col_date] = pd.to_datetime(df[col_date])
        date_min = df[col_date].min()
        date_max = df[col_date].max()

        date_min = str(date_min)[0:7].replace('-', '')
        date_max = str(date_max)[0:7].replace('-', '')

        print(f'Minimum date: {date_min}')
        print(f'Maximum date: {date_max}')

        if date_min == date_max:
            print(f'The month and year of the min and max dates match. Saving file as: {prefix}{date_min}.csv to the datalake path: {datalake_path}')
            self.upload_csv(df, q_name=prefix+date_min,
                            datalake_path=datalake_path,
                            sep=sep,
                            encoding=encoding,
                            date_format=date_format,
                            lineterminator=lineterminator)
        else:
            print('The month and year of the min and max dates differ. Please review the data.')
            return False
        return True
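    # Usage sketch for rename_and_upload_delta_hist_file (the dataframe below is
    # hypothetical). A frame whose 'Fecha' column spans a single month, e.g.
    # 2024-05, is uploaded as DEMAND202405.csv under the configured datalake path:
    #
    #   ok = io.rename_and_upload_delta_hist_file(df, prefix='DEMAND',
    #                                             col_date='Fecha')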