ipulse-shared-core-ftredge 2.6.1__py3-none-any.whl → 2.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of ipulse-shared-core-ftredge might be problematic; see the registry's advisory page for details.
- ipulse_shared_core_ftredge/__init__.py +10 -9
- ipulse_shared_core_ftredge/enums/__init__.py +12 -7
- ipulse_shared_core_ftredge/enums/enums_common_utils.py +9 -0
- ipulse_shared_core_ftredge/enums/enums_data_eng.py +280 -76
- ipulse_shared_core_ftredge/enums/{enums_logs.py → enums_logging.py} +30 -1
- ipulse_shared_core_ftredge/enums/enums_module_fincore.py +16 -2
- ipulse_shared_core_ftredge/enums/enums_modules.py +6 -0
- ipulse_shared_core_ftredge/enums/{enums_cloud.py → enums_solution_providers.py} +11 -4
- ipulse_shared_core_ftredge/utils/__init__.py +11 -7
- ipulse_shared_core_ftredge/utils/logs/context_log.py +2 -3
- ipulse_shared_core_ftredge/utils/logs/get_logger.py +47 -20
- ipulse_shared_core_ftredge/utils/utils_cloud.py +26 -17
- ipulse_shared_core_ftredge/utils/utils_cloud_gcp.py +311 -180
- ipulse_shared_core_ftredge/utils/utils_cloud_gcp_with_collectors.py +150 -153
- ipulse_shared_core_ftredge/utils/utils_cloud_with_collectors.py +16 -15
- ipulse_shared_core_ftredge/utils/utils_collector_pipelinemon.py +2 -2
- ipulse_shared_core_ftredge/utils/utils_common.py +145 -110
- ipulse_shared_core_ftredge/utils/utils_templates_and_schemas.py +2 -2
- {ipulse_shared_core_ftredge-2.6.1.dist-info → ipulse_shared_core_ftredge-2.7.1.dist-info}/METADATA +1 -1
- ipulse_shared_core_ftredge-2.7.1.dist-info/RECORD +33 -0
- ipulse_shared_core_ftredge-2.6.1.dist-info/RECORD +0 -33
- {ipulse_shared_core_ftredge-2.6.1.dist-info → ipulse_shared_core_ftredge-2.7.1.dist-info}/LICENCE +0 -0
- {ipulse_shared_core_ftredge-2.6.1.dist-info → ipulse_shared_core_ftredge-2.7.1.dist-info}/WHEEL +0 -0
- {ipulse_shared_core_ftredge-2.6.1.dist-info → ipulse_shared_core_ftredge-2.7.1.dist-info}/top_level.txt +0 -0
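Two enum modules are renamed in this release (enums_logs.py becomes enums_logging.py, and enums_cloud.py becomes enums_solution_providers.py), and the reworked Google Cloud utilities shown in the hunks below pull several enums and helpers from the package. The sketch below simply mirrors the new import block that appears in the diff; treating these names as importable this way in downstream code is an assumption.

```python
# Import surface used by the new GCP utilities in 2.7.1, mirrored from the
# import block in the diff below. Whether downstream code should import the
# names from these exact module paths is an assumption.
from ipulse_shared_core_ftredge.enums import (
    DuplicationHandling,        # OVERWRITE / INCREMENT / SKIP / RAISE_ERROR
    DuplicationHandlingStatus,
    MatchConditionType,         # EXACT / PREFIX
    DataSourceType,
    LogLevel,
)
from ipulse_shared_core_ftredge.utils.logs import ContextLog
from ipulse_shared_core_ftredge.utils.utils_collector_pipelinemon import Pipelinemon
```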
@@ -11,76 +11,29 @@ from io import StringIO
 import os
 import time
 import logging
-from typing import Optional
 import traceback
 from google.api_core.exceptions import NotFound
 from google.cloud import error_reporting
 from google.cloud import logging as cloud_logging
 from google.cloud.storage import Client as GCSClient
 from google.cloud import bigquery
+from ipulse_shared_core_ftredge.enums import DuplicationHandling, DuplicationHandlingStatus, MatchConditionType,DataSourceType, LogLevel
+from ipulse_shared_core_ftredge.utils.utils_common import log_error, log_warning, log_info
+from ipulse_shared_core_ftredge.utils.logs import ContextLog
+from ipulse_shared_core_ftredge.utils.utils_collector_pipelinemon import Pipelinemon

 ############################################################################
-##################### GOOGLE CLOUD
+##################### GOOGLE CLOUD PLATFORM UTILS ##################################
 ############################################################################

-
-
-
-
-
+class CustomGCPLoggingHandler(cloud_logging.handlers.CloudLoggingHandler):
+    """Custom handler for Google Cloud Logging with a dynamic logName."""
+    def __init__(self, client, name, resource=None, labels=None):
+        super().__init__(client=client, name=name, resource=resource, labels=labels)
+        self.client = client # Ensure client is consistently used

-def
-
-        logger.warning(message)
-    elif print_out:
-        print(message)
-
-def log_info(message, logger=None, print_out=False):
-    if logger:
-        logger.info(message)
-    elif print_out:
-        print(message)
-
-
-
-############################################################################
-##################### LOGGING and ERROR reporting ##########################
-####DEPCREACATED: THIS APPROACH WAS GOOD, BUT ERRORS WERE NOT REPORTED TO ERROR REPORTING
-# logging.basicConfig(level=logging.INFO)
-# logging_client = google.cloud.logging.Client()
-# logging_client.setup_logging()
-###################################
-def setup_gcp_logging(logger, formatter, enable_error_reporting=True):
-
-    class CustomGCPErrorReportingHandler(logging.Handler):
-        def __init__(self, level=logging.ERROR):
-            super().__init__(level)
-            self.error_client = error_reporting.Client()
-            self.propagate = True
-
-        def emit(self, record):
-            try:
-                if record.levelno >= logging.ERROR:
-                    log_struct = {
-                        'message': self.format(record),
-                        'severity': record.levelname,
-                        'pathname': getattr(record, 'pathname', None),
-                        'lineno': getattr(record, 'lineno', None)
-                    }
-                    if record.exc_info:
-                        log_struct['exception'] = ''.join(
-                            traceback.format_exception(*record.exc_info)
-                        )
-                    self.error_client.report(str(log_struct))
-            except Exception as e:
-                self.handleError(record)
-
-    class CustomGCPLoggingHandler(cloud_logging.handlers.CloudLoggingHandler):
-        """Custom handler for Google Cloud Logging with a dynamic logName."""
-        def __init__(self, client, name, resource=None, labels=None):
-            super().__init__(client=client, name=name, resource=resource, labels=labels)
-
-        def emit(self, record):
+    def emit(self, record):
+        try:
             # 1. Create the basic log entry dictionary
             log_entry = {
                 'message': record.msg,
@@ -109,17 +62,45 @@ def setup_gcp_logging(logger, formatter, enable_error_reporting=True):

             # 4. Send to Google Cloud Logging
             super().emit(record)
+        except Exception as e:
+            self.handleError(record)
+
+class CustomGCPErrorReportingHandler(logging.Handler):
+    def __init__(self, client=None, level=logging.ERROR):
+        super().__init__(level)
+        self.error_client = error_reporting.Client() if client is None else client
+        self.propagate = True
+
+    def emit(self, record):
+        try:
+            if record.levelno >= logging.ERROR:
+                log_struct = {
+                    'message': self.format(record),
+                    'severity': record.levelname,
+                    'pathname': getattr(record, 'pathname', None),
+                    'lineno': getattr(record, 'lineno', None)
+                }
+                if record.exc_info:
+                    log_struct['exception'] = ''.join(
+                        traceback.format_exception(*record.exc_info)
+                    )
+                self.error_client.report(str(log_struct))
+        except Exception as e:
+            self.handleError(record)
+

-
-
-
-
-
+def add_gcp_cloud_logging(logger, formatter, client=None):
+    """Sets up Google Cloud Logging for the logger."""
+    client = client or cloud_logging.Client()
+    handler = CustomGCPLoggingHandler(client, logger.name)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)

-
-
-
-
+def add_gcp_error_reporting(logger, client=None):
+    """Sets up Google Cloud Error Reporting for the logger."""
+    client = client or error_reporting.Client()
+    handler = CustomGCPErrorReportingHandler(client=client)
+    logger.addHandler(handler)


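The two hunks above replace the nested setup_gcp_logging helper with module-level CustomGCPLoggingHandler and CustomGCPErrorReportingHandler classes plus two small attach functions. A minimal usage sketch, assuming the functions are importable from the rewritten GCP utils module and that default Google Cloud credentials are available; the logger name and formatter are illustrative:

```python
import logging

# Module path assumed; the functions themselves are defined in the diff above.
from ipulse_shared_core_ftredge.utils.utils_cloud_gcp import (
    add_gcp_cloud_logging,
    add_gcp_error_reporting,
)

logger = logging.getLogger("my-pipeline")  # illustrative name
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s")

# Attach the handlers introduced in this release.
add_gcp_cloud_logging(logger, formatter)   # builds cloud_logging.Client() when none is passed
add_gcp_error_reporting(logger)            # builds error_reporting.Client() when none is passed

logger.info("goes to Cloud Logging")
logger.error("ERROR and above is also reported to Error Reporting")
```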
@@ -142,13 +123,13 @@ def read_json_from_gcs(storage_client:GCSClient, bucket_name:str, file_name:str,
         data = json.loads(data_string)
         return data
     except NotFound:
-
+        log_warning(msg=f"Warning: The file {file_name} was not found in the bucket {bucket_name}.", logger=logger, print_out=print_out)
         return None
     except json.JSONDecodeError:
-        log_error(
+        log_error(msg=f"Error: The file {file_name} could not be decoded as JSON.", logger=logger, print_out=print_out)
         return None
     except Exception as e:
-        log_error(
+        log_error(msg=f"An unexpected error occurred: {e}", exc_info=True, logger=logger, print_out=print_out)
         return None

 def read_csv_from_gcs(bucket_name:str, file_name:str, storage_client:GCSClient, logger=None, print_out=False):
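read_json_from_gcs (and read_csv_from_gcs in the next hunk) now route their failure paths through log_warning and log_error and return None instead of leaving the calls truncated. A small call sketch, assuming the function is importable from the same GCP utils module; bucket and object names are illustrative:

```python
import logging
from google.cloud.storage import Client as GCSClient

# Module path assumed.
from ipulse_shared_core_ftredge.utils.utils_cloud_gcp import read_json_from_gcs

logger = logging.getLogger(__name__)
storage_client = GCSClient()

payload = read_json_from_gcs(
    storage_client=storage_client,
    bucket_name="my-bucket",             # illustrative
    file_name="config/settings.json",    # illustrative
    logger=logger,
    print_out=False,
)
if payload is None:
    # NotFound, JSONDecodeError and unexpected errors are logged and swallowed.
    logger.warning("Falling back to defaults")
```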
@@ -162,114 +143,137 @@ def read_csv_from_gcs(bucket_name:str, file_name:str, storage_client:GCSClient,
         reader = csv.DictReader(data_file)
         return list(reader)
     except NotFound:
-
+        log_warning(msg=f"Error: The file {file_name} was not found in the bucket {bucket_name}.", logger=logger, print_out=print_out)
         return None
     except csv.Error:
-        log_error(
+        log_error(msg=f"Error: The file {file_name} could not be read as CSV.", logger=logger, print_out=print_out)
         return None
     except Exception as e:
-        log_error(
+        log_error(msg=f"An unexpected error occurred: {e}", logger=logger, print_out=print_out, exc_info=True)
         return None

+def write_json_to_gcs_extended(storage_client: GCSClient, data: dict | list | str, bucket_name: str, file_name: str,
+                               duplication_handling_enum: DuplicationHandling, duplication_match_condition_type_enum: MatchConditionType,
+                               duplication_match_condition: str = "", max_retries: int = 2, max_deletable_files: int = 1,
+                               logger=None, print_out=False, raise_e=False, pipelinemon: Pipelinemon = None):

+    """Saves data to Google Cloud Storage with optional Pipelinemon monitoring.

-
-    file_exists_if_starts_with_prefix:Optional[str] =None, overwrite_if_exists:bool=False, increment_if_exists:bool=False,
-    max_retries:int=2, max_deletable_files:int=1, logger=None, print_out=False):
-
-    """Saves data to Google Cloud Storage and optionally locally.
-
-    This function attempts to upload data to GCS.
-    - If the upload fails after retries and `save_locally` is True or `local_path` is provided, it attempts to save the data locally.
-    - It handles file name conflicts based on these rules:
-        - If `overwrite_if_exists` is True:
-            - If `file_exists_if_contains_substr` is provided, ANY existing file containing the substring is deleted, and the new file is saved with the provided `file_name`.
-            - If `file_exists_if_contains_substr` is None, and a file with the exact `file_name` exists, it's overwritten.
-        - If `increment_if_exists` is True:
-            - If `file_exists_if_contains_substr` is provided, a new file with an incremented version is created ONLY if a file with the EXACT `file_name` exists.
-            - If `file_exists_if_contains_substr` is None, a new file with an incremented version is created if a file with the exact `file_name` exists.
-
-    -If both overwrite_if_exists and increment_if_exists are provided as Ture, an exception will be raised.
+    Handles duplication with strategies: OVERWRITE, INCREMENT, SKIP, or RAISE_ERROR.
     """
-
-
-
-        raise ValueError("Both 'overwrite_if_exists' and 'increment_if_exists' cannot be True simultaneously.")
-    if not isinstance(data, (list, dict, str)):
-        raise ValueError("Data should be a list, dict, or string.")
-    if max_deletable_files > 10:
-        raise ValueError("max_deletable_files should be less than 10 for safety. For more use another method.")
-
-    # Prepare data
-    if isinstance(data, (list, dict)):
-        data_str = json.dumps(data, indent=2)
-    else:
-        data_str = data
-
-    bucket = storage_client.bucket(bucket_name)
-    base_file_name, ext = os.path.splitext(file_name)
-    increment = 0
-    attempts = 0
-    success = False
+
+    max_deletable_files_allowed = 3
+    cloud_storage_ref=DataSourceType.GCS.value

     # GCS-related metadata
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    saved_to_path = None
+    matched_duplicates_count = 0
+    matched_duplicates_deleted = []
+    duplication_handling_status = None
+    error_during_operation = None
+
+    response = {
+        "saved_to_path": saved_to_path,
+        "matched_duplicates_count": matched_duplicates_count,
+        "matched_duplicates_deleted": matched_duplicates_deleted,
+        "duplication_handling_status": duplication_handling_status,
+        "duplication_match_condition_type": duplication_match_condition_type_enum.value,
+        "duplication_match_condition": duplication_match_condition,
+        "error_during_operation": error_during_operation
+    }
+
+    supported_match_condition_types = [MatchConditionType.EXACT, MatchConditionType.PREFIX]
+    supported_duplication_handling = [DuplicationHandling.RAISE_ERROR, DuplicationHandling.OVERWRITE, DuplicationHandling.INCREMENT, DuplicationHandling.SKIP]
+
+    try:
+        if max_deletable_files > max_deletable_files_allowed:
+            raise ValueError(f"max_deletable_files should be less than or equal to {max_deletable_files_allowed} for safety.")
+        if duplication_handling_enum not in supported_duplication_handling:
+            msg = f"Error: Duplication handling not supported. Supported types: {[dh.value for dh in supported_duplication_handling]}"
+            raise ValueError(msg)
+        if duplication_match_condition_type_enum not in supported_match_condition_types:
+            msg = f"Error: Match condition type not supported. Supported types: {[mct.value for mct in supported_match_condition_types]}"
+            raise ValueError(msg)
+        elif duplication_match_condition_type_enum != MatchConditionType.EXACT and not duplication_match_condition:
+            msg = f"Error: Match condition is required for match condition type: {duplication_match_condition_type_enum.value}"
+            raise ValueError(msg)
+
+        # Prepare data
+        if isinstance(data, (list, dict)):
+            data_str = json.dumps(data, indent=2)
+        else:
+            data_str = data
+
+        increment = 0
+        attempts = 0
+        success = False
+
+        # Check for existing files based on duplication_match_condition_type
+        files_matched_on_condition = []
+        bucket = storage_client.bucket(bucket_name)
+        base_file_name, ext = os.path.splitext(file_name)
+        if duplication_match_condition_type_enum == MatchConditionType.PREFIX:
+            files_matched_on_condition = list(bucket.list_blobs(prefix=duplication_match_condition))
+        elif duplication_match_condition_type_enum == MatchConditionType.EXACT:
+            duplication_match_condition = file_name if not duplication_match_condition else duplication_match_condition
+            if bucket.blob(duplication_match_condition).exists():
+                files_matched_on_condition = [bucket.blob(file_name)]
+
+        matched_duplicates_count = len(files_matched_on_condition)
+        response["matched_duplicates_count"] = matched_duplicates_count
+
+        # Handle duplication based on duplication_handling
+        if matched_duplicates_count:
+            log_msg = f"Duplicate FOUND, matched_duplicates_count: {matched_duplicates_count}"
+            if pipelinemon:
+                pipelinemon.add_log(ContextLog(LogLevel.NOTICE_ALREADY_EXISTS, subject="duplicate_found", description=log_msg))
+
+            if duplication_handling_enum == DuplicationHandling.RAISE_ERROR:
+                raise FileExistsError("File(s) matching the condition already exist.")
+
+            if duplication_handling_enum == DuplicationHandling.SKIP:
+                response["duplication_handling_status"] = DuplicationHandlingStatus.SKIPPED.value
+                log_msg = f"SKIPPING, response: {response}"
+                log_info(log_msg, logger=logger, print_out=print_out) ## only logsor prints if logger is provided and print_out is True
+                return response
+
+            if duplication_handling_enum == DuplicationHandling.OVERWRITE:
+                if matched_duplicates_count > max_deletable_files:
+                    raise ValueError(f"Error: Attempt to delete {matched_duplicates_count} matched files, but limit is {max_deletable_files}. Operation Cancelled.")
+
+                for blob in files_matched_on_condition:
+                    cloud_storage_path_to_delete = f"gs://{bucket_name}/{blob.name}"
                     blob.delete()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    blobs_matched = list(bucket.list_blobs(prefix=file_exists_if_starts_with_prefix))
-    cloud_storage_file_exists_checked_on_name = file_exists_if_starts_with_prefix
-    if blobs_matched:
-        upload_allowed = False
-        cloud_storage_file_already_exists = True
-    elif bucket.blob(file_name).exists():
-        upload_allowed = False
-        cloud_storage_file_already_exists = True
-
-    # --- GCS Upload ---
-    cloud_storage_path = f"gs://{bucket_name}/{file_name}"
-    if overwrite_if_exists or increment_if_exists or upload_allowed:
+                    matched_duplicates_deleted.append(cloud_storage_path_to_delete)
+                    log_msg = f"File deleted as part of overwrite: {cloud_storage_path_to_delete}"
+                    if pipelinemon:
+                        pipelinemon.add_system_impacted(f"delete: {cloud_storage_ref}_bucket_file: {cloud_storage_path_to_delete}")
+                        pipelinemon.add_log(ContextLog(LogLevel.INFO_REMOTE_DELETE_COMPLETE, subject="delete_duplicate", description=log_msg))
+                    log_info(log_msg, logger=logger, print_out=print_out)
+
+                response["matched_duplicates_deleted"] = matched_duplicates_deleted
+                response["duplication_handling_status"] = DuplicationHandlingStatus.OVERWRITTEN.value
+
+            elif duplication_handling_enum == DuplicationHandling.INCREMENT:
+                while bucket.blob(file_name).exists():
+                    increment += 1
+                    file_name = f"{base_file_name}_v{increment}{ext}"
+                saved_to_path = f"gs://{bucket_name}/{file_name}"
+                response["duplication_handling_status"] = DuplicationHandlingStatus.INCREMENTED.value
+                log_msg = "INCREMENTING as Duplicate FOUND "
+                log_info(log_msg, logger=logger, print_out=print_out) ## only logsor prints if logger is provided and print_out is True
+
+        # GCS Upload
+        saved_to_path = f"gs://{bucket_name}/{file_name}"
         while attempts < max_retries and not success:
             try:
-                blob = bucket.blob(file_name)
+                blob = bucket.blob(file_name)
                 blob.upload_from_string(data_str, content_type='application/json')
+                log_msg = f"File uploaded to GCS: {saved_to_path}"
+                if pipelinemon:
+                    pipelinemon.add_system_impacted(f"upload: {cloud_storage_ref}_bucket_file: {saved_to_path}")
+                    pipelinemon.add_log(ContextLog(LogLevel.INFO_REMOTE_PERSISTNACE_COMPLETE, subject="file_upload", description=log_msg))
+                log_info(log_msg, logger=logger, print_out=print_out)
                 success = True
             except Exception as e:
                 attempts += 1
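The rewritten write_json_to_gcs_extended drops the old overwrite_if_exists / increment_if_exists flags in favour of explicit DuplicationHandling and MatchConditionType enums, and it now returns a response dictionary rather than only logging. A call sketch under the signature added above; bucket, object name and the chosen strategy are illustrative:

```python
from google.cloud.storage import Client as GCSClient

from ipulse_shared_core_ftredge.enums import DuplicationHandling, MatchConditionType
# Module path assumed for the function itself.
from ipulse_shared_core_ftredge.utils.utils_cloud_gcp import write_json_to_gcs_extended

storage_client = GCSClient()

response = write_json_to_gcs_extended(
    storage_client=storage_client,
    data={"rows": [1, 2, 3]},                                    # dict | list | str
    bucket_name="my-bucket",                                     # illustrative
    file_name="exports/daily.json",                              # illustrative
    duplication_handling_enum=DuplicationHandling.OVERWRITE,
    duplication_match_condition_type_enum=MatchConditionType.PREFIX,
    duplication_match_condition="exports/daily",                 # prefix counted as a duplicate
    max_deletable_files=3,                                       # hard-capped at 3 by the function
    logger=None,
    print_out=True,
    raise_e=False,                                               # errors land in the response instead
    pipelinemon=None,                                            # optional Pipelinemon collector
)
```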
@@ -278,18 +282,142 @@ def write_json_to_gcs_extended( storage_client:GCSClient, data:dict | list | str
                 else:
                     raise e

-
-
-"
-
-
-
-
-
-
-
-
-
+    except Exception as e:
+        error_during_operation = f"Error occurred while writing JSON to GCS path: {saved_to_path} ; Error details: {type(e).__name__} - {str(e)}"
+        response["error_during_operation"] = error_during_operation
+        if pipelinemon:
+            pipelinemon.add_log(ContextLog(LogLevel.ERROR_EXCEPTION, e=e, description="response: {response}"))
+        log_error(response, logger=logger, print_out=print_out)
+        if raise_e:
+            raise e
+
+    response["saved_to_path"] = saved_to_path if success else None
+    return response
+
+# def write_json_to_gcs_extended(storage_client: GCSClient, data: dict | list | str, bucket_name: str, file_name: str,
+#                                duplication_handling: DuplicationHandling, duplication_match_condition_type: MatchConditionType,
+#                                duplication_match_condition: str | List[str] = "", max_retries: int = 2, max_deletable_files: int = 1,
+#                                logger=None, print_out=False, raise_e=False):
+
+#     """Saves data to Google Cloud Storage.
+
+#     Handles duplication with strategies: OVERWRITE, INCREMENT, SKIP, or RAISE_ERROR.
+#     """
+
+#     max_deletable_files_allowed = 3
+
+#     # GCS-related metadata
+#     saved_to_path = None
+#     matched_duplicates_count = 0
+#     matched_duplicates_deleted = []
+#     duplication_handling_status = None
+#     error_during_operation = None
+
+#     response = {
+#         "saved_to_path": saved_to_path,
+#         "matched_duplicates_count": matched_duplicates_count,
+#         "matched_duplicates_deleted": matched_duplicates_deleted,
+#         "duplication_handling_status": duplication_handling_status,
+#         "duplication_match_condition_type": duplication_match_condition_type,
+#         "duplication_match_condition": duplication_match_condition,
+#         "error_during_operation": error_during_operation
+#     }
+
+#     supported_match_condition_types = [MatchConditionType.EXACT, MatchConditionType.PREFIX]
+#     supported_duplication_handling = [DuplicationHandling.RAISE_ERROR, DuplicationHandling.OVERWRITE, DuplicationHandling.INCREMENT, DuplicationHandling.SKIP]
+
+#     try:
+#         if max_deletable_files > max_deletable_files_allowed:
+#             raise ValueError(f"max_deletable_files should be less than or equal to {max_deletable_files_allowed} for safety.")
+#         if duplication_handling not in supported_duplication_handling:
+#             msg = f"Error: Duplication handling not supported. Supported types: {supported_duplication_handling}"
+#             raise ValueError(msg)
+#         if duplication_match_condition_type not in supported_match_condition_types:
+#             msg = f"Error: Match condition type not supported. Supported types: {supported_match_condition_types}"
+#             raise ValueError(msg)
+#         elif duplication_match_condition_type!=MatchConditionType.EXACT and not duplication_match_condition:
+#             msg = f"Error: Match condition is required for match condition type: {duplication_match_condition_type}"
+#             raise ValueError(msg)
+
+#         # Prepare data
+#         if isinstance(data, (list, dict)):
+#             data_str = json.dumps(data, indent=2)
+#         else:
+#             data_str = data
+
+#         increment = 0
+#         attempts = 0
+#         success = False
+
+#         # Check for existing files based on duplication_match_condition_type
+#         files_matched_on_condition = []
+#         bucket = storage_client.bucket(bucket_name)
+#         base_file_name, ext = os.path.splitext(file_name)
+#         if duplication_match_condition_type == MatchConditionType.PREFIX:
+#             files_matched_on_condition = list(bucket.list_blobs(prefix=duplication_match_condition))
+#         elif duplication_match_condition_type == MatchConditionType.EXACT:
+#             if bucket.blob(file_name).exists():
+#                 files_matched_on_condition = [bucket.blob(file_name)]
+
+#         matched_duplicates_count = len(files_matched_on_condition)
+#         response["matched_duplicates_count"] = matched_duplicates_count
+
+#         # Handle duplication based on duplication_handling
+#         if matched_duplicates_count:
+#             if duplication_handling == DuplicationHandling.RAISE_ERROR:
+#                 raise FileExistsError("File(s) matching the condition already exist.")
+
+#             if duplication_handling == DuplicationHandling.SKIP:
+#                 log_warning("Skipping saving to GCS: file(s) matching the condition already exist.", logger=logger, print_out=print_out)
+#                 response["duplication_handling_status"] = DuplicationHandlingStatus.SKIPPED.value
+#                 return response
+
+#             if duplication_handling == DuplicationHandling.OVERWRITE:
+#                 if matched_duplicates_count > max_deletable_files:
+#                     raise ValueError(f"Error: Attempt to delete {matched_duplicates_count} matched files, but limit is {max_deletable_files}. Operation Cancelled.")
+
+#                 for blob in files_matched_on_condition:
+#                     cloud_storage_path_to_delete = f"gs://{bucket_name}/{blob.name}"
+#                     blob.delete()
+#                     matched_duplicates_deleted.append(cloud_storage_path_to_delete)
+
+#                 response["matched_duplicates_deleted"] = matched_duplicates_deleted
+#                 response["duplication_handling_status"] = DuplicationHandlingStatus.OVERWRITTEN.value
+
+#             elif duplication_handling == DuplicationHandling.INCREMENT:
+#                 while bucket.blob(file_name).exists():
+#                     increment += 1
+#                     file_name = f"{base_file_name}_v{increment}{ext}"
+#                 saved_to_path = f"gs://{bucket_name}/{file_name}"
+#                 response["duplication_handling_status"] = DuplicationHandlingStatus.INCREMENTED.value
+
+#         # GCS Upload
+#         saved_to_path = f"gs://{bucket_name}/{file_name}"
+#         while attempts < max_retries and not success:
+#             try:
+#                 blob = bucket.blob(file_name)
+#                 blob.upload_from_string(data_str, content_type='application/json')
+#                 success = True
+#             except Exception as e:
+#                 attempts += 1
+#                 if attempts < max_retries:
+#                     time.sleep(2 ** attempts)
+#                 else:
+#                     if raise_e:
+#                         raise e
+
+#     except Exception as e:
+#         error_message = f"Error occurred while writing JSON to GCS path: {saved_to_path} : {type(e).__name__} - {str(e)}"
+#         log_error(error_message, logger=logger, print_out=print_out)
+#         response["error_during_operation"] = error_message
+#         if raise_e:
+#             raise e
+
+#     response["saved_to_path"] = saved_to_path if success else None
+#     return response
+
+
+def write_csv_to_gcs(bucket_name:str, file_name:str, data:dict | list | str, storage_client:GCSClient, logger=None, print_out=False, raise_e=False):
     """ Helper function to write a CSV file to Google Cloud Storage """
     try:
         bucket = storage_client.bucket(bucket_name)
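Because the new implementation catches exceptions and records them in the response unless raise_e is set, callers are expected to inspect the returned dictionary. A short sketch of that check, using only keys that appear in the response built above; the call it follows is the one sketched after the previous hunk:

```python
from ipulse_shared_core_ftredge.enums import DuplicationHandlingStatus

# response = write_json_to_gcs_extended(...)  # as sketched earlier

if response["error_during_operation"]:
    print(f"Upload failed: {response['error_during_operation']}")
elif response["duplication_handling_status"] == DuplicationHandlingStatus.SKIPPED.value:
    print("Duplicate found; nothing was written.")
else:
    print(f"Saved to {response['saved_to_path']}; "
          f"deleted duplicates: {response['matched_duplicates_deleted']}")
```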
@@ -303,9 +431,12 @@ def write_csv_to_gcs(bucket_name:str, file_name:str, data:dict | list | str, sto
         else:
             raise ValueError("Data should be a list of dictionaries")
         blob.upload_from_string(data_file.getvalue(), content_type='text/csv')
-
-        logger.info(f"Successfully wrote CSV to {file_name} in bucket {bucket_name}.")
+        log_info(msg=f"Successfully wrote CSV to {file_name} in bucket {bucket_name}.", logger=logger, print_out=print_out)
     except ValueError as e:
-
+        log_error(msg=f"ValueError: {e}",logger=logger, print_out=print_out)
+        if raise_e:
+            raise e
     except Exception as e:
-
+        log_error(msg=f"An unexpected error occurred while writing CSV to GCS: {e}", logger=logger, print_out=print_out, exc_info=True)
+        if raise_e:
+            raise e