ipulse-shared-core-ftredge 2.55-py3-none-any.whl → 2.57-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ipulse-shared-core-ftredge might be problematic.
- ipulse_shared_core_ftredge/__init__.py +10 -5
- ipulse_shared_core_ftredge/enums/__init__.py +2 -0
- ipulse_shared_core_ftredge/enums/enums_common_utils.py +48 -17
- ipulse_shared_core_ftredge/models/user_profile.py +3 -3
- ipulse_shared_core_ftredge/utils_custom_logs.py +201 -0
- ipulse_shared_core_ftredge/utils_gcp.py +117 -73
- ipulse_shared_core_ftredge/utils_gcp_for_pipelines.py +201 -0
- ipulse_shared_core_ftredge/{utils_common.py → utils_pipelinemon.py} +85 -205
- ipulse_shared_core_ftredge/utils_templates_and_schemas.py +7 -9
- {ipulse_shared_core_ftredge-2.55.dist-info → ipulse_shared_core_ftredge-2.57.dist-info}/METADATA +1 -1
- {ipulse_shared_core_ftredge-2.55.dist-info → ipulse_shared_core_ftredge-2.57.dist-info}/RECORD +14 -14
- {ipulse_shared_core_ftredge-2.55.dist-info → ipulse_shared_core_ftredge-2.57.dist-info}/WHEEL +1 -1
- ipulse_shared_core_ftredge/tests/__init__.py +0 -0
- ipulse_shared_core_ftredge/tests/test.py +0 -17
- {ipulse_shared_core_ftredge-2.55.dist-info → ipulse_shared_core_ftredge-2.57.dist-info}/LICENCE +0 -0
- {ipulse_shared_core_ftredge-2.55.dist-info → ipulse_shared_core_ftredge-2.57.dist-info}/top_level.txt +0 -0
ipulse_shared_core_ftredge/__init__.py
@@ -1,13 +1,18 @@
 from .models import (Organisation, UserAuth, UserProfile,
                      UserStatus, UserProfileUpdate, pulse_enums)
-
-
-
+
+
+from .utils_gcp import (setup_gcp_logger_and_error_report,
+                        read_csv_from_gcs, read_json_from_gcs,
+                        write_csv_to_gcs,write_json_to_gcs)
+from .utils_custom_logs import (ContextLog)
+from .utils_pipelinemon import ( Pipelinemon)
+from .utils_gcp_for_pipelines import (write_json_to_gcs_in_pipeline )
+
 from .utils_templates_and_schemas import (create_bigquery_schema_from_json,
                                           check_format_against_schema_template)
-from .utils_common import (ContextLog, Pipelinemon)
 
-from .enums import (TargetLogs, LogLevel, Unit, Frequency,
+from .enums import (TargetLogs, LogStatus, LogLevel, Unit, Frequency,
                     Module, SubModule, BaseDataCategory,
                     FinCoreCategory, FincCoreSubCategory,
                     FinCoreRecordsCategory, ExchangeOrPublisher,
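For orientation, a minimal sketch (not part of the diff, and not from the package docs): based only on the import lines added above, the 2.57 top-level package should re-export the relocated helpers roughly like this, assuming the wheel and its GCP dependencies are installed.

# Sketch only: names taken from the new __init__.py imports shown above.
from ipulse_shared_core_ftredge import (
    ContextLog,                      # now provided by utils_custom_logs
    Pipelinemon,                     # now provided by utils_pipelinemon (was utils_common)
    write_json_to_gcs,               # re-exported from utils_gcp
    write_json_to_gcs_in_pipeline,   # new, from utils_gcp_for_pipelines
)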
ipulse_shared_core_ftredge/enums/enums_common_utils.py
@@ -2,10 +2,28 @@
 # pylint: disable=missing-module-docstring
 # pylint: disable=missing-function-docstring
 # pylint: disable=missing-class-docstring
+# pylint: disable=line-too-long
 
 from enum import Enum
 
 
+class SystemsImpacted(Enum):
+    NO = "__no"
+    YES = "__yes"
+    INVESTIGATE = "__investigate"
+    MULTIPLE = "__multiple"
+    DB = "db"
+    BQ_TABLE= "bq_table"
+    BQ_TABLES = "bq_tables"
+    GCS_BUCKET = "gcs_bucket"
+    GCS_BUCKETS = "gcs_buckets"
+    GCS_BUCKET_FILE = "gcs_bucket_file"
+    GCS_BUCKET_FILES = "gcs_bucket_files"
+    API = "api"
+    APIS = "apis"
+    LOCAL_FILE = "local_file"
+    LOCAL_FILES = "local_files"
+
 class TargetLogs(Enum):
     MIXED="mixed_logs"
     SUCCESSES = "success_logs"
@@ -15,22 +33,33 @@ class TargetLogs(Enum):
     WARNINGS_AND_ERRORS = "warn_n_err_logs"
     ERRORS = "error_logs"
 
-
 class LogLevel(Enum):
     """
     Standardized notice levels for data engineering pipelines,
    designed for easy analysis and identification of manual
     intervention needs.
     """
-    DEBUG =
+    DEBUG = 10 # Detailed debug information (for development/troubleshooting)
+
+    INFO = 100
+    INFO_REMOTE_PERSISTNACE_COMPLETE= 101
+    INFO_REMOTE_UPDATE_COMPLETE = 102
+    INFO_REMOTE_DELETE_COMPLETE = 103
+
+    INFO_REMOTE_BULK_PERSISTNACE_COMPLETE= 111
+    INFO_REMOTE_BULK_UPDATE_COMPLETE = 112
+    INFO_REMOTE_BULK_DELETE_COMPLETE = 113
+
+    INFO_LOCAL_PERSISTNACE_COMPLETE = 121
 
-    INFO = 200
     SUCCESS = 201
+    SUCCESS_WITH_NOTICES = 211
+    SUCCESS_WITH_WARNINGS = 212
 
     NOTICE = 300 # Maybe same file or data already fully or partially exists
     NOTICE_ALREADY_EXISTS = 301 # Data already exists, no action required
     NOTICE_PARTIAL_EXISTS = 302 # Partial data exists, no action required
-
+    NOTICE_ACTION_CANCELLED = 303 # Data processing cancelled, no action required
 
     # Warnings indicate potential issues that might require attention:
     WARNING = 400 # General warning, no immediate action required
@@ -40,18 +69,22 @@ class LogLevel(Enum):
     WARNING_FIX_REQUIRED = 404 # Action required, pipeline can likely continue
 
     ERROR = 500 # General error, no immediate action required
-    # Errors indicate a problem that disrupts normal pipeline execution:
-    ERROR_EXCEPTION_REDO = 501
-    ERROR_CUSTOM_REDO = 502 # Temporary error, automatic retry likely to succeed
-
-    ERROR_EXCEPTION_INVESTIGATE = 601 # Exception occured after some data was likely persisted (e.g., to GCS or BQ)
-    ERROR_CUSTOM_INVESTIGATE= 602
-    ERROR_EXCEPTION_PERSTISTANCE = 603 # Exception occured after data was persisted (e.g., to GCS or BQ)
-    ERROR_CUSTOM_PERSTISTANCE = 604
 
+    ERROR_EXCEPTION = 501
+    ERROR_CUSTOM = 502 # Temporary error, automatic retry likely to succeed
+    ERROR_OPERATION_PARTIALLY_FAILED = 511 # Partial or full failure, manual intervention required
+    ERROR_OPERATION_FAILED = 512 # Operation failed, manual intervention required
+    ERORR_OPERATION_WITH_WARNINGS = 513 # Partial or full failure, manual intervention required
+    ERORR_OPERATION_WITH_ERRORS = 514 # Partial or full failure, manual intervention required
+    ERORR_OPERATION_WITH_WARNINGS_OR_ERRORS = 515 # Partial or full failure, manual intervention required
+
+    ERROR_THRESHOLD_REACHED = 551
+    ERROR_PIPELINE_THRESHOLD_REACHED = 552 # Error due to threshold reached, no immediate action required
+    ERROR_SUBTHRESHOLD_REACHED = 553 # Error due to threshold reached, no immediate action required
+    ERROR_DATA_QUALITY_THRESHOLD_REACHED = 554 # Error due to threshold reached, no immediate action required
     # Critical errors indicate severe failures requiring immediate attention:
-
-
+    CRITICAL=600 # General critical error, requires immediate action
+    CRITICAL_SYSTEM_FAILURE = 601 # System-level failure (e.g., infrastructure, stackoverflow ), requires immediate action
 
     UNKNOWN=1001 # Unknown error, should not be used in normal operation
 
@@ -63,8 +96,6 @@ class LogStatus(Enum):
     RESOLVED = "resolved"
     IGNORED = "ignored"
     CANCELLED = "cancelled"
-
-
 
     ### Exception during full exection, partially saved
     # Exception during ensemble pipeline; modifications collected in local object , nothing persisted
@@ -143,4 +174,4 @@ class Frequency(Enum):
     THREE_M="3m"
     SIX_M="6m"
     ONE_Y="1y"
-    THREE_Y="3y"
+    THREE_Y="3y"
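For orientation, a small usage sketch (not from the package): how the reworked LogLevel bands and the new SystemsImpacted enum might be combined. Only enum members visible in the hunks above are used; the usage pattern itself is assumed.

# Sketch only: enum members are taken from the hunks above; the usage pattern is assumed.
from ipulse_shared_core_ftredge.enums.enums_common_utils import LogLevel, SystemsImpacted

level = LogLevel.ERROR_OPERATION_FAILED          # 512, in the reworked 5xx error band
if 500 <= level.value < 600:
    impacted = SystemsImpacted.GCS_BUCKET_FILE   # value "gcs_bucket_file"
    print(f"{level.name} ({level.value}) impacted: {impacted.value}")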
ipulse_shared_core_ftredge/models/user_profile.py
@@ -33,9 +33,9 @@ class UserProfile(BaseModel):
     provider_id: str #User can Read only
 
     username: Optional[str] = None #User can Read and Edit
-    dob: Optional[date] = None #User can Read and Edit
-    first_name: Optional[str] = None #User can Read and Edit
-    last_name: Optional[str] = None #User can Read and Edit
+    dob: Optional[date] = None #User can Read and Edit
+    first_name: Optional[str] = None #User can Read and Edit
+    last_name: Optional[str] = None #User can Read and Edit
     mobile: Optional[str] = None #User can Read and Edit
     class Config:
         extra = "forbid"
ipulse_shared_core_ftredge/utils_custom_logs.py (new file)
@@ -0,0 +1,201 @@
+
+# pylint: disable=missing-module-docstring
+# pylint: disable=missing-function-docstring
+# pylint: disable=logging-fstring-interpolation
+# pylint: disable=line-too-long
+# pylint: disable=missing-class-docstring
+# pylint: disable=broad-exception-caught
+import traceback
+import json
+from datetime import datetime, timezone
+from typing import List
+from ipulse_shared_core_ftredge.enums.enums_common_utils import LogLevel, LogStatus
+
+
+class ContextLog:
+
+    def __init__(self, level: LogLevel, base_context: str = None, collector_id: str = None,
+                 context: str = None, description: str = None,
+                 e: Exception = None, e_type: str = None, e_message: str = None, e_traceback: str = None,
+                 log_status: LogStatus = LogStatus.OPEN, subject: str = None, systems_impacted: List[str] = None,
+                 ):
+
+        if e is not None:
+            e_type = type(e).__name__ if e_type is None else e_type
+            e_message = str(e) if e_message is None else e_message
+            e_traceback = traceback.format_exc() if e_traceback is None else e_traceback
+        elif e_traceback is None and (e_type or e_message):
+            e_traceback = traceback.format_exc()
+
+        self.level = level
+        self.subject = subject
+        self.description = description
+        self._base_context = base_context
+        self._context = context
+        self._systems_impacted = systems_impacted if systems_impacted else []
+        self.collector_id = collector_id
+        self.exception_type = e_type
+        self.exception_message = e_message
+        self.exception_traceback = e_traceback
+        self.log_status = log_status
+        self.timestamp = datetime.now(timezone.utc).isoformat()
+
+    @property
+    def base_context(self):
+        return self._base_context
+
+    @base_context.setter
+    def base_context(self, value):
+        self._base_context = value
+
+    @property
+    def context(self):
+        return self._context
+
+    @context.setter
+    def context(self, value):
+        self._context = value
+
+    @property
+    def systems_impacted(self):
+        return self._systems_impacted
+
+    @systems_impacted.setter
+    def systems_impacted(self, list_of_si: List[str]):
+        self._systems_impacted = list_of_si
+
+    def add_system_impacted(self, system_impacted: str):
+        if self._systems_impacted is None:
+            self._systems_impacted = []
+        self._systems_impacted.append(system_impacted)
+
+    def remove_system_impacted(self, system_impacted: str):
+        if self._systems_impacted is not None:
+            self._systems_impacted.remove(system_impacted)
+
+    def clear_systems_impacted(self):
+        self._systems_impacted = []
+
+    def _format_traceback(self, e_traceback, e_message, max_field_len:int, max_traceback_lines:int):
+        if not e_traceback or e_traceback == 'None\n':
+            return None
+
+        traceback_lines = e_traceback.splitlines()
+
+        # Check if the traceback is within the limits
+        if len(traceback_lines) <= max_traceback_lines and len(e_traceback) <= max_field_len:
+            return e_traceback
+
+        # Remove lines that are part of the exception message if they are present in traceback
+        message_lines = e_message.splitlines() if e_message else []
+        if message_lines:
+            for message_line in message_lines:
+                if message_line in traceback_lines:
+                    traceback_lines.remove(message_line)
+
+        # Filter out lines from third-party libraries (like site-packages)
+        filtered_lines = [line for line in traceback_lines if "site-packages" not in line]
+
+        # If filtering results in too few lines, revert to original traceback
+        if len(filtered_lines) < 2:
+            filtered_lines = traceback_lines
+
+        # Combine standalone bracket lines with previous or next lines
+        combined_lines = []
+        for line in filtered_lines:
+            if line.strip() in {"(", ")", "{", "}", "[", "]"} and combined_lines:
+                combined_lines[-1] += " " + line.strip()
+            else:
+                combined_lines.append(line)
+
+        # Ensure the number of lines doesn't exceed MAX_TRACEBACK_LINES
+        if len(combined_lines) > max_traceback_lines:
+            keep_lines_start = min(max_traceback_lines // 2, len(combined_lines))
+            keep_lines_end = min(max_traceback_lines // 2, len(combined_lines) - keep_lines_start)
+            combined_lines = (
+                combined_lines[:keep_lines_start] +
+                ['... (truncated) ...'] +
+                combined_lines[-keep_lines_end:]
+            )
+
+        formatted_traceback = '\n'.join(combined_lines)
+
+        # Ensure the total length doesn't exceed MAX_TRACEBACK_LENGTH
+        if len(formatted_traceback) > max_field_len:
+            truncated_length = max_field_len - len('... (truncated) ...')
+            half_truncated_length = truncated_length // 2
+            formatted_traceback = (
+                formatted_traceback[:half_truncated_length] +
+                '\n... (truncated) ...\n' +
+                formatted_traceback[-half_truncated_length:]
+            )
+        return formatted_traceback
+
+    def to_dict(self, max_field_len:int =10000, size_limit:float=256 * 1024 * 0.80,max_traceback_lines:int = 30):
+        size_limit = int(size_limit) # Ensure size_limit is an integer
+
+        # Unified list of all fields
+        systems_impacted_str = f"{len(self.systems_impacted)} system(s): " + " ,,, ".join(self.systems_impacted) if self.systems_impacted else None
+        fields = [
+            ("log_status", str(self.log_status.name)),
+            ("level_code", self.level.value),
+            ("level_name", str(self.level.name)),
+            ("base_context", str(self.base_context)),
+            ("timestamp", str(self.timestamp)),
+            ("collector_id", str(self.collector_id)),
+            ("systems_impacted", systems_impacted_str),
+            ("context", str(self.context)), # special sizing rules apply to it
+            ("subject", str(self.subject)),
+            ("description", str(self.description)),
+            ("exception_type", str(self.exception_type)),
+            ("exception_message", str(self.exception_message)),
+            ("exception_traceback", str(self._format_traceback(self.exception_traceback,self.exception_message, max_field_len, max_traceback_lines)))
+        ]
+
+        # Function to calculate the byte size of a JSON-encoded field
+        def field_size(key, value):
+            return len(json.dumps({key: value}).encode('utf-8'))
+
+        # Function to truncate a value based on its type
+        # Function to truncate a value based on its type
+        def truncate_value(value, max_size):
+            if isinstance(value, str):
+                half_size = max_size // 2
+                return value[:half_size] + '...' + value[-(max_size - half_size - 3):]
+            return value
+
+        # Ensure no field exceeds max_field_len
+        for i, (key, value) in enumerate(fields):
+            if isinstance(value, str) and len(value) > max_field_len:
+                fields[i] = (key, truncate_value(value, max_field_len))
+
+        # Ensure total size of the dict doesn't exceed size_limit
+        total_size = sum(field_size(key, value) for key, value in fields)
+        log_dict = {}
+        truncated = False
+
+        if total_size > size_limit:
+            truncated = True
+            remaining_size = size_limit
+            remaining_fields = len(fields)
+
+            for key, value in fields:
+                if remaining_fields > 0:
+                    max_size_per_field = remaining_size // remaining_fields
+                else:
+                    max_size_per_field = 0
+
+                field_sz = field_size(key, value)
+                if field_sz > max_size_per_field:
+                    value = truncate_value(value, max_size_per_field)
+                    field_sz = field_size(key, value)
+
+                log_dict[key] = value
+                remaining_size -= field_sz
+                remaining_fields -= 1
+        else:
+            log_dict = dict(fields)
+
+        log_dict['trunc'] = truncated
+
+        return log_dict
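For orientation, a minimal usage sketch (not from the package): constructing the new ContextLog around a caught exception and serialising it, using only the constructor arguments and to_dict() defined above.

# Sketch only: arguments and methods as defined in the new module above.
from ipulse_shared_core_ftredge.utils_custom_logs import ContextLog
from ipulse_shared_core_ftredge.enums.enums_common_utils import LogLevel

try:
    raise ValueError("sample failure")
except ValueError as err:
    log = ContextLog(level=LogLevel.ERROR_EXCEPTION,
                     base_context="demo_pipeline", context="load_step",
                     description="illustrative only", e=err)
    log.add_system_impacted("gcs_bucket_file")
    record = log.to_dict()   # default size_limit is 256 * 1024 * 0.80 bytes, per the signature above
    print(record["level_name"], record["trunc"])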
ipulse_shared_core_ftredge/utils_gcp.py
@@ -1,6 +1,9 @@
 # pylint: disable=missing-module-docstring
 # pylint: disable=missing-function-docstring
 # pylint: disable=missing-class-docstring
+# pylint: disable=broad-exception-caught
+# pylint: disable=line-too-long
+# pylint: disable=unused-variable
 import json
 import csv
 from io import StringIO
@@ -11,7 +14,6 @@ import traceback
 from google.cloud import error_reporting, logging as cloud_logging
 from google.api_core.exceptions import NotFound
 
-
 ############################################################################
 ##################### SETTING UP LOGGER ##########################
 
@@ -23,7 +25,6 @@ from google.api_core.exceptions import NotFound
 
 
 ##### THIS APPROACH IS USED NOW ########
-## TODO Fix the issue with POST 0B Nan.... printed in Cloud Logging , which is referring to posting to Cloud Logging probably.
 ENV = os.getenv('ENV', 'LOCAL').strip("'")
 
 def setup_gcp_logger_and_error_report(logger_name,level=logging.INFO, use_cloud_logging=True):
@@ -122,19 +123,22 @@ def read_csv_from_gcs(bucket_name, file_name, storage_client, logger):
 
 
 
-def write_json_to_gcs(
-
-
+def write_json_to_gcs( storage_client, data, bucket_name, file_name,
+                       file_exists_if_starts_with_prefix=None, overwrite_if_exists=False, increment_if_exists=False,
+                       save_locally=False, local_path=None, max_retries=2, max_deletable_files=1, logger=None):
     """Saves data to Google Cloud Storage and optionally locally.
-
-    This function attempts to upload data to GCS.
-    retries and `save_locally` is True or `local_path` is provided, it attempts
-
-
-
-
-
-
+
+    This function attempts to upload data to GCS.
+    - If the upload fails after retries and `save_locally` is True or `local_path` is provided, it attempts to save the data locally.
+    - It handles file name conflicts based on these rules:
+        - If `file_exists_if_contains_substr` is provided, ANY existing file containing the substring is deleted, and the new file is saved with the provided `file_name`.
+        - If `overwrite_if_exists` is True:
+            - If `file_exists_if_contains_substr` is None, and a file with the exact `file_name` exists, it's overwritten.
+        - If `increment_if_exists` is True:
+            - If `file_exists_if_contains_substr` is provided, a new file with an incremented version is created ONLY if a file with the EXACT `file_name` exists.
+            - If `file_exists_if_contains_substr` is None, a new file with an incremented version is created if a file with the exact `file_name` exists.
+
+    -If both overwrite_if_exists and increment_if_exists are provided as Ture, an exception will be raised.
     """
 
     def log_message(message):
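For orientation, a usage sketch (not from the package) of the two conflict-handling modes the docstring above describes; `client`, `records` and the names are placeholders.

# Sketch only: keyword names come from the new signature above; values are placeholders.
# Overwrite mode: existing blobs whose names start with the prefix are deleted first.
write_json_to_gcs(storage_client=client, data=records,
                  bucket_name="my-bucket", file_name="prices_2024.json",
                  file_exists_if_starts_with_prefix="prices_", overwrite_if_exists=True)

# Increment mode: if prices_2024.json already exists, prices_2024_v1.json is written instead.
write_json_to_gcs(storage_client=client, data=records,
                  bucket_name="my-bucket", file_name="prices_2024.json",
                  increment_if_exists=True)

# Setting both overwrite_if_exists and increment_if_exists to True raises a ValueError.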
@@ -149,101 +153,141 @@ def write_json_to_gcs(bucket_name, storage_client, data, file_name,
         if logger:
             logger.warning(message)
 
-
-    success = False
-    gcs_path = None
-    local_path_final = None
-    gcs_file_overwritten = False
-    gcs_file_already_exists = False
-    gcs_file_saved_with_increment = False
-    gcs_upload_exception = None # Store potential GCS exception
-
-    # Check for conflicting options
+    # Input validation
     if overwrite_if_exists and increment_if_exists:
-        raise ValueError("
+        raise ValueError("Both 'overwrite_if_exists' and 'increment_if_exists' cannot be True simultaneously.")
+    if not isinstance(data, (list, dict, str)):
+        raise ValueError("Unsupported data type. Data must be a list, dict, or str.")
+    if max_deletable_files > 10:
+        raise ValueError("max_deletable_files should be less than 10 for safety. For more use another method.")
 
+    # Prepare data
     if isinstance(data, (list, dict)):
         data_str = json.dumps(data, indent=2)
-    elif isinstance(data, str):
-        data_str = data
     else:
-
+        data_str = data
 
     bucket = storage_client.bucket(bucket_name)
     base_file_name, ext = os.path.splitext(file_name)
     increment = 0
+    attempts = 0
+    success = False
 
-
-
-
-
-
-
-
-
-
+    # GCS-related metadata
+    gcs_path = None
+    gcs_file_overwritten = False
+    gcs_file_already_exists = False
+    gcs_file_saved_with_increment = False
+    gcs_file_exists_checked_on_name = file_name
+    gcs_deleted_files=[]
+
+    # GCS upload exception
+    gcs_upload_exception = None
+
+    # Local file path
+    local_path_final = None
+
+    try:
+        # --- Overwrite Logic ---
+        if overwrite_if_exists:
+            if file_exists_if_starts_with_prefix:
+                gcs_file_exists_checked_on_name = file_exists_if_starts_with_prefix
+                blobs_to_delete = list(bucket.list_blobs(prefix=file_exists_if_starts_with_prefix))
+                if len(blobs_to_delete) > max_deletable_files:
+                    raise Exception(f"Error: Attempt to delete {len(blobs_to_delete)} matched files, but limit is {max_deletable_files}.")
+                if blobs_to_delete:
+                    log_message(f"Deleting files containing '{file_exists_if_starts_with_prefix}' for overwrite.")
+                    for blob in blobs_to_delete:
+                        blob.delete()
+                        gcs_deleted_files.append(blob.name)
+                        log_message(f"Deleted: gs://{bucket_name}/{blob.name}")
+                    gcs_file_overwritten = True
             else:
                 blob = bucket.blob(file_name)
-
-                # Check if the file exists
                 if blob.exists():
                     gcs_file_already_exists = True
                     gcs_path = f"gs://{bucket_name}/{file_name}"
-                    log_message(f"File {file_name} already exists
-
-
-
+                    log_message(f"File '{file_name}' already exists. Overwriting.")
+                    blob.delete() # Delete the existing blob
+                    gcs_deleted_files.append(blob.name)
+                    gcs_file_overwritten = True
+
+        # --- Increment Logic ---
+        elif increment_if_exists:
+            gcs_file_exists_checked_on_name = file_name # We only increment if the exact name exists
+            while bucket.blob(file_name).exists():
+                gcs_file_already_exists = True
+                increment += 1
+                file_name = f"{base_file_name}_v{increment}{ext}"
+                gcs_file_saved_with_increment = True
+                log_warning(f"File already exists. Using incremented name: {file_name}")
+
+        # --- GCS Upload ---
+        if overwrite_if_exists or increment_if_exists: # Only upload if either overwrite or increment is True
+            while attempts < max_retries and not success:
+                try:
+                    blob = bucket.blob(file_name) # Use the potentially updated file_name
+                    blob.upload_from_string(data_str, content_type='application/json')
+                    gcs_path = f"gs://{bucket_name}/{file_name}"
+                    log_message(f"Successfully saved file to GCS: {gcs_path}")
+                    success = True
+                except Exception as e:
+                    gcs_upload_exception=e
+                    attempts += 1
+                    if attempts < max_retries:
+                        log_warning(f"Attempt {attempts} to upload to GCS failed. Retrying...")
+                        time.sleep(2 ** attempts)
                     else:
-
-
-
-
-
-
-        except Exception as e:
-            gcs_upload_exception = e
-            attempts += 1
-            if attempts < max_retries:
-                time.sleep(2 ** attempts)
-            else:
-                log_error(f"Failed to write {file_name} to GCS bucket {bucket_name} after {max_retries} attempts: {e}")
+                        log_error(f"Failed to write '{file_name}' to GCS bucket '{bucket_name}' after {max_retries} attempts: {e}", exc_info=True)
+                        if save_locally or local_path:
+                            log_message(f"Attempting to save '{file_name}' locally due to GCS upload failure.")
+    except Exception as e:
+        log_error(f"Error during GCS operations: {e}", exc_info=True)
+        gcs_upload_exception = e
 
+    # --- Save Locally ---
+    write_out=False
     if not success or save_locally or local_path:
         try:
-            if
-
-
-            local_path_final = os.path.join(local_path, file_name)
-
+            local_path=local_path if local_path else "/tmp"
+            local_path_final = os.path.join(local_path, file_name)
+
             if os.path.exists(local_path_final):
                 if increment_if_exists:
                     increment = 0
                     while os.path.exists(local_path_final):
                         increment += 1
-                        local_path_final = os.path.join(local_path, f"{base_file_name}
-
-
-
-
-                else:
+                        local_path_final = os.path.join(local_path, f"{base_file_name}_v{increment}{ext}")
+                    log_warning(f"Local file already exists. Using incremented name: {local_path_final}")
+                    write_out=True
+                elif overwrite_if_exists:
+                    write_out=True
                     log_message(f"File {file_name} already exists locally at {local_path_final}. Overwriting: {overwrite_if_exists}")
+                else:
+                    log_message(f"File {file_name} already exists locally at {local_path_final} and overwrite is set to False. Skipping save.")
+                    write_out=False
+            else:
+                write_out=True
 
-            if
+            if write_out:
                 with open(local_path_final, 'w', encoding='utf-8') as f:
                     f.write(data_str)
-
-
+                log_message(f"Saved {file_name} locally at {local_path_final}. Overwritten: {overwrite_if_exists}")
+
         except Exception as local_e:
             log_error(f"Failed to write {file_name} locally: {local_e}", exc_info=True)
 
     if gcs_upload_exception is not None:
         raise gcs_upload_exception # Propagate without nesting
 
+    # --- Return Metadata ---
     return {
-        "gcs_path": gcs_path,
-        "local_path": local_path_final,
+        "gcs_path": gcs_path if success else None, # Only set gcs_path if upload succeeded
+        "local_path": local_path_final if write_out else None, # Only set local_path if saved locally
         "gcs_file_already_exists": gcs_file_already_exists,
+        "gcs_file_exists_checked_on_name":gcs_file_exists_checked_on_name ,
         "gcs_file_overwritten": gcs_file_overwritten,
+        "gcs_deleted_file_names": ",,,".join(gcs_deleted_files) if gcs_deleted_files else None,
         "gcs_file_saved_with_increment": gcs_file_saved_with_increment
     }
 
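For orientation, a sketch (not from the package) of consuming the metadata dict returned above; the key names come from the return block, everything else is a placeholder.

# Sketch only: dict keys come from the return statement above; client/records/logger are placeholders.
try:
    result = write_json_to_gcs(storage_client=client, data=records,
                               bucket_name="my-bucket", file_name="prices_2024.json",
                               overwrite_if_exists=True,
                               save_locally=True, local_path="/tmp", logger=logger)
    if result["gcs_file_overwritten"]:
        logger.info(f"Replaced: {result['gcs_deleted_file_names']}")
    if result["gcs_path"] is None and result["local_path"]:
        logger.warning(f"GCS upload did not succeed; data kept at {result['local_path']}")
except Exception as gcs_err:
    # The stored GCS exception is re-raised after the local fallback attempt.
    logger.error(f"Upload failed after retries: {gcs_err}")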
@@ -267,4 +311,4 @@ def write_csv_to_gcs(bucket_name, file_name, data, storage_client, logger,log_in
     except ValueError as e:
         logger.error(f"ValueError: {e}")
     except Exception as e:
-
+        logger.error(f"An unexpected error occurred while writing CSV to GCS: {e}", exc_info=True)