ipulse-shared-core-ftredge 2.6__py3-none-any.whl → 2.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ipulse-shared-core-ftredge has been flagged as possibly problematic.
Files changed (36)
  1. ipulse_shared_core_ftredge/__init__.py +21 -4
  2. ipulse_shared_core_ftredge/enums/__init__.py +32 -0
  3. ipulse_shared_core_ftredge/enums/enums_cloud.py +17 -0
  4. ipulse_shared_core_ftredge/enums/enums_common_utils.py +98 -0
  5. ipulse_shared_core_ftredge/enums/enums_data_eng.py +109 -0
  6. ipulse_shared_core_ftredge/enums/enums_logs.py +79 -0
  7. ipulse_shared_core_ftredge/enums/enums_module_fincore.py +58 -0
  8. ipulse_shared_core_ftredge/enums/enums_modules.py +25 -0
  9. ipulse_shared_core_ftredge/{models → enums}/pulse_enums.py +10 -46
  10. ipulse_shared_core_ftredge/models/__init__.py +0 -1
  11. ipulse_shared_core_ftredge/models/organisation.py +61 -55
  12. ipulse_shared_core_ftredge/models/resource_catalog_item.py +97 -171
  13. ipulse_shared_core_ftredge/models/user_profile.py +10 -9
  14. ipulse_shared_core_ftredge/models/user_profile_update.py +32 -14
  15. ipulse_shared_core_ftredge/models/user_status.py +21 -11
  16. ipulse_shared_core_ftredge/utils/__init__.py +19 -0
  17. ipulse_shared_core_ftredge/utils/logs/__init__.py +2 -0
  18. ipulse_shared_core_ftredge/{models → utils/logs}/audit_log_firestore.py +1 -1
  19. ipulse_shared_core_ftredge/utils/logs/context_log.py +211 -0
  20. ipulse_shared_core_ftredge/utils/logs/get_logger.py +76 -0
  21. ipulse_shared_core_ftredge/utils/utils_cloud.py +44 -0
  22. ipulse_shared_core_ftredge/utils/utils_cloud_gcp.py +311 -0
  23. ipulse_shared_core_ftredge/utils/utils_cloud_gcp_with_collectors.py +169 -0
  24. ipulse_shared_core_ftredge/utils/utils_cloud_with_collectors.py +26 -0
  25. ipulse_shared_core_ftredge/utils/utils_collector_pipelinemon.py +356 -0
  26. ipulse_shared_core_ftredge/utils/utils_common.py +145 -0
  27. ipulse_shared_core_ftredge/utils/utils_templates_and_schemas.py +151 -0
  28. ipulse_shared_core_ftredge-2.6.1.dist-info/METADATA +14 -0
  29. ipulse_shared_core_ftredge-2.6.1.dist-info/RECORD +33 -0
  30. {ipulse_shared_core_ftredge-2.6.dist-info → ipulse_shared_core_ftredge-2.6.1.dist-info}/WHEEL +1 -1
  31. ipulse_shared_core_ftredge/tests/__init__.py +0 -0
  32. ipulse_shared_core_ftredge/tests/test.py +0 -17
  33. ipulse_shared_core_ftredge-2.6.dist-info/METADATA +0 -11
  34. ipulse_shared_core_ftredge-2.6.dist-info/RECORD +0 -17
  35. {ipulse_shared_core_ftredge-2.6.dist-info → ipulse_shared_core_ftredge-2.6.1.dist-info}/LICENCE +0 -0
  36. {ipulse_shared_core_ftredge-2.6.dist-info → ipulse_shared_core_ftredge-2.6.1.dist-info}/top_level.txt +0 -0
ipulse_shared_core_ftredge/utils/logs/context_log.py
@@ -0,0 +1,211 @@
+
+ # pylint: disable=missing-module-docstring
+ # pylint: disable=missing-function-docstring
+ # pylint: disable=logging-fstring-interpolation
+ # pylint: disable=line-too-long
+ # pylint: disable=missing-class-docstring
+ # pylint: disable=broad-exception-caught
+ # pylint: disable=unused-variable
+ import traceback
+ import json
+ from datetime import datetime, timezone
+ from typing import List
+ from ipulse_shared_core_ftredge.enums.enums_common_utils import Status
+ from ipulse_shared_core_ftredge.enums.enums_logs import LogLevel
+
+ ############################################################################
+ ##################### SETTING UP custom LOGGING format = DICT #############
+ ### Cloud agnostic: can be used with any cloud provider; just use the to_dict() method to get the log as a dict
+ class ContextLog:
+
+     def __init__(self, level: LogLevel, base_context: str = None, collector_id: str = None,
+                  context: str = None, description: str = None,
+                  e: Exception = None, e_type: str = None, e_message: str = None, e_traceback: str = None,
+                  log_status: Status = Status.OPEN, subject: str = None, systems_impacted: List[str] = None,
+                  ):
+
+         if e is not None:
+             e_type = type(e).__name__ if e_type is None else e_type
+             e_message = str(e) if e_message is None else e_message
+             e_traceback = traceback.format_exc() if e_traceback is None else e_traceback
+         elif e_traceback is None and (e_type or e_message):
+             e_traceback = traceback.format_exc()
+
+         self.level = level
+         self.subject = subject
+         self.description = description
+         self._base_context = base_context
+         self._context = context
+         self._systems_impacted = systems_impacted if systems_impacted else []
+         self.collector_id = collector_id
+         self.exception_type = e_type
+         self.exception_message = e_message
+         self.exception_traceback = e_traceback
+         self.log_status = log_status
+         self.timestamp = datetime.now(timezone.utc).isoformat()
+
+     @property
+     def base_context(self):
+         return self._base_context
+
+     @base_context.setter
+     def base_context(self, value):
+         self._base_context = value
+
+     @property
+     def context(self):
+         return self._context
+
+     @context.setter
+     def context(self, value):
+         self._context = value
+
+     @property
+     def systems_impacted(self):
+         return self._systems_impacted
+
+     @systems_impacted.setter
+     def systems_impacted(self, list_of_si: List[str]):
+         self._systems_impacted = list_of_si
+
+     def add_system_impacted(self, system_impacted: str):
+         if self._systems_impacted is None:
+             self._systems_impacted = []
+         self._systems_impacted.append(system_impacted)
+
+     def remove_system_impacted(self, system_impacted: str):
+         if self._systems_impacted is not None:
+             self._systems_impacted.remove(system_impacted)
+
+     def clear_systems_impacted(self):
+         self._systems_impacted = []
+
+     def _format_traceback(self, e_traceback, e_message, max_field_len: int, max_traceback_lines: int):
+         if not e_traceback or e_traceback == 'None\n':
+             return None
+
+         traceback_lines = e_traceback.splitlines()
+
+         # Check if the traceback is within the limits
+         if len(traceback_lines) <= max_traceback_lines and len(e_traceback) <= max_field_len:
+             return e_traceback
+
+         # Remove lines that are part of the exception message if they are present in traceback
+         message_lines = e_message.splitlines() if e_message else []
+         if message_lines:
+             for message_line in message_lines:
+                 if message_line in traceback_lines:
+                     traceback_lines.remove(message_line)
+
+         # Filter out lines from third-party libraries (like site-packages)
+         filtered_lines = [line for line in traceback_lines if "site-packages" not in line]
+
+         # If filtering results in too few lines, revert to original traceback
+         if len(filtered_lines) < 2:
+             filtered_lines = traceback_lines
+
+         # Combine standalone bracket lines with previous or next lines
+         combined_lines = []
+         for line in filtered_lines:
+             if line.strip() in {"(", ")", "{", "}", "[", "]"} and combined_lines:
+                 combined_lines[-1] += " " + line.strip()
+             else:
+                 combined_lines.append(line)
+
+         # Ensure the number of lines doesn't exceed max_traceback_lines
+         if len(combined_lines) > max_traceback_lines:
+             keep_lines_start = min(max_traceback_lines // 2, len(combined_lines))
+             keep_lines_end = min(max_traceback_lines // 2, len(combined_lines) - keep_lines_start)
+             combined_lines = (
+                 combined_lines[:keep_lines_start] +
+                 ['... (truncated) ...'] +
+                 combined_lines[-keep_lines_end:]
+             )
+
+         formatted_traceback = '\n'.join(combined_lines)
+
+         # Ensure the total length doesn't exceed max_field_len
+         if len(formatted_traceback) > max_field_len:
+             truncated_length = max_field_len - len('... (truncated) ...')
+             half_truncated_length = truncated_length // 2
+             formatted_traceback = (
+                 formatted_traceback[:half_truncated_length] +
+                 '\n... (truncated) ...\n' +
+                 formatted_traceback[-half_truncated_length:]
+             )
+         return formatted_traceback
+
+     def to_dict(self, max_field_len: int = 10000, size_limit: float = 256 * 1024 * 0.80, max_traceback_lines: int = 30):
+         size_limit = int(size_limit)  # Ensure size_limit is an integer
+
+         # Unified list of all fields
+         systems_impacted_str = f"{len(self.systems_impacted)} system(s): " + " ,,, ".join(self.systems_impacted) if self.systems_impacted else None
+         fields = [
+             ("log_status", str(self.log_status.name)),
+             ("level_code", self.level.value),
+             ("level_name", str(self.level.name)),
+             ("base_context", str(self.base_context)),
+             ("timestamp", str(self.timestamp)),
+             ("collector_id", str(self.collector_id)),
+             ("systems_impacted", systems_impacted_str),
+             ("context", str(self.context)),  # special sizing rules apply to it
+             ("subject", str(self.subject)),
+             ("description", str(self.description)),
+             ("exception_type", str(self.exception_type)),
+             ("exception_message", str(self.exception_message)),
+             ("exception_traceback", str(self._format_traceback(self.exception_traceback, self.exception_message, max_field_len, max_traceback_lines)))
+         ]
+
+         # Function to calculate the byte size of a JSON-encoded field
+         def field_size(key, value):
+             return len(json.dumps({key: value}).encode('utf-8'))
+
+         # Function to truncate a value based on its type
+         # Function to truncate a value based on its type
+         def truncate_value(value, max_size):
+             if isinstance(value, str):
+                 half_size = max_size // 2
+                 return value[:half_size] + '...' + value[-(max_size - half_size - 3):]
+             return value
+
+         # Ensure no field exceeds max_field_len
+         for i, (key, value) in enumerate(fields):
+             if isinstance(value, str) and len(value) > max_field_len:
+                 fields[i] = (key, truncate_value(value, max_field_len))
+
+         # Ensure total size of the dict doesn't exceed size_limit
+         total_size = sum(field_size(key, value) for key, value in fields)
+         log_dict = {}
+         truncated = False
+
+         if total_size > size_limit:
+             truncated = True
+             remaining_size = size_limit
+             remaining_fields = len(fields)
+
+             for key, value in fields:
+                 if remaining_fields > 0:
+                     max_size_per_field = remaining_size // remaining_fields
+                 else:
+                     max_size_per_field = 0
+
+                 field_sz = field_size(key, value)
+                 if field_sz > max_size_per_field:
+                     value = truncate_value(value, max_size_per_field)
+                     field_sz = field_size(key, value)
+
+                 log_dict[key] = value
+                 remaining_size -= field_sz
+                 remaining_fields -= 1
+         else:
+             log_dict = dict(fields)
+
+         log_dict['trunc'] = truncated
+
+         return log_dict
+
+     def __str__(self):
+         return json.dumps(self.to_dict(), indent=4)
+
+     def __repr__(self):
+         return self.__str__()
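
Usage sketch (not part of the package diff): a minimal example of how the new ContextLog class might be used, based only on the constructor, add_system_impacted() and to_dict() shown above. The LogLevel.ERROR member and the import paths are assumptions, not confirmed by this diff.

# Hypothetical usage sketch; assumes LogLevel exposes an ERROR member
# and that ContextLog is importable from the module path added in this release.
from ipulse_shared_core_ftredge.enums.enums_logs import LogLevel
from ipulse_shared_core_ftredge.utils.logs.context_log import ContextLog

try:
    1 / 0
except Exception as exc:
    log = ContextLog(
        level=LogLevel.ERROR,            # assumed member name
        base_context="daily-ingest",
        context="load_prices step",
        description="Division failed while normalising prices",
        e=exc,                           # type, message and traceback are captured automatically
    )
    log.add_system_impacted("bigquery: prices_table")
    print(log.to_dict())                 # dict capped to ~80% of 256 KiB by default
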
ipulse_shared_core_ftredge/utils/logs/get_logger.py
@@ -0,0 +1,76 @@
+ # pylint: disable=missing-module-docstring
+ # pylint: disable=missing-function-docstring
+ # pylint: disable=missing-class-docstring
+ # pylint: disable=broad-exception-caught
+ # pylint: disable=line-too-long
+ # pylint: disable=unused-variable
+ # pylint: disable=broad-exception-raised
+ import logging
+ import os
+ import json
+ import traceback
+ from ipulse_shared_core_ftredge.enums.enums_cloud import CloudProvider
+ from ipulse_shared_core_ftredge.utils.utils_cloud_gcp import setup_gcp_logging
+
+ ###################################################################################################
+ ##################################################################################################
+ ##################################### SETTING UP LOGGER ##########################################
+
+ class CloudLogFormatter(logging.Formatter):
+     """Formats log records as structured JSON."""
+
+     def format(self, record):
+         log_entry = {
+             'timestamp': self.formatTime(record, self.datefmt),
+             'name': record.name,
+             'severity': record.levelname,
+             'message': record.msg,
+             'pathname': record.pathname,
+             'lineno': record.lineno,
+         }
+         if record.exc_info:
+             log_entry['exception_traceback'] = ''.join(traceback.format_exception(*record.exc_info))
+         if isinstance(record.msg, dict):
+             log_entry.update(record.msg)
+         return json.dumps(log_entry)
+
+
+ class LocalLogFormatter(logging.Formatter):
+     """Formats log records for local output to the console."""
+
+     def format(self, record):
+         path_parts = record.pathname.split(os.sep)
+
+         # Get the last two parts of the path if they exist
+         if len(path_parts) >= 2:
+             short_path = os.path.join(path_parts[-2], path_parts[-1])
+         else:
+             short_path = record.pathname
+
+         log_message = f"{record.levelname} ::: {record.name} ::: {short_path} ::: lineno: {record.lineno} ::: {self.formatTime(record, self.datefmt)} ::: message: {record.msg}"
+         if record.exc_info:
+             log_message += "\n" + ''.join(
+                 traceback.format_exception(*record.exc_info)
+             )
+         return log_message
+
+
+ def get_logger(logger_name: str, level=logging.INFO, enable_local_streamer=False, cloud_provider: CloudProvider = CloudProvider.NO_CLOUD, enable_error_reporting=True):
+
+     logger = logging.getLogger(logger_name)
+     logger.setLevel(level)
+     cloud_formatter = CloudLogFormatter()
+
+     without_cloud_logging_handler = [CloudProvider.NO_CLOUD, CloudProvider.CLOUD_AGNOSTIC, CloudProvider.UNKNWON, CloudProvider.OTHER]
+
+     if cloud_provider in without_cloud_logging_handler or enable_local_streamer:
+         handler = logging.StreamHandler()
+         handler.setFormatter(LocalLogFormatter())
+         logger.addHandler(handler)
+
+     if cloud_provider == CloudProvider.GCP:
+         setup_gcp_logging(logger=logger, formatter=cloud_formatter, enable_error_reporting=enable_error_reporting)
+     elif cloud_provider not in without_cloud_logging_handler:
+         raise ValueError(f"Unsupported cloud provider: {cloud_provider}. Supported cloud providers: {CloudProvider.GCP.value}")
+
+     return logger
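
Usage sketch (not part of the package diff): how get_logger() might be called, per the signature above. With CloudProvider.NO_CLOUD only the local StreamHandler with LocalLogFormatter is attached, so no GCP clients are created; the logger name and import path are illustrative.

# Hypothetical usage sketch; import path assumed from the module location in this release.
import logging
from ipulse_shared_core_ftredge.enums.enums_cloud import CloudProvider
from ipulse_shared_core_ftredge.utils.logs.get_logger import get_logger

# NO_CLOUD keeps everything local: one StreamHandler, no google-cloud clients.
logger = get_logger("pricing-pipeline", level=logging.DEBUG,
                    cloud_provider=CloudProvider.NO_CLOUD)
logger.info("pipeline started")

# On GCP the same call would also wire the Cloud Logging handler and,
# unless enable_error_reporting=False, the Error Reporting handler:
# logger = get_logger("pricing-pipeline", cloud_provider=CloudProvider.GCP)
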
ipulse_shared_core_ftredge/utils/utils_cloud.py
@@ -0,0 +1,44 @@
+ # pylint: disable=missing-module-docstring
+ # pylint: disable=missing-function-docstring
+ # pylint: disable=missing-class-docstring
+ # pylint: disable=broad-exception-caught
+ # pylint: disable=line-too-long
+ # pylint: disable=unused-variable
+ # pylint: disable=broad-exception-caught
+ from typing import Optional
+ from ipulse_shared_core_ftredge.enums.enums_cloud import CloudProvider
+ from .utils_cloud_gcp import (write_json_to_gcs_extended, read_json_from_gcs)
+
+ #######################################################################################################################
+ #######################################################################################################################
+ ################################################# cloud IO functions ########################################
+
+ # Define the central function that routes to the relevant cloud-specific function
+ def write_json_to_cloud_storage_extended(cloud_provider: CloudProvider, storage_client, data: dict | list | str, bucket_name: str, file_name: str,
+                                          file_exists_if_starts_with_prefix: Optional[str] = None, overwrite_if_exists: bool = False, increment_if_exists: bool = False,
+                                          max_retries: int = 2, max_deletable_files: int = 1, logger=None, print_out=False):
+
+
+     if cloud_provider == CloudProvider.GCP:
+         return write_json_to_gcs_extended(
+             storage_client=storage_client,
+             data=data,
+             bucket_name=bucket_name,
+             file_name=file_name,
+             file_exists_if_starts_with_prefix=file_exists_if_starts_with_prefix,
+             overwrite_if_exists=overwrite_if_exists,
+             increment_if_exists=increment_if_exists,
+             max_retries=max_retries,
+             max_deletable_files=max_deletable_files,
+             logger=logger,
+             print_out=print_out
+         )
+
+     raise ValueError(f"Unsupported cloud provider: {cloud_provider}. Supported cloud providers: gcp")
+
+
+ def read_json_from_cloud_storage(cloud_provider: CloudProvider, storage_client, bucket_name, file_name, logger=None, print_out=False):
+     if cloud_provider == CloudProvider.GCP:
+         return read_json_from_gcs(storage_client=storage_client, bucket_name=bucket_name, file_name=file_name, logger=logger, print_out=print_out)
+
+     raise ValueError(f"Unsupported cloud provider: {cloud_provider}. Supported cloud providers: gcp")
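
Usage sketch (not part of the package diff): calling the cloud-agnostic router shown above with GCP as the provider. Bucket and file names are illustrative, and the call requires valid Google Cloud Storage credentials.

# Hypothetical usage sketch; bucket and object names are placeholders.
from google.cloud.storage import Client as GCSClient
from ipulse_shared_core_ftredge.enums.enums_cloud import CloudProvider
from ipulse_shared_core_ftredge.utils.utils_cloud import write_json_to_cloud_storage_extended

storage_client = GCSClient()
result = write_json_to_cloud_storage_extended(
    cloud_provider=CloudProvider.GCP,
    storage_client=storage_client,
    data={"symbol": "ABC", "close": 101.5},
    bucket_name="my-bucket",                   # illustrative
    file_name="prices/abc_2024-01-01.json",    # illustrative
    increment_if_exists=True,                  # writes ..._v1.json, _v2.json, ... on name conflict
    print_out=True,
)
print(result["cloud_storage_path"])
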
ipulse_shared_core_ftredge/utils/utils_cloud_gcp.py
@@ -0,0 +1,311 @@
+ # pylint: disable=missing-module-docstring
+ # pylint: disable=missing-function-docstring
+ # pylint: disable=missing-class-docstring
+ # pylint: disable=broad-exception-caught
+ # pylint: disable=line-too-long
+ # pylint: disable=unused-variable
+ # pylint: disable=broad-exception-raised
+ import json
+ import csv
+ from io import StringIO
+ import os
+ import time
+ import logging
+ from typing import Optional
+ import traceback
+ from google.api_core.exceptions import NotFound
+ from google.cloud import error_reporting
+ from google.cloud import logging as cloud_logging
+ from google.cloud.storage import Client as GCSClient
+ from google.cloud import bigquery
+
+ ############################################################################
+ ##################### GOOGLE CLOUD UTILS ##################################
+ ############################################################################
+
+ def log_error(message, logger=None, print_out=False, exc_info=False):
+     if logger:
+         logger.error(message, exc_info=exc_info)
+     elif print_out:
+         print(message)
+
+ def log_warning(message, logger=None, print_out=False):
+     if logger:
+         logger.warning(message)
+     elif print_out:
+         print(message)
+
+ def log_info(message, logger=None, print_out=False):
+     if logger:
+         logger.info(message)
+     elif print_out:
+         print(message)
+
+
+
+ ############################################################################
+ ##################### LOGGING and ERROR reporting ##########################
+ #### DEPRECATED: THIS APPROACH WAS GOOD, BUT ERRORS WERE NOT REPORTED TO ERROR REPORTING
+ # logging.basicConfig(level=logging.INFO)
+ # logging_client = google.cloud.logging.Client()
+ # logging_client.setup_logging()
+ ###################################
+ def setup_gcp_logging(logger, formatter, enable_error_reporting=True):
+
+     class CustomGCPErrorReportingHandler(logging.Handler):
+         def __init__(self, level=logging.ERROR):
+             super().__init__(level)
+             self.error_client = error_reporting.Client()
+             self.propagate = True
+
+         def emit(self, record):
+             try:
+                 if record.levelno >= logging.ERROR:
+                     log_struct = {
+                         'message': self.format(record),
+                         'severity': record.levelname,
+                         'pathname': getattr(record, 'pathname', None),
+                         'lineno': getattr(record, 'lineno', None)
+                     }
+                     if record.exc_info:
+                         log_struct['exception'] = ''.join(
+                             traceback.format_exception(*record.exc_info)
+                         )
+                     self.error_client.report(str(log_struct))
+             except Exception as e:
+                 self.handleError(record)
+
+     class CustomGCPLoggingHandler(cloud_logging.handlers.CloudLoggingHandler):
+         """Custom handler for Google Cloud Logging with a dynamic logName."""
+         def __init__(self, client, name, resource=None, labels=None):
+             super().__init__(client=client, name=name, resource=resource, labels=labels)
+
+         def emit(self, record):
+             # 1. Create the basic log entry dictionary
+             log_entry = {
+                 'message': record.msg,
+                 'severity': record.levelname,
+                 'name': record.name,
+                 'pathname': record.filename,
+                 'lineno': record.lineno,
+             }
+             if record.exc_info:
+                 log_entry['exception_traceback'] = ''.join(
+                     traceback.format_exception(*record.exc_info)
+                 )
+
+             # 2. Apply the formatter to the 'message' field if it's a dictionary
+             if isinstance(record.msg, dict):
+                 formatted_message = self.formatter.format(record)
+                 try:
+                     log_entry['message'] = json.loads(formatted_message)
+                 except json.JSONDecodeError:
+                     log_entry['message'] = formatted_message
+             else:
+                 log_entry['message'] = record.msg
+
+             # 3. Set the custom logName
+             log_entry['logName'] = f"projects/{self.client.project}/logs/{record.name}"
+
+             # 4. Send to Google Cloud Logging
+             super().emit(record)
+
+     # Create Google Cloud Logging handler
+     cloud_logging_client = cloud_logging.Client()
+     cloud_logging_handler = CustomGCPLoggingHandler(cloud_logging_client, logger.name)  # No prefix needed
+     cloud_logging_handler.setFormatter(formatter)
+     logger.addHandler(cloud_logging_handler)
+
+     if enable_error_reporting:
+         # Create and add Error Reporting handler
+         error_reporting_handler = CustomGCPErrorReportingHandler()
+         logger.addHandler(error_reporting_handler)
+
+
+
+ def create_bigquery_schema_from_json(json_schema: list) -> list:
+     schema = []
+     for field in json_schema:
+         if "max_length" in field:
+             schema.append(bigquery.SchemaField(field["name"], field["type"], mode=field["mode"], max_length=field["max_length"]))
+         else:
+             schema.append(bigquery.SchemaField(field["name"], field["type"], mode=field["mode"]))
+     return schema
+
+
+ def read_json_from_gcs(storage_client: GCSClient, bucket_name: str, file_name: str, logger=None, print_out=False):
+     """Helper function to read a JSON file from Google Cloud Storage."""
+     try:
+         bucket = storage_client.bucket(bucket_name)
+         blob = bucket.blob(file_name)
+         data_string = blob.download_as_text()
+         data = json.loads(data_string)
+         return data
+     except NotFound:
+         log_error(message=f"Error: The file {file_name} was not found in the bucket {bucket_name}.", logger=logger, print_out=print_out)
+         return None
+     except json.JSONDecodeError:
+         log_error(message=f"Error: The file {file_name} could not be decoded as JSON.", logger=logger, print_out=print_out)
+         return None
+     except Exception as e:
+         log_error(message=f"An unexpected error occurred: {e}", exc_info=True, logger=logger, print_out=print_out)
+         return None
+
+ def read_csv_from_gcs(bucket_name: str, file_name: str, storage_client: GCSClient, logger=None, print_out=False):
+     """Helper function to read a CSV file from Google Cloud Storage."""
+
+     try:
+         bucket = storage_client.bucket(bucket_name)
+         blob = bucket.blob(file_name)
+         data_string = blob.download_as_text()
+         data_file = StringIO(data_string)
+         reader = csv.DictReader(data_file)
+         return list(reader)
+     except NotFound:
+         log_error(message=f"Error: The file {file_name} was not found in the bucket {bucket_name}.", logger=logger, print_out=print_out)
+         return None
+     except csv.Error:
+         log_error(message=f"Error: The file {file_name} could not be read as CSV.", logger=logger, print_out=print_out)
+         return None
+     except Exception as e:
+         log_error(message=f"An unexpected error occurred: {e}", logger=logger, print_out=print_out, exc_info=True)
+         return None
+
+
+
+ def write_json_to_gcs_extended(storage_client: GCSClient, data: dict | list | str, bucket_name: str, file_name: str,
+                                file_exists_if_starts_with_prefix: Optional[str] = None, overwrite_if_exists: bool = False, increment_if_exists: bool = False,
+                                max_retries: int = 2, max_deletable_files: int = 1, logger=None, print_out=False):
+
+     """Saves data to Google Cloud Storage and optionally locally.
+
+     This function attempts to upload data to GCS.
+     - If the upload fails after retries and `save_locally` is True or `local_path` is provided, it attempts to save the data locally.
+     - It handles file name conflicts based on these rules:
+       - If `overwrite_if_exists` is True:
+         - If `file_exists_if_starts_with_prefix` is provided, ANY existing file starting with the prefix is deleted, and the new file is saved with the provided `file_name`.
+         - If `file_exists_if_starts_with_prefix` is None, and a file with the exact `file_name` exists, it's overwritten.
+       - If `increment_if_exists` is True:
+         - If `file_exists_if_starts_with_prefix` is provided, a new file with an incremented version is created ONLY if a file with the EXACT `file_name` exists.
+         - If `file_exists_if_starts_with_prefix` is None, a new file with an incremented version is created if a file with the exact `file_name` exists.
+
+     - If both overwrite_if_exists and increment_if_exists are provided as True, an exception will be raised.
+     """
+     # GCS upload exception
+     # Input validation
+     if overwrite_if_exists and increment_if_exists:
+         raise ValueError("Both 'overwrite_if_exists' and 'increment_if_exists' cannot be True simultaneously.")
+     if not isinstance(data, (list, dict, str)):
+         raise ValueError("Data should be a list, dict, or string.")
+     if max_deletable_files > 10:
+         raise ValueError("max_deletable_files should be less than 10 for safety. For more use another method.")
+
+     # Prepare data
+     if isinstance(data, (list, dict)):
+         data_str = json.dumps(data, indent=2)
+     else:
+         data_str = data
+
+     bucket = storage_client.bucket(bucket_name)
+     base_file_name, ext = os.path.splitext(file_name)
+     increment = 0
+     attempts = 0
+     success = False
+
+     # GCS-related metadata
+     cloud_storage_path = None
+     cloud_storage_file_overwritten = False
+     cloud_storage_file_already_exists = False
+     cloud_storage_file_saved_with_increment = False
+     cloud_storage_file_exists_checked_on_name = file_name
+     cloud_storage_deleted_files = []
+
+
+     upload_allowed = True
+     # --- Overwrite Logic ---
+     if overwrite_if_exists:
+         if file_exists_if_starts_with_prefix:
+             cloud_storage_file_exists_checked_on_name = file_exists_if_starts_with_prefix
+             blobs_to_delete = list(bucket.list_blobs(prefix=file_exists_if_starts_with_prefix))
+             if len(blobs_to_delete) > max_deletable_files:
+                 raise Exception(f"Error: Attempt to delete {len(blobs_to_delete)} matched files, but limit is {max_deletable_files}.")
+             if blobs_to_delete:
+                 for blob in blobs_to_delete:
+                     cloud_storage_path_del = f"gs://{bucket_name}/{blob.name}"
+                     blob.delete()
+                     cloud_storage_deleted_files.append(cloud_storage_path_del)
+                 cloud_storage_file_overwritten = True
+         elif bucket.blob(file_name).exists():
+             cloud_storage_file_already_exists = True
+             cloud_storage_path_del = f"gs://{bucket_name}/{file_name}"
+             bucket.blob(file_name).delete()  # Delete the existing blob
+             cloud_storage_deleted_files.append(cloud_storage_path_del)
+             cloud_storage_file_overwritten = True
+     # --- Increment Logic ---
+     elif increment_if_exists:
+         cloud_storage_file_exists_checked_on_name = file_name  # We only increment if the exact name exists
+         while bucket.blob(file_name).exists():
+             cloud_storage_file_already_exists = True
+             increment += 1
+             file_name = f"{base_file_name}_v{increment}{ext}"
+             cloud_storage_file_saved_with_increment = True
+         if increment > 0:
+             cloud_storage_path = f"gs://{bucket_name}/{file_name}"
+     # --- Check for Conflicts (Including Prefix) ---
+     else:
+         if file_exists_if_starts_with_prefix:
+             blobs_matched = list(bucket.list_blobs(prefix=file_exists_if_starts_with_prefix))
+             cloud_storage_file_exists_checked_on_name = file_exists_if_starts_with_prefix
+             if blobs_matched:
+                 upload_allowed = False
+                 cloud_storage_file_already_exists = True
+         elif bucket.blob(file_name).exists():
+             upload_allowed = False
+             cloud_storage_file_already_exists = True
+
+     # --- GCS Upload ---
+     cloud_storage_path = f"gs://{bucket_name}/{file_name}"
+     if overwrite_if_exists or increment_if_exists or upload_allowed:
+         while attempts < max_retries and not success:
+             try:
+                 blob = bucket.blob(file_name)  # Use the potentially updated file_name
+                 blob.upload_from_string(data_str, content_type='application/json')
+                 success = True
+             except Exception as e:
+                 attempts += 1
+                 if attempts < max_retries:
+                     time.sleep(2 ** attempts)
+                 else:
+                     raise e
+
+     # --- Return Metadata ---
+     return {
+         "cloud_storage_path": cloud_storage_path if (success or not upload_allowed) else None,
+         "cloud_storage_file_already_exists": cloud_storage_file_already_exists,
+         "cloud_storage_file_exists_checked_on_name": cloud_storage_file_exists_checked_on_name,
+         "cloud_storage_file_overwritten": cloud_storage_file_overwritten,
+         "cloud_storage_deleted_file_names": ",,,".join(cloud_storage_deleted_files) if cloud_storage_deleted_files else None,
+         "cloud_storage_file_saved_with_increment": cloud_storage_file_saved_with_increment
+     }
+
+
+ def write_csv_to_gcs(bucket_name: str, file_name: str, data: dict | list | str, storage_client: GCSClient, logger, log_info_verbose=True):
+     """Helper function to write a CSV file to Google Cloud Storage."""
+     try:
+         bucket = storage_client.bucket(bucket_name)
+         blob = bucket.blob(file_name)
+         data_file = StringIO()
+         if data and isinstance(data, list) and isinstance(data[0], dict):
+             fieldnames = data[0].keys()
+             writer = csv.DictWriter(data_file, fieldnames=fieldnames)
+             writer.writeheader()
+             writer.writerows(data)
+         else:
+             raise ValueError("Data should be a list of dictionaries")
+         blob.upload_from_string(data_file.getvalue(), content_type='text/csv')
+         if log_info_verbose:
+             logger.info(f"Successfully wrote CSV to {file_name} in bucket {bucket_name}.")
+     except ValueError as e:
+         logger.error(f"ValueError: {e}")
+     except Exception as e:
+         logger.error(f"An unexpected error occurred while writing CSV to GCS: {e}", exc_info=True)