ml-analytics-tools 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1646 @@
1
+ """
2
+ Google Sheets connector for reading and writing data to Google Sheets.
3
+
4
+ Data Cleaning Features:
5
+ - Automatically handles missing values (NaN, None) by converting to empty strings
6
+ - Replaces infinity values (inf, -inf) with empty strings
7
+ - Normalizes null-like string values ('None', 'none', 'null', 'NULL')
8
+ - Converts object columns to strings to avoid type issues
9
+ - Pads rows with missing columns to match header length (handles trailing empty cells)
10
+ - Truncates rows that are longer than headers
11
+ - All cleaning is applied automatically during read/write operations
12
+ """
13
+
14
+ import io
15
+ import json
16
+ import os
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ import pandas as pd
21
+ from google.oauth2 import service_account
22
+ from googleapiclient.discovery import build
23
+ from googleapiclient.errors import HttpError
24
+
25
+ from .utils import get_credential_value, get_logger, log_and_raise_error
26
+
27
+
28
+ class GSheet:
29
+ """
30
+ A connector class for interacting with Google Sheets API.
31
+
32
+ This class provides methods to read from and write to Google Sheets,
33
+ with support for both service account and OAuth2 authentication.
34
+ """
35
+
36
+ @staticmethod
37
+ def _dataframe_to_values(data: pd.DataFrame, include_headers: bool = True) -> list[list[Any]]:
38
+ """
39
+ Convert a DataFrame into JSON-serializable rows for the Sheets API.
40
+
41
+ Handles Categorical, NaN/None/inf, datetime/Timestamp (including tz-aware),
42
+ Period, and timedelta columns that would otherwise fail json.dumps.
43
+ """
44
+ from pandas.api import types as pdt
45
+
46
+ data_clean = data.copy()
47
+ for col in data_clean.columns:
48
+ series = data_clean[col]
49
+ if isinstance(series.dtype, pd.CategoricalDtype):
50
+ data_clean[col] = series.astype(object)
51
+ continue
52
+ if pdt.is_datetime64_any_dtype(series):
53
+ data_clean[col] = series.dt.strftime("%Y-%m-%d %H:%M:%S")
54
+ elif isinstance(series.dtype, pd.PeriodDtype) or pdt.is_timedelta64_dtype(series):
55
+ data_clean[col] = series.astype(str)
56
+ data_clean = data_clean.fillna("")
57
+ data_clean = data_clean.replace([float("inf"), float("-inf")], "")
58
+ for col in data_clean.columns:
59
+ if data_clean[col].dtype == "object":
60
+ data_clean[col] = data_clean[col].astype(str).replace("nan", "").replace("None", "")
61
+ if include_headers:
62
+ return [data_clean.columns.tolist()] + data_clean.values.tolist()
63
+ return data_clean.values.tolist()
64
+
65
+ @staticmethod
66
+ def _format_sheet_name(sheet_name: str) -> str:
67
+ """
68
+ Format a sheet name for use in A1 notation.
69
+ Adds single quotes around the name if it contains spaces or special characters.
70
+
71
+ Parameters
72
+ ----------
73
+ sheet_name : str
74
+ The sheet name to format.
75
+
76
+ Returns
77
+ -------
78
+ str
79
+ Properly formatted sheet name for A1 notation.
80
+ """
81
+ # If sheet name contains spaces or special characters, wrap in single quotes
82
+ if any(char in sheet_name for char in [" ", "!", "'"]):
83
+ # Escape any single quotes in the sheet name by doubling them
84
+ escaped_name = sheet_name.replace("'", "''")
85
+ return f"'{escaped_name}'"
86
+ return sheet_name
87
+
88
+ @staticmethod
89
+ def _find_google_credentials_file() -> Path | None:
90
+ """
91
+ Search for a Google service account credentials JSON file starting from the project root,
92
+ then current directory and parent directory.
93
+
94
+ Returns
95
+ -------
96
+ Path | None
97
+ Path to the credentials file if found, None otherwise.
98
+ """
99
+ from .utils import find_project_root
100
+
101
+ directories_to_search = []
102
+
103
+ # Try to find project root first
104
+ try:
105
+ project_root = find_project_root()
106
+ directories_to_search.append(project_root)
107
+ except FileNotFoundError:
108
+ pass
109
+
110
+ # Then check current directory and parent directory
111
+ current_dir = Path.cwd()
112
+ directories_to_search.extend([current_dir, current_dir.parent])
113
+
114
+ for search_dir in directories_to_search:
115
+ # Look for JSON files in directory
116
+ for json_file in search_dir.glob("*.json"):
117
+ try:
118
+ with open(json_file) as f:
119
+ data = json.load(f)
120
+ # Check if it's a Google service account file
121
+ if (
122
+ isinstance(data, dict)
123
+ and data.get("type") == "service_account"
124
+ and (
125
+ "googleapis.com" in str(data.get("auth_uri", ""))
126
+ or "googleapis.com" in str(data.get("token_uri", ""))
127
+ or "client_email" in data
128
+ )
129
+ ):
130
+ return json_file
131
+ except (json.JSONDecodeError, Exception):
132
+ continue
133
+
134
+ return None
135
+
136
+ def __init__(
137
+ self,
138
+ credentials_path: str | Path = None,
139
+ credentials_json: dict = None,
140
+ scopes: list[str] = None,
141
+ log_level: str = "INFO",
142
+ scope: str = "ml",
143
+ spreadsheet_id: str = None,
144
+ ):
145
+ """
146
+ Initialize the Google Sheets connector.
147
+
148
+ Parameters
149
+ ----------
150
+ credentials_path : str | Path, optional
151
+ Path to the service account credentials JSON file.
152
+ If not provided, will look for 'gsheet_credentials.json' in the current directory.
153
+ credentials_json : dict, optional
154
+ Service account credentials as a dictionary (alternative to credentials_path).
155
+ scopes : list[str], optional
156
+ Google API scopes to use. Defaults to read/write access to Google Sheets.
157
+ log_level : str, optional
158
+ Logging level. Default is "INFO".
159
+ scope : str, optional
160
+ Scope for mounted secrets (e.g., '/mnt/{scope}/GOOGLE_CREDENTIALS').
161
+ Default is "ml".
162
+ spreadsheet_id : str, optional
163
+ Default spreadsheet ID used by any method that accepts a ``spreadsheet_id``
164
+ argument. A ``spreadsheet_id`` passed to an individual method call always
165
+ takes precedence over this default. When neither is provided, falls back
166
+ to the ``GSHEET_SPREADSHEET_ID`` environment variable if set.
167
+
168
+ Examples
169
+ --------
170
+ >>> # Using credentials file
171
+ >>> gsheet = GSheet(credentials_path="path/to/credentials.json")
172
+ >>>
173
+ >>> # Using credentials dictionary
174
+ >>> creds_dict = json.loads(os.environ['GOOGLE_CREDENTIALS'])
175
+ >>> gsheet = GSheet(credentials_json=creds_dict)
176
+ >>>
177
+ >>> # Auto-load from default location
178
+ >>> gsheet = GSheet() # Looks for gsheet_credentials.json
179
+ >>>
180
+ >>> # Using mounted secrets with a custom scope
181
+ >>> gsheet = GSheet(scope="custom-scope")
182
+ >>>
183
+ >>> # Bind a default spreadsheet ID so later calls can omit it
184
+ >>> gsheet = GSheet(spreadsheet_id="1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms")
185
+ >>> gsheet.read_sheet() # uses the bound ID
186
+ >>> gsheet.write_sheet(df, spreadsheet_id="other-id") # per-call ID overrides
187
+ """
188
+ self._logger = get_logger("GSheet")
189
+ self._logger.setLevel(log_level)
190
+ self._scope = scope
191
+
192
+ if scopes is None:
193
+ self.scopes = [
194
+ "https://www.googleapis.com/auth/spreadsheets",
195
+ "https://www.googleapis.com/auth/drive.file",
196
+ "https://www.googleapis.com/auth/drive",
197
+ ]
198
+ else:
199
+ self.scopes = scopes
200
+
201
+ # Initialize credentials
202
+ self.credentials = self._initialize_credentials(credentials_path, credentials_json)
203
+ self.service_account_email = self.credentials.service_account_email
204
+
205
+ # Build the service
206
+ try:
207
+ self.service = build("sheets", "v4", credentials=self.credentials)
208
+ except Exception as e:
209
+ log_and_raise_error(self._logger, f"Failed to initialize Google Sheets API service: {e}")
210
+
211
+ # Build Drive API service for sharing capabilities
212
+ try:
213
+ self.drive_service = build("drive", "v3", credentials=self.credentials)
214
+ except Exception as e:
215
+ self._logger.warning(f"Failed to initialize Google Drive API service: {e}")
216
+ self.drive_service = None
217
+
218
+ # Single success message after all services initialized
219
+ self._logger.info("Google API services initialized successfully")
220
+
221
+ # Resolve default spreadsheet_id: explicit arg wins, else GSHEET_SPREADSHEET_ID env var
222
+ if spreadsheet_id is None:
223
+ spreadsheet_id = os.environ.get("GSHEET_SPREADSHEET_ID")
224
+ if spreadsheet_id:
225
+ self._logger.debug("Using GSHEET_SPREADSHEET_ID from environment")
226
+ self.spreadsheet_id = spreadsheet_id
227
+
228
+ def _assemble_credentials_from_components(self) -> dict | None:
229
+ """
230
+ Assemble Google service account credentials from individual Vault secrets.
231
+
232
+ Supports credentials stored as separate fields:
233
+ - GOOGLE_PROJECT_ID
234
+ - GOOGLE_API_PKEY_ID
235
+ - GOOGLE_API_PKEY
236
+ - GOOGLE_CLIENT_EMAIL
237
+ - GOOGLE_CLIENT_ID
238
+ - GOOGLE_CERT_URL
239
+
240
+ Returns
241
+ -------
242
+ dict | None
243
+ Service account credentials dictionary if all required fields found, None otherwise.
244
+ """
245
+ try:
246
+ # Fetch all required credential components
247
+ project_id = get_credential_value("GOOGLE_PROJECT_ID", scope=self._scope)
248
+ private_key_id = get_credential_value("GOOGLE_API_PKEY_ID", scope=self._scope)
249
+ private_key = get_credential_value("GOOGLE_API_PKEY", scope=self._scope)
250
+ client_email = get_credential_value("GOOGLE_CLIENT_EMAIL", scope=self._scope)
251
+ client_id = get_credential_value("GOOGLE_CLIENT_ID", scope=self._scope)
252
+ cert_url = get_credential_value("GOOGLE_CERT_URL", scope=self._scope)
253
+
254
+ # Handle escaped newlines in private key (common in Vault)
255
+ if "\\n" in private_key:
256
+ private_key = private_key.replace("\\n", "\n")
257
+
258
+ # Build the service account credentials dictionary
259
+ credentials_dict = {
260
+ "type": "service_account",
261
+ "project_id": project_id,
262
+ "private_key_id": private_key_id,
263
+ "private_key": private_key,
264
+ "client_email": client_email,
265
+ "client_id": client_id,
266
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
267
+ "token_uri": "https://oauth2.googleapis.com/token",
268
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
269
+ "client_x509_cert_url": cert_url,
270
+ }
271
+
272
+ self._logger.debug("Assembled Google credentials from individual Vault secrets")
273
+ return credentials_dict
274
+
275
+ except Exception as e:
276
+ self._logger.debug(f"Could not assemble credentials from individual components: {e}")
277
+ return None
278
+
279
+ def _initialize_credentials(
280
+ self, credentials_path: str | Path = None, credentials_json: dict = None
281
+ ) -> service_account.Credentials:
282
+ """
283
+ Initialize Google API credentials from file or dictionary.
284
+
285
+ Parameters
286
+ ----------
287
+ credentials_path : str | Path, optional
288
+ Path to credentials JSON file.
289
+ credentials_json : dict, optional
290
+ Credentials dictionary.
291
+
292
+ Returns
293
+ -------
294
+ service_account.Credentials
295
+ Google service account credentials.
296
+ """
297
+ # Try to auto-load from default location if no credentials provided
298
+ if credentials_path is None and credentials_json is None:
299
+ # First try to get credentials from environment variable or mounted secret (single JSON)
300
+ try:
301
+ credentials_str = get_credential_value("GOOGLE_CREDENTIALS", scope=self._scope)
302
+ credentials_json = json.loads(credentials_str)
303
+ self._logger.debug("Using GOOGLE_CREDENTIALS from environment or mounted secret")
304
+ except Exception:
305
+ # Try assembling from individual Vault secrets
306
+ credentials_json = self._assemble_credentials_from_components()
307
+
308
+ if credentials_json is None:
309
+ # Fall back to file-based credential resolution
310
+ # First try the default filename
311
+ default_path = Path.cwd() / "gsheet_credentials.json"
312
+ if default_path.exists():
313
+ credentials_path = default_path
314
+ else:
315
+ # Search for any JSON file containing Google credentials
316
+ found_creds = GSheet._find_google_credentials_file()
317
+ if found_creds:
318
+ credentials_path = found_creds
319
+ else:
320
+ self._logger.error(
321
+ "No credentials provided and no Google credentials JSON file found in current directory."
322
+ )
323
+ self._logger.info(
324
+ "Please provide credentials via 'credentials_path' parameter or place a Google service account JSON file in the current directory." # noqa: E501
325
+ )
326
+ log_and_raise_error(
327
+ self._logger,
328
+ "Either 'credentials_path' or 'credentials_json' must be provided, or a Google service account JSON file must exist in the current directory", # noqa: E501
329
+ )
330
+
331
+ try:
332
+ if credentials_path is not None:
333
+ credentials_path = Path(credentials_path)
334
+ if not credentials_path.exists():
335
+ log_and_raise_error(
336
+ self._logger,
337
+ f"Credentials file not found at: {credentials_path}",
338
+ )
339
+ credentials = service_account.Credentials.from_service_account_file(
340
+ str(credentials_path), scopes=self.scopes
341
+ )
342
+ else:
343
+ credentials = service_account.Credentials.from_service_account_info(
344
+ credentials_json, scopes=self.scopes
345
+ )
346
+
347
+ return credentials
348
+
349
+ except Exception as e:
350
+ log_and_raise_error(self._logger, f"Failed to initialize credentials: {e}")
351
+
352
+ def _resolve_spreadsheet_id(self, spreadsheet_id: str | None) -> str | None:
353
+ """Return the per-call spreadsheet_id if provided, else the instance default."""
354
+ return spreadsheet_id if spreadsheet_id is not None else self.spreadsheet_id
355
+
356
+ def read_sheet(
357
+ self,
358
+ spreadsheet_id: str = None,
359
+ range_name: str = None,
360
+ sheet_name: str = None,
361
+ return_as: str = "dataframe",
362
+ ) -> pd.DataFrame | list[list[Any]]:
363
+ """
364
+ Read data from a Google Sheet.
365
+
366
+ Parameters
367
+ ----------
368
+ spreadsheet_id : str, optional
369
+ The ID of the spreadsheet to read from. Falls back to the instance
370
+ default set via ``GSheet(spreadsheet_id=...)`` when omitted.
371
+ range_name : str, optional
372
+ The A1 notation range to read (e.g., 'Sheet1!A1:D10').
373
+ If None and sheet_name is provided, reads entire sheet.
374
+ sheet_name : str, optional
375
+ The name of the sheet to read from. Used if range_name is not provided.
376
+ return_as : str, optional
377
+ Format to return data: 'dataframe' (default) or 'list'.
378
+
379
+ Returns
380
+ -------
381
+ pd.DataFrame | list[list[Any]]
382
+ The data from the sheet as a DataFrame or list of lists.
383
+
384
+ Examples
385
+ --------
386
+ >>> gsheet = GSheet(credentials_path="creds.json")
387
+ >>> df = gsheet.read_sheet("1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms")
388
+ >>> df = gsheet.read_sheet("1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms",
389
+ ... range_name="Sheet1!A1:D10")
390
+ """
391
+ spreadsheet_id = self._resolve_spreadsheet_id(spreadsheet_id)
392
+ if spreadsheet_id is None:
393
+ log_and_raise_error(
394
+ self._logger,
395
+ "No spreadsheet_id provided and no default set on the GSheet instance",
396
+ )
397
+
398
+ # Build the range
399
+ if range_name is None:
400
+ if sheet_name is None:
401
+ range_name = "A:ZZ" # Read all columns
402
+ else:
403
+ formatted_sheet = self._format_sheet_name(sheet_name)
404
+ range_name = f"{formatted_sheet}!A:ZZ"
405
+
406
+ try:
407
+ result = self.service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_name).execute()
408
+
409
+ values = result.get("values", [])
410
+
411
+ if not values:
412
+ self._logger.warning(f"No data found in range: {range_name}")
413
+ return pd.DataFrame() if return_as == "dataframe" else []
414
+
415
+ self._logger.info(f"Successfully read {len(values)} rows from spreadsheet {spreadsheet_id}")
416
+
417
+ if return_as == "dataframe":
418
+ # Use first row as headers
419
+ if len(values) > 0:
420
+ headers = values[0]
421
+ data_rows = values[1:]
422
+
423
+ # Ensure all rows have the same length as headers by padding with empty strings
424
+ # This handles cases where rows have trailing empty cells that Google Sheets API omits
425
+ num_columns = len(headers)
426
+ normalized_rows = []
427
+ for row in data_rows:
428
+ if len(row) < num_columns:
429
+ # Pad row with empty strings to match header length
430
+ row = row + [""] * (num_columns - len(row))
431
+ elif len(row) > num_columns:
432
+ # Truncate row if it's longer than headers (rare but possible)
433
+ row = row[:num_columns]
434
+ normalized_rows.append(row)
435
+
436
+ df = pd.DataFrame(normalized_rows, columns=headers)
437
+ # Handle missing values: replace empty strings and None with empty string
438
+ df = df.fillna("")
439
+ # Replace any remaining None-like values that might come from sheets
440
+ df = df.replace([None, "None", "none", "null", "NULL"], "")
441
+ return df
442
+ return pd.DataFrame()
443
+ else:
444
+ return values
445
+
446
+ except HttpError as e:
447
+ log_and_raise_error(
448
+ self._logger,
449
+ f"HTTP error reading from Google Sheet: {e}",
450
+ )
451
+ except Exception as e:
452
+ log_and_raise_error(
453
+ self._logger,
454
+ f"Error reading from Google Sheet: {e}",
455
+ )
456
+
457
+ def write_sheet(
458
+ self,
459
+ data: pd.DataFrame | list[list[Any]],
460
+ spreadsheet_id: str = None,
461
+ spreadsheet_title: str = None,
462
+ range_name: str = None,
463
+ sheet_name: str = None,
464
+ value_input_option: str = "USER_ENTERED",
465
+ include_headers: bool = True,
466
+ clear_before_write: bool = False,
467
+ share_with: list[str] | str = None,
468
+ role: str = "writer",
469
+ autofit_columns: bool = True,
470
+ column_padding: int = 30,
471
+ ) -> dict | tuple[dict, str]:
472
+ """
473
+ Write data to a Google Sheet. Creates a new spreadsheet if it doesn't exist.
474
+
475
+ Parameters
476
+ ----------
477
+ data : pd.DataFrame | list[list[Any]]
478
+ The data to write. Can be a DataFrame or list of lists.
479
+ spreadsheet_id : str, optional
480
+ The ID of the spreadsheet to write to. Falls back to the instance
481
+ default set via ``GSheet(spreadsheet_id=...)`` when omitted. If both
482
+ are None, ``spreadsheet_title`` must be provided and a new
483
+ spreadsheet will be created.
484
+ spreadsheet_title : str, optional
485
+ Title for a new spreadsheet. Used only if no spreadsheet_id is resolved.
486
+ A new spreadsheet will be created with this title.
487
+ range_name : str, optional
488
+ The A1 notation range to write to (e.g., 'Sheet1!A1').
489
+ sheet_name : str, optional
490
+ The name of the sheet to write to. Used if range_name is not provided.
491
+ value_input_option : str, optional
492
+ How to interpret the input data. Options: 'RAW' or 'USER_ENTERED' (default).
493
+ include_headers : bool, optional
494
+ Whether to include DataFrame column names as headers. Default is True.
495
+ clear_before_write : bool, optional
496
+ Whether to clear the range before writing. Default is False.
497
+ share_with : list[str] | str, optional
498
+ Email address(es) to share the spreadsheet with (only used when creating new spreadsheet).
499
+ role : str, optional
500
+ Permission level when sharing: 'reader', 'writer', or 'owner'. Default is 'writer'.
501
+ autofit_columns : bool, optional
502
+ Whether to auto-resize column widths to fit content after writing.
503
+ Default is True.
504
+ column_padding : int, optional
505
+ Extra pixels to add to each column width after auto-resize for readability.
506
+ Default is 30. Set to 0 for a tight fit with no padding.
507
+
508
+ Returns
509
+ -------
510
+ dict | tuple[dict, str]
511
+ If spreadsheet exists: returns the API response containing update information.
512
+ If spreadsheet created: returns tuple of (API response, spreadsheet_id).
513
+
514
+ Examples
515
+ --------
516
+ >>> gsheet = GSheet(credentials_path="creds.json")
517
+ >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
518
+ >>>
519
+ >>> # Write to existing spreadsheet
520
+ >>> gsheet.write_sheet(df, spreadsheet_id="1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms")
521
+ >>>
522
+ >>> # Create new spreadsheet and write data
523
+ >>> result, new_id = gsheet.write_sheet(
524
+ ... df,
525
+ ... spreadsheet_title="My Data",
526
+ ... share_with="user@example.com"
527
+ ... )
528
+ """
529
+ spreadsheet_id = self._resolve_spreadsheet_id(spreadsheet_id)
530
+
531
+ # Create new spreadsheet if ID not provided
532
+ created_new = False
533
+ if spreadsheet_id is None:
534
+ if spreadsheet_title is None:
535
+ log_and_raise_error(self._logger, "Either 'spreadsheet_id' or 'spreadsheet_title' must be provided")
536
+
537
+ # Determine sheet names from data if needed
538
+ sheet_names_to_create = None
539
+ if sheet_name:
540
+ sheet_names_to_create = [sheet_name]
541
+
542
+ spreadsheet_id = self.create_spreadsheet(
543
+ title=spreadsheet_title,
544
+ sheet_names=sheet_names_to_create,
545
+ share_with=None, # Don't share here, will be done after writing
546
+ role=role,
547
+ )
548
+ created_new = True
549
+ self._logger.info(f"Created new spreadsheet '{spreadsheet_title}' with ID: {spreadsheet_id}")
550
+
551
+ # Build the range
552
+ if range_name is None:
553
+ if sheet_name is None:
554
+ range_name = "Sheet1!A1"
555
+ else:
556
+ # Ensure the sheet exists (create if needed)
557
+ if not created_new:
558
+ self._ensure_sheet_exists(spreadsheet_id, sheet_name)
559
+ formatted_sheet = self._format_sheet_name(sheet_name)
560
+ range_name = f"{formatted_sheet}!A1"
561
+
562
+ # Convert DataFrame to list of lists if needed
563
+ # Build the range
564
+ if range_name is None:
565
+ if sheet_name is None:
566
+ range_name = "Sheet1!A1"
567
+ else:
568
+ formatted_sheet = self._format_sheet_name(sheet_name)
569
+ range_name = f"{formatted_sheet}!A1"
570
+
571
+ # Check if sheet exists and create it if needed (only when using sheet_name, not range_name)
572
+ if sheet_name and not created_new:
573
+ self._ensure_sheet_exists(spreadsheet_id, sheet_name)
574
+
575
+ # Convert DataFrame to list of lists if needed
576
+ if isinstance(data, pd.DataFrame):
577
+ values = self._dataframe_to_values(data, include_headers=include_headers)
578
+ else:
579
+ values = data
580
+
581
+ try:
582
+ if clear_before_write:
583
+ if "!" in range_name:
584
+ sheet_part = range_name.split("!")[0]
585
+ clear_range_full = f"{sheet_part}!A1:ZZZ100000"
586
+ else:
587
+ clear_range_full = "A1:ZZZ100000"
588
+ self.clear_range(spreadsheet_id, clear_range_full)
589
+
590
+ # Write the data
591
+ body = {"values": values}
592
+ result = (
593
+ self.service.spreadsheets()
594
+ .values()
595
+ .update(
596
+ spreadsheetId=spreadsheet_id,
597
+ range=range_name,
598
+ valueInputOption=value_input_option,
599
+ body=body,
600
+ )
601
+ .execute()
602
+ )
603
+
604
+ updated_cells = result.get("updatedCells", 0)
605
+ self._logger.info(f"Successfully wrote {updated_cells} cells to spreadsheet {spreadsheet_id}")
606
+
607
+ # Share the spreadsheet if email addresses are provided (works for both new and existing)
608
+ if share_with:
609
+ self.share_spreadsheet(
610
+ spreadsheet_id=spreadsheet_id,
611
+ email_addresses=share_with,
612
+ role=role,
613
+ send_notification=True,
614
+ )
615
+
616
+ # Auto-fit column widths to content
617
+ if autofit_columns:
618
+ num_cols = len(values[0]) if values else None
619
+ effective_sheet = sheet_name if sheet_name else "Sheet1"
620
+ self._autofit_columns(spreadsheet_id, effective_sheet, num_cols, padding_pixels=column_padding)
621
+
622
+ # Return spreadsheet_id if newly created, otherwise just the result
623
+ if created_new:
624
+ return result, spreadsheet_id
625
+ return result
626
+
627
+ except HttpError as e:
628
+ log_and_raise_error(
629
+ self._logger,
630
+ f"HTTP error writing to Google Sheet: {e}",
631
+ )
632
+ except Exception as e:
633
+ log_and_raise_error(
634
+ self._logger,
635
+ f"Error writing to Google Sheet: {e}",
636
+ )
637
+
638
+ def append_sheet(
639
+ self,
640
+ spreadsheet_id: str = None,
641
+ data: pd.DataFrame | list[list[Any]] = None,
642
+ range_name: str = None,
643
+ sheet_name: str = None,
644
+ value_input_option: str = "USER_ENTERED",
645
+ include_headers: bool = False,
646
+ autofit_columns: bool = False,
647
+ column_padding: int = 30,
648
+ ) -> dict:
649
+ """
650
+ Append data to a Google Sheet.
651
+
652
+ Parameters
653
+ ----------
654
+ spreadsheet_id : str, optional
655
+ The ID of the spreadsheet to append to. Falls back to the instance
656
+ default set via ``GSheet(spreadsheet_id=...)`` when omitted.
657
+ data : pd.DataFrame | list[list[Any]]
658
+ The data to append.
659
+ range_name : str, optional
660
+ The A1 notation range to append to (e.g., 'Sheet1!A1').
661
+ sheet_name : str, optional
662
+ The name of the sheet to append to. Used if range_name is not provided.
663
+ value_input_option : str, optional
664
+ How to interpret the input data. Options: 'RAW' or 'USER_ENTERED' (default).
665
+ include_headers : bool, optional
666
+ Whether to include DataFrame column names as headers. Default is False.
667
+ autofit_columns : bool, optional
668
+ Whether to auto-resize column widths to fit content after appending.
669
+ Default is False.
670
+ column_padding : int, optional
671
+ Extra pixels to add to each column width after auto-resize for readability.
672
+ Default is 30. Set to 0 for a tight fit with no padding.
673
+
674
+ Returns
675
+ -------
676
+ dict
677
+ The API response containing append information.
678
+
679
+ Examples
680
+ --------
681
+ >>> gsheet = GSheet(credentials_path="creds.json")
682
+ >>> df = pd.DataFrame({'A': [7, 8, 9], 'B': [10, 11, 12]})
683
+ >>> gsheet.append_sheet("1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms", df)
684
+ """
685
+ spreadsheet_id = self._resolve_spreadsheet_id(spreadsheet_id)
686
+ if spreadsheet_id is None:
687
+ log_and_raise_error(
688
+ self._logger,
689
+ "No spreadsheet_id provided and no default set on the GSheet instance",
690
+ )
691
+ if data is None:
692
+ log_and_raise_error(self._logger, "'data' is required for append_sheet")
693
+
694
+ # Build the range
695
+ if range_name is None:
696
+ if sheet_name is None:
697
+ range_name = "Sheet1!A1"
698
+ else:
699
+ # Ensure the sheet exists (create if needed)
700
+ self._ensure_sheet_exists(spreadsheet_id, sheet_name)
701
+ formatted_sheet = self._format_sheet_name(sheet_name)
702
+ range_name = f"{formatted_sheet}!A1"
703
+
704
+ # Convert DataFrame to list of lists if needed
705
+ if isinstance(data, pd.DataFrame):
706
+ values = self._dataframe_to_values(data, include_headers=include_headers)
707
+ else:
708
+ values = data
709
+
710
+ try:
711
+ body = {"values": values}
712
+ result = (
713
+ self.service.spreadsheets()
714
+ .values()
715
+ .append(
716
+ spreadsheetId=spreadsheet_id,
717
+ range=range_name,
718
+ valueInputOption=value_input_option,
719
+ insertDataOption="INSERT_ROWS",
720
+ body=body,
721
+ )
722
+ .execute()
723
+ )
724
+
725
+ updated_cells = result.get("updates", {}).get("updatedCells", 0)
726
+ self._logger.info(f"Successfully appended {updated_cells} cells to spreadsheet {spreadsheet_id}")
727
+
728
+ # Auto-fit column widths to content
729
+ if autofit_columns:
730
+ num_cols = len(values[0]) if values else None
731
+ effective_sheet = sheet_name if sheet_name else "Sheet1"
732
+ self._autofit_columns(spreadsheet_id, effective_sheet, num_cols, padding_pixels=column_padding)
733
+
734
+ return result
735
+
736
+ except HttpError as e:
737
+ log_and_raise_error(
738
+ self._logger,
739
+ f"HTTP error appending to Google Sheet: {e}",
740
+ )
741
+ except Exception as e:
742
+ log_and_raise_error(
743
+ self._logger,
744
+ f"Error appending to Google Sheet: {e}",
745
+ )
746
+
747
+ def clear_range(self, spreadsheet_id: str = None, range_name: str = None) -> dict:
748
+ """
749
+ Clear values from a range in a Google Sheet.
750
+
751
+ Parameters
752
+ ----------
753
+ spreadsheet_id : str, optional
754
+ The ID of the spreadsheet. Falls back to the instance default set
755
+ via ``GSheet(spreadsheet_id=...)`` when omitted.
756
+ range_name : str
757
+ The A1 notation range to clear.
758
+
759
+ Returns
760
+ -------
761
+ dict
762
+ The API response.
763
+
764
+ Examples
765
+ --------
766
+ >>> gsheet = GSheet(credentials_path="creds.json")
767
+ >>> gsheet.clear_range("1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms",
768
+ ... "Sheet1!A1:D10")
769
+ """
770
+ spreadsheet_id = self._resolve_spreadsheet_id(spreadsheet_id)
771
+ if spreadsheet_id is None:
772
+ log_and_raise_error(
773
+ self._logger,
774
+ "No spreadsheet_id provided and no default set on the GSheet instance",
775
+ )
776
+ if range_name is None:
777
+ log_and_raise_error(self._logger, "'range_name' is required for clear_range")
778
+
779
+ try:
780
+ result = (
781
+ self.service.spreadsheets().values().clear(spreadsheetId=spreadsheet_id, range=range_name).execute()
782
+ )
783
+
784
+ self._logger.info(f"Successfully cleared range {range_name}")
785
+ return result
786
+
787
+ except HttpError as e:
788
+ log_and_raise_error(
789
+ self._logger,
790
+ f"HTTP error clearing Google Sheet range: {e}",
791
+ )
792
+ except Exception as e:
793
+ log_and_raise_error(
794
+ self._logger,
795
+ f"Error clearing Google Sheet range: {e}",
796
+ )
797
+
798
+ def create_spreadsheet(
799
+ self,
800
+ title: str,
801
+ sheet_names: list[str] = None,
802
+ share_with: list[str] | str = None,
803
+ role: str = "writer",
804
+ send_notification: bool = True,
805
+ ) -> str:
806
+ """
807
+ Create a new Google Spreadsheet and optionally share it with specified email addresses.
808
+
809
+ Parameters
810
+ ----------
811
+ title : str
812
+ The title of the new spreadsheet.
813
+ sheet_names : list[str], optional
814
+ List of sheet names to create. If None, creates a single sheet named "Sheet1".
815
+ share_with : list[str] | str, optional
816
+ Email address(es) to share the spreadsheet with.
817
+ Can be a single email string or a list of email strings.
818
+ role : str, optional
819
+ Permission level for shared users: 'reader', 'writer', or 'owner'.
820
+ Default is 'writer'.
821
+ send_notification : bool, optional
822
+ Whether to send email notifications to users when sharing.
823
+ Default is True.
824
+
825
+ Returns
826
+ -------
827
+ str
828
+ The spreadsheet ID of the newly created spreadsheet.
829
+
830
+ Examples
831
+ --------
832
+ >>> gsheet = GSheet(credentials_path="creds.json")
833
+ >>> # Create and share with one person
834
+ >>> spreadsheet_id = gsheet.create_spreadsheet(
835
+ ... "My New Spreadsheet",
836
+ ... sheet_names=["Data", "Analysis"],
837
+ ... share_with="user@example.com"
838
+ ... )
839
+ >>>
840
+ >>> # Create and share with multiple people
841
+ >>> spreadsheet_id = gsheet.create_spreadsheet(
842
+ ... "Team Dashboard",
843
+ ... share_with=["alice@example.com", "bob@example.com"],
844
+ ... role="reader"
845
+ ... )
846
+ """
847
+ if self.drive_service is None:
848
+ log_and_raise_error(
849
+ self._logger,
850
+ "Drive API service is not initialized. Cannot create spreadsheet. "
851
+ "Please ensure the Drive API is enabled in your Google Cloud project.",
852
+ )
853
+
854
+ try:
855
+ # Use Drive API to create the spreadsheet file
856
+ file_metadata = {"name": title, "mimeType": "application/vnd.google-apps.spreadsheet"}
857
+
858
+ file = self.drive_service.files().create(body=file_metadata, fields="id").execute()
859
+
860
+ spreadsheet_id = file.get("id")
861
+ self._logger.info(f"Created new spreadsheet '{title}' with ID: {spreadsheet_id}")
862
+
863
+ # If custom sheet names are specified, update the spreadsheet
864
+ if sheet_names:
865
+ try:
866
+ requests = []
867
+ # Delete the default "Sheet1" if we're creating custom sheets
868
+ requests.append(
869
+ {
870
+ "deleteSheet": {
871
+ "sheetId": 0 # Default sheet ID
872
+ }
873
+ }
874
+ )
875
+ # Add custom sheets
876
+ for i, name in enumerate(sheet_names):
877
+ requests.append({"addSheet": {"properties": {"sheetId": i + 1, "title": name}}})
878
+
879
+ batch_update_request = {"requests": requests}
880
+ self.service.spreadsheets().batchUpdate(
881
+ spreadsheetId=spreadsheet_id, body=batch_update_request
882
+ ).execute()
883
+ self._logger.info(f"Added custom sheets: {sheet_names}")
884
+ except Exception as e:
885
+ self._logger.warning(f"Could not add custom sheets: {e}")
886
+
887
+ # Share the spreadsheet if email addresses are provided
888
+ if share_with:
889
+ self.share_spreadsheet(
890
+ spreadsheet_id=spreadsheet_id,
891
+ email_addresses=share_with,
892
+ role=role,
893
+ send_notification=send_notification,
894
+ )
895
+
896
+ return spreadsheet_id
897
+
898
+ except HttpError as e:
899
+ log_and_raise_error(
900
+ self._logger,
901
+ f"HTTP error creating Google Spreadsheet: {e}",
902
+ )
903
+ except Exception as e:
904
+ log_and_raise_error(
905
+ self._logger,
906
+ f"Error creating Google Spreadsheet: {e}",
907
+ )
908
+
909
+ def share_spreadsheet(
910
+ self,
911
+ spreadsheet_id: str = None,
912
+ email_addresses: list[str] | str = None,
913
+ role: str = "writer",
914
+ send_notification: bool = True,
915
+ ) -> list[dict]:
916
+ """
917
+ Share a Google Spreadsheet with one or more email addresses.
918
+
919
+ Parameters
920
+ ----------
921
+ spreadsheet_id : str, optional
922
+ The ID of the spreadsheet to share. Falls back to the instance
923
+ default set via ``GSheet(spreadsheet_id=...)`` when omitted.
924
+ email_addresses : list[str] | str
925
+ Email address(es) to share the spreadsheet with.
926
+ Can be a single email string or a list of email strings.
927
+ role : str, optional
928
+ Permission level: 'reader', 'writer', or 'owner'. Default is 'writer'.
929
+ send_notification : bool, optional
930
+ Whether to send email notifications. Default is True.
931
+
932
+ Returns
933
+ -------
934
+ list[dict]
935
+ List of permission objects created.
936
+
937
+ Examples
938
+ --------
939
+ >>> gsheet = GSheet(credentials_path="creds.json")
940
+ >>> gsheet.share_spreadsheet(
941
+ ... "1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms",
942
+ ... "user@example.com",
943
+ ... role="writer"
944
+ ... )
945
+ >>>
946
+ >>> # Share with multiple users
947
+ >>> gsheet.share_spreadsheet(
948
+ ... "1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms",
949
+ ... ["alice@example.com", "bob@example.com"],
950
+ ... role="reader"
951
+ ... )
952
+ """
953
+ spreadsheet_id = self._resolve_spreadsheet_id(spreadsheet_id)
954
+ if spreadsheet_id is None:
955
+ log_and_raise_error(
956
+ self._logger,
957
+ "No spreadsheet_id provided and no default set on the GSheet instance",
958
+ )
959
+ if email_addresses is None:
960
+ log_and_raise_error(self._logger, "'email_addresses' is required for share_spreadsheet")
961
+
962
+ if self.drive_service is None:
963
+ log_and_raise_error(
964
+ self._logger,
965
+ "Drive API service is not initialized. Cannot share spreadsheet.",
966
+ )
967
+
968
+ # Convert single email to list
969
+ if isinstance(email_addresses, str):
970
+ email_addresses = [email_addresses]
971
+
972
+ permissions = []
973
+ for email in email_addresses:
974
+ try:
975
+ permission = {
976
+ "type": "user",
977
+ "role": role,
978
+ "emailAddress": email,
979
+ }
980
+
981
+ result = (
982
+ self.drive_service.permissions()
983
+ .create(
984
+ fileId=spreadsheet_id,
985
+ body=permission,
986
+ sendNotificationEmail=send_notification,
987
+ )
988
+ .execute()
989
+ )
990
+
991
+ permissions.append(result)
992
+ self._logger.info(f"Shared spreadsheet {spreadsheet_id} with {email} as {role}")
993
+
994
+ except HttpError as e:
995
+ self._logger.error(f"Failed to share with {email}: {e}")
996
+ except Exception as e:
997
+ self._logger.error(f"Error sharing with {email}: {e}")
998
+
999
+ return permissions
1000
+
1001
def get_service_account_email(self) -> str:
    """
    Return the email address of the service account used by this connector.

    Share a Google Spreadsheet with this address to give the connector
    programmatic access to it.

    Returns
    -------
    str
        The service account email address.

    Examples
    --------
    >>> gsheet = GSheet(credentials_path="creds.json")
    >>> email = gsheet.get_service_account_email()
    >>> print(f"Share your spreadsheet with: {email}")
    """
    account_email = self.service_account_email
    return account_email
1019
+
1020
def _ensure_sheet_exists(self, spreadsheet_id: str, sheet_name: str) -> bool:
    """
    Make sure a sheet (tab) with the given name exists in the spreadsheet.

    The spreadsheet metadata is fetched and, when no sheet carries the
    requested title, an ``addSheet`` request is issued. Any error is logged
    as a warning and reported as ``False`` rather than raised.

    Parameters
    ----------
    spreadsheet_id : str
        The ID of the spreadsheet.
    sheet_name : str
        The name of the sheet to check/create.

    Returns
    -------
    bool
        True if the sheet was created; False if it already existed or the
        check/creation failed.
    """
    try:
        metadata = self.get_spreadsheet_info(spreadsheet_id)

        # Collect every tab title first, then test membership.
        known_titles = []
        for entry in metadata["sheets"]:
            known_titles.append(entry["properties"]["title"])

        already_present = sheet_name in known_titles
        if not already_present:
            add_sheet = {"addSheet": {"properties": {"title": sheet_name}}}
            payload = {"requests": [add_sheet]}
            sheets_api = self.service.spreadsheets()
            sheets_api.batchUpdate(spreadsheetId=spreadsheet_id, body=payload).execute()

            self._logger.info(f"Created new sheet '{sheet_name}' in spreadsheet {spreadsheet_id}")

        return not already_present

    except HttpError as e:
        self._logger.warning(f"Could not check/create sheet '{sheet_name}': {e}")
        return False
    except Exception as e:
        self._logger.warning(f"Error checking/creating sheet '{sheet_name}': {e}")
        return False
1059
+
1060
def _autofit_columns(
    self,
    spreadsheet_id: str,
    sheet_name: str = None,
    num_columns: int = None,
    padding_pixels: int = 30,
) -> None:
    """
    Fit column widths to their content and optionally widen them by a fixed padding.

    The sheet's numeric ID is looked up by title, an ``autoResizeDimensions``
    request is sent for its columns, and then ``_add_column_padding`` is
    called when ``padding_pixels`` is positive. Every failure is logged as a
    warning instead of being raised.

    Parameters
    ----------
    spreadsheet_id : str
        The ID of the spreadsheet.
    sheet_name : str, optional
        The name of the sheet. Defaults to "Sheet1".
    num_columns : int, optional
        Number of columns to resize. If None, no end index is sent and all
        columns in the sheet are resized.
    padding_pixels : int, optional
        Extra pixels to add to each column width after auto-resize.
        Default is 30. Set to 0 to skip padding.
    """
    try:
        target_name = sheet_name if sheet_name else "Sheet1"
        metadata = self.get_spreadsheet_info(spreadsheet_id)

        # First sheet whose title matches; a sheet ID of 0 is valid, so compare with None.
        sheet_id = next(
            (
                entry["properties"]["sheetId"]
                for entry in metadata["sheets"]
                if entry["properties"]["title"] == target_name
            ),
            None,
        )
        if sheet_id is None:
            self._logger.warning(f"Sheet '{target_name}' not found for auto-fit; skipping column resize")
            return

        # Step 1: let Sheets size the columns to their content.
        column_span = {"sheetId": sheet_id, "dimension": "COLUMNS", "startIndex": 0}
        if num_columns is not None:
            column_span["endIndex"] = num_columns

        resize_body = {"requests": [{"autoResizeDimensions": {"dimensions": column_span}}]}
        self.service.spreadsheets().batchUpdate(spreadsheetId=spreadsheet_id, body=resize_body).execute()
        self._logger.debug(f"Auto-fit columns for sheet '{target_name}'")

        # Step 2: widen each column a little so the data does not look cramped.
        if padding_pixels > 0:
            self._add_column_padding(spreadsheet_id, sheet_id, target_name, num_columns, padding_pixels)

    except Exception as e:
        self._logger.warning(f"Could not auto-fit columns: {e}")
1120
+
1121
+ def _add_column_padding(
1122
+ self,
1123
+ spreadsheet_id: str,
1124
+ sheet_id: int,
1125
+ sheet_name: str,
1126
+ num_columns: int | None,
1127
+ padding_pixels: int,
1128
+ ) -> None:
1129
+ """
1130
+ Add extra padding to column widths after auto-resize.
1131
+
1132
+ Reads back the current column widths from the sheet metadata and adds
1133
+ the specified padding to each column.
1134
+
1135
+ Parameters
1136
+ ----------
1137
+ spreadsheet_id : str
1138
+ The ID of the spreadsheet.
1139
+ sheet_id : int
1140
+ The numeric sheet ID within the spreadsheet.
1141
+ sheet_name : str
1142
+ The sheet name (used for logging).
1143
+ num_columns : int | None
1144
+ Number of columns to pad. If None, pads all columns with metadata.
1145
+ padding_pixels : int
1146
+ Extra pixels to add to each column width.
1147
+ """
1148
+ try:
1149
+ fields = "sheets(properties(sheetId,title),data(columnMetadata(pixelSize)))"
1150
+ spreadsheet = self.service.spreadsheets().get(spreadsheetId=spreadsheet_id, fields=fields).execute()
1151
+
1152
+ column_widths = []
1153
+ for sheet in spreadsheet.get("sheets", []):
1154
+ if sheet["properties"]["sheetId"] == sheet_id:
1155
+ for data_section in sheet.get("data", []):
1156
+ for col_meta in data_section.get("columnMetadata", []):
1157
+ column_widths.append(col_meta.get("pixelSize", 100))
1158
+ break
1159
+
1160
+ if not column_widths:
1161
+ self._logger.debug("No column metadata found; skipping padding")
1162
+ return
1163
+
1164
+ end_col = num_columns if num_columns is not None else len(column_widths)
1165
+ end_col = min(end_col, len(column_widths))
1166
+
1167
+ padding_requests = []
1168
+ for i in range(end_col):
1169
+ new_width = column_widths[i] + padding_pixels
1170
+ padding_requests.append(
1171
+ {
1172
+ "updateDimensionProperties": {
1173
+ "range": {
1174
+ "sheetId": sheet_id,
1175
+ "dimension": "COLUMNS",
1176
+ "startIndex": i,
1177
+ "endIndex": i + 1,
1178
+ },
1179
+ "properties": {"pixelSize": new_width},
1180
+ "fields": "pixelSize",
1181
+ }
1182
+ }
1183
+ )
1184
+
1185
+ if padding_requests:
1186
+ self.service.spreadsheets().batchUpdate(
1187
+ spreadsheetId=spreadsheet_id, body={"requests": padding_requests}
1188
+ ).execute()
1189
+ self._logger.debug(f"Added {padding_pixels}px padding to {end_col} columns in sheet '{sheet_name}'")
1190
+
1191
+ except Exception as e:
1192
+ self._logger.warning(f"Could not add column padding: {e}")
1193
+
1194
+ def format_columns_as_percent(
1195
+ self,
1196
+ spreadsheet_id: str = None,
1197
+ columns: list[str | int] = None,
1198
+ sheet_name: str = None,
1199
+ pattern: str = "0.0%",
1200
+ has_header: bool = True,
1201
+ ) -> dict:
1202
+ """
1203
+ Apply percent number formatting to one or more columns in a sheet.
1204
+
1205
+ Values should be written as raw ratios (e.g. 0.143, not "14.3%"). Users
1206
+ will see "14.3%" in the UI while sorting and filtering remain numeric.
1207
+
1208
+ Parameters
1209
+ ----------
1210
+ spreadsheet_id : str
1211
+ The ID of the spreadsheet to format.
1212
+ columns : list[str | int]
1213
+ Columns to format. Each entry is either a column name (matched
1214
+ against the header row) or a 0-based column index.
1215
+ sheet_name : str, optional
1216
+ The name of the sheet. Defaults to "Sheet1".
1217
+ pattern : str, optional
1218
+ Google Sheets number format pattern. Default is "0.0%".
1219
+ Examples: "0%", "0.00%", "0.0%;[red]-0.0%".
1220
+ has_header : bool, optional
1221
+ If True (default), the first row is treated as a header and left
1222
+ unformatted; formatting starts at row 2. If False, formatting
1223
+ starts at row 1.
1224
+
1225
+ Returns
1226
+ -------
1227
+ dict
1228
+ The batchUpdate API response.
1229
+
1230
+ Examples
1231
+ --------
1232
+ >>> gsheet = GSheet(credentials_path="creds.json")
1233
+ >>> gsheet.write_sheet(df, spreadsheet_id=sid, sheet_name="Summary")
1234
+ >>> gsheet.format_columns_as_percent(
1235
+ ... spreadsheet_id=sid,
1236
+ ... columns=["conversion_rate", "bounce_rate"],
1237
+ ... sheet_name="Summary",
1238
+ ... )
1239
+ """
1240
+ return self._apply_number_format(
1241
+ spreadsheet_id=spreadsheet_id,
1242
+ columns=columns,
1243
+ number_format={"type": "PERCENT", "pattern": pattern},
1244
+ sheet_name=sheet_name,
1245
+ has_header=has_header,
1246
+ )
1247
+
1248
+ def format_columns_as_number(
1249
+ self,
1250
+ spreadsheet_id: str = None,
1251
+ columns: list[str | int] = None,
1252
+ sheet_name: str = None,
1253
+ pattern: str = "#,##0.00",
1254
+ has_header: bool = True,
1255
+ ) -> dict:
1256
+ """
1257
+ Apply numeric formatting (e.g. thousands separators) to one or more columns.
1258
+
1259
+ Values should be written as raw numbers (e.g. 6302320.01). With the
1260
+ default pattern, users will see "6,302,320.01" in the UI while sorting
1261
+ and filtering remain numeric.
1262
+
1263
+ Parameters
1264
+ ----------
1265
+ spreadsheet_id : str
1266
+ The ID of the spreadsheet to format.
1267
+ columns : list[str | int]
1268
+ Columns to format. Each entry is either a column name (matched
1269
+ against the header row) or a 0-based column index.
1270
+ sheet_name : str, optional
1271
+ The name of the sheet. Defaults to "Sheet1".
1272
+ pattern : str, optional
1273
+ Google Sheets number format pattern. Default is "#,##0.00".
1274
+ Examples: "#,##0" (integer with thousands), "#,##0.00" (two decimals),
1275
+ "$#,##0.00" (currency-style prefix), "#,##0.00;[red]-#,##0.00".
1276
+ has_header : bool, optional
1277
+ If True (default), the first row is treated as a header and left
1278
+ unformatted; formatting starts at row 2. If False, formatting
1279
+ starts at row 1.
1280
+
1281
+ Returns
1282
+ -------
1283
+ dict
1284
+ The batchUpdate API response.
1285
+
1286
+ Examples
1287
+ --------
1288
+ >>> gsheet = GSheet(credentials_path="creds.json")
1289
+ >>> gsheet.write_sheet(df, spreadsheet_id=sid, sheet_name="Summary")
1290
+ >>> gsheet.format_columns_as_number(
1291
+ ... spreadsheet_id=sid,
1292
+ ... columns=["revenue", "cost"],
1293
+ ... sheet_name="Summary",
1294
+ ... pattern="#,##0.00",
1295
+ ... )
1296
+ """
1297
+ return self._apply_number_format(
1298
+ spreadsheet_id=spreadsheet_id,
1299
+ columns=columns,
1300
+ number_format={"type": "NUMBER", "pattern": pattern},
1301
+ sheet_name=sheet_name,
1302
+ has_header=has_header,
1303
+ )
1304
+
1305
+ def format_columns_as_date(
1306
+ self,
1307
+ spreadsheet_id: str = None,
1308
+ columns: list[str | int] = None,
1309
+ sheet_name: str = None,
1310
+ pattern: str = "yyyy-mm-dd",
1311
+ has_header: bool = True,
1312
+ include_time: bool = False,
1313
+ ) -> dict:
1314
+ """
1315
+ Apply date (or date-time) formatting to one or more columns.
1316
+
1317
+ Works with cells that Sheets has parsed as dates. When writing via
1318
+ ``write_sheet``/``append_sheet`` with ``value_input_option="USER_ENTERED"``
1319
+ (the default), the helper ``_dataframe_to_values`` converts pandas
1320
+ datetime columns to ``"YYYY-MM-DD HH:MM:SS"`` strings, which Sheets
1321
+ parses back into serial date values. Applying this format controls
1322
+ how they are displayed while preserving sort/filter semantics.
1323
+
1324
+ Parameters
1325
+ ----------
1326
+ spreadsheet_id : str
1327
+ The ID of the spreadsheet to format.
1328
+ columns : list[str | int]
1329
+ Columns to format. Each entry is either a column name (matched
1330
+ against the header row) or a 0-based column index.
1331
+ sheet_name : str, optional
1332
+ The name of the sheet. Defaults to "Sheet1".
1333
+ pattern : str, optional
1334
+ Google Sheets date/time format pattern. Default is "yyyy-mm-dd".
1335
+ Examples: "yyyy-mm-dd", "dd/mm/yyyy", "mmm d, yyyy",
1336
+ "yyyy-mm-dd hh:mm:ss".
1337
+ has_header : bool, optional
1338
+ If True (default), the first row is treated as a header and left
1339
+ unformatted; formatting starts at row 2. If False, formatting
1340
+ starts at row 1.
1341
+ include_time : bool, optional
1342
+ If True, use the DATE_TIME number format type (so the cell is
1343
+ treated as a timestamp). If False (default), use the DATE type.
1344
+ The ``pattern`` still controls the exact display either way.
1345
+
1346
+ Returns
1347
+ -------
1348
+ dict
1349
+ The batchUpdate API response.
1350
+
1351
+ Examples
1352
+ --------
1353
+ >>> gsheet = GSheet(credentials_path="creds.json")
1354
+ >>> gsheet.write_sheet(df, spreadsheet_id=sid, sheet_name="Summary")
1355
+ >>> gsheet.format_columns_as_date(
1356
+ ... spreadsheet_id=sid,
1357
+ ... columns=["created_at", "updated_at"],
1358
+ ... sheet_name="Summary",
1359
+ ... pattern="yyyy-mm-dd hh:mm:ss",
1360
+ ... include_time=True,
1361
+ ... )
1362
+ """
1363
+ fmt_type = "DATE_TIME" if include_time else "DATE"
1364
+ return self._apply_number_format(
1365
+ spreadsheet_id=spreadsheet_id,
1366
+ columns=columns,
1367
+ number_format={"type": fmt_type, "pattern": pattern},
1368
+ sheet_name=sheet_name,
1369
+ has_header=has_header,
1370
+ )
1371
+
1372
+ def _apply_number_format(
1373
+ self,
1374
+ spreadsheet_id: str,
1375
+ columns: list[str | int],
1376
+ number_format: dict,
1377
+ sheet_name: str = None,
1378
+ has_header: bool = True,
1379
+ ) -> dict:
1380
+ """
1381
+ Apply a Google Sheets numberFormat dict to a set of columns.
1382
+
1383
+ Shared implementation for format_columns_as_percent and
1384
+ format_columns_as_number. Resolves column names via the header row,
1385
+ looks up the sheetId, and issues a single batchUpdate with one
1386
+ repeatCell request per column.
1387
+ """
1388
+ spreadsheet_id = self._resolve_spreadsheet_id(spreadsheet_id)
1389
+ if spreadsheet_id is None:
1390
+ log_and_raise_error(
1391
+ self._logger,
1392
+ "No spreadsheet_id provided and no default set on the GSheet instance",
1393
+ )
1394
+ if columns is None:
1395
+ log_and_raise_error(self._logger, "'columns' is required for column formatting")
1396
+
1397
+ target_sheet = sheet_name or "Sheet1"
1398
+
1399
+ info = self.get_spreadsheet_info(spreadsheet_id)
1400
+ sheet_id = None
1401
+ for sheet in info.get("sheets", []):
1402
+ if sheet["properties"]["title"] == target_sheet:
1403
+ sheet_id = sheet["properties"]["sheetId"]
1404
+ break
1405
+ if sheet_id is None:
1406
+ log_and_raise_error(
1407
+ self._logger,
1408
+ f"Sheet '{target_sheet}' not found in spreadsheet {spreadsheet_id}",
1409
+ )
1410
+
1411
+ header = None
1412
+ column_indices = []
1413
+ for col in columns:
1414
+ if isinstance(col, bool) or not isinstance(col, int | str):
1415
+ log_and_raise_error(self._logger, f"Invalid column identifier: {col!r}")
1416
+ if isinstance(col, int):
1417
+ column_indices.append(col)
1418
+ continue
1419
+ if header is None:
1420
+ formatted_sheet = self._format_sheet_name(target_sheet)
1421
+ header_range = f"{formatted_sheet}!1:1"
1422
+ result = (
1423
+ self.service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=header_range).execute()
1424
+ )
1425
+ header_values = result.get("values", [])
1426
+ header = header_values[0] if header_values else []
1427
+ try:
1428
+ column_indices.append(header.index(col))
1429
+ except ValueError:
1430
+ log_and_raise_error(
1431
+ self._logger,
1432
+ f"Column '{col}' not found in header row of sheet '{target_sheet}'",
1433
+ )
1434
+
1435
+ start_row = 1 if has_header else 0
1436
+ requests = [
1437
+ {
1438
+ "repeatCell": {
1439
+ "range": {
1440
+ "sheetId": sheet_id,
1441
+ "startRowIndex": start_row,
1442
+ "startColumnIndex": idx,
1443
+ "endColumnIndex": idx + 1,
1444
+ },
1445
+ "cell": {"userEnteredFormat": {"numberFormat": number_format}},
1446
+ "fields": "userEnteredFormat.numberFormat",
1447
+ }
1448
+ }
1449
+ for idx in column_indices
1450
+ ]
1451
+
1452
+ try:
1453
+ result = (
1454
+ self.service.spreadsheets()
1455
+ .batchUpdate(spreadsheetId=spreadsheet_id, body={"requests": requests})
1456
+ .execute()
1457
+ )
1458
+ self._logger.info(
1459
+ f"Applied {number_format['type']} format '{number_format['pattern']}' "
1460
+ f"to {len(requests)} column(s) in sheet '{target_sheet}'"
1461
+ )
1462
+ return result
1463
+
1464
+ except HttpError as e:
1465
+ log_and_raise_error(
1466
+ self._logger,
1467
+ f"HTTP error applying number format: {e}",
1468
+ )
1469
+ except Exception as e:
1470
+ log_and_raise_error(
1471
+ self._logger,
1472
+ f"Error applying number format: {e}",
1473
+ )
1474
+
1475
def get_spreadsheet_info(self, spreadsheet_id: str = None) -> dict:
    """
    Fetch the metadata of a spreadsheet.

    Parameters
    ----------
    spreadsheet_id : str, optional
        The ID of the spreadsheet. Falls back to the instance default set
        via ``GSheet(spreadsheet_id=...)`` when omitted.

    Returns
    -------
    dict
        Spreadsheet metadata including sheets, properties, etc.

    Examples
    --------
    >>> gsheet = GSheet(credentials_path="creds.json")
    >>> info = gsheet.get_spreadsheet_info("1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms")
    >>> print(info['properties']['title'])
    """
    resolved_id = self._resolve_spreadsheet_id(spreadsheet_id)
    if resolved_id is None:
        log_and_raise_error(
            self._logger,
            "No spreadsheet_id provided and no default set on the GSheet instance",
        )

    try:
        lookup = self.service.spreadsheets().get(spreadsheetId=resolved_id)
        return lookup.execute()
    except HttpError as e:
        log_and_raise_error(self._logger, f"HTTP error getting spreadsheet info: {e}")
    except Exception as e:
        log_and_raise_error(self._logger, f"Error getting spreadsheet info: {e}")
1517
+
1518
def to_csv(self, spreadsheet_id: str = None, range_name: str = None, sheet_name: str = None) -> str:
    """
    Read a Google Sheet and return its contents as CSV text.

    The data is read through ``read_sheet`` as a DataFrame and serialized
    without the index column.

    Parameters
    ----------
    spreadsheet_id : str, optional
        The ID of the spreadsheet to read from. Falls back to the instance
        default set via ``GSheet(spreadsheet_id=...)`` when omitted.
    range_name : str, optional
        The A1 notation range to read.
    sheet_name : str, optional
        The name of the sheet to read from.

    Returns
    -------
    str
        CSV formatted string.

    Examples
    --------
    >>> gsheet = GSheet(credentials_path="creds.json")
    >>> csv_data = gsheet.to_csv("1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms")
    """
    frame = self.read_sheet(spreadsheet_id, range_name, sheet_name, return_as="dataframe")
    csv_text = frame.to_csv(index=False)
    return csv_text
1544
+
1545
def gsheet_to_s3(
    self,
    spreadsheet_id: str = None,
    file_name: str = None,
    directory: str = None,
    range_name: str = None,
    sheet_name: str = None,
    file_format: str = "csv",
    s3_connector=None,
    bucket: str = None,
) -> None:
    """
    Transfer data from Google Sheet to S3.

    The sheet is read into a DataFrame, NaN and infinite values are replaced
    with empty strings, and the result is uploaded with ``put_object`` under
    ``<s3_root>/<directory>/<file_name>.<file_format>``. All arguments are
    validated before the sheet is read, so an invalid call performs no
    network I/O.

    Parameters
    ----------
    spreadsheet_id : str, optional
        The ID of the spreadsheet to read from. Falls back to the instance
        default set via ``GSheet(spreadsheet_id=...)`` when omitted.
    file_name : str
        The name of the file (without extension). The extension is appended
        unless the name already ends with it.
    directory : str, optional
        The directory path where the file will be saved.
    range_name : str, optional
        The A1 notation range to read.
    sheet_name : str, optional
        The name of the sheet to read from.
    file_format : str, optional
        File format to save: 'csv' or 'parquet'. Default is 'csv'.
    s3_connector : S3Connector, optional
        Existing S3Connector instance. If provided, bucket parameter is
        ignored and bucket is taken from the connector.
    bucket : str, optional
        S3 bucket name. Only used if s3_connector is None. If both
        s3_connector and bucket are None, raises an error.

    Examples
    --------
    >>> gsheet = GSheet(credentials_path="creds.json")
    >>> s3 = S3Connector(bucket="my-bucket", s3_root="my-project")
    >>> gsheet.gsheet_to_s3(
    ...     spreadsheet_id="1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms",
    ...     s3_connector=s3,
    ...     directory="data",
    ...     file_name="output"
    ... )
    """
    if file_name is None:
        log_and_raise_error(self._logger, "'file_name' is required for gsheet_to_s3")

    # Validate cheap arguments up front, before any Sheets or S3 call is made.
    normalized_format = file_format.lower()
    if normalized_format not in ("csv", "parquet"):
        log_and_raise_error(
            self._logger,
            f"Unsupported file format: {file_format}. Use 'csv' or 'parquet'.",
        )
    if s3_connector is None and bucket is None:
        log_and_raise_error(
            self._logger,
            "Either 's3_connector' or 'bucket' parameter must be provided.",
        )

    # Read data from Google Sheet (read_sheet resolves the spreadsheet_id)
    df = self.read_sheet(spreadsheet_id, range_name, sheet_name, return_as="dataframe")

    # Additional cleaning for S3 transfer
    df = df.fillna("")
    df = df.replace([float("inf"), float("-inf")], "")

    # Initialize S3 connector if not provided
    if s3_connector is None:
        # Imported lazily so callers that pass their own connector do not
        # need the S3 connector module to be importable.
        from .s3_connector import S3Connector

        s3_connector = S3Connector(bucket=bucket, auto_sso_login=True)

    # Get bucket and s3_root from s3_connector
    target_bucket = s3_connector.bucket
    s3_root = s3_connector.s3_root

    # Add file extension if not present
    file_extension = f".{normalized_format}"
    if file_name.endswith(file_extension):
        file_name_with_ext = file_name
    else:
        file_name_with_ext = f"{file_name}{file_extension}"

    # Strip slashes first and only then drop empty segments, so a segment
    # such as "/" cannot leave a "//" in the key.
    segments = [part.strip("/") for part in (s3_root, directory, file_name_with_ext) if part]
    full_s3_key = "/".join(segment for segment in segments if segment)

    # Convert to appropriate format
    if normalized_format == "csv":
        buffer = io.StringIO()
        df.to_csv(buffer, index=False)
    else:
        buffer = io.BytesIO()
        df.to_parquet(buffer, index=False)
    body = buffer.getvalue()

    # Upload to S3
    s3_connector.s3.put_object(Bucket=target_bucket, Key=full_s3_key, Body=body)
    self._logger.info(f"Successfully transferred data to s3://{target_bucket}/{full_s3_key}")