ml-analytics-tools 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml_analytics/__init__.py +53 -0
- ml_analytics/aws_auth.py +169 -0
- ml_analytics/cli.py +58 -0
- ml_analytics/data_connector.py +2615 -0
- ml_analytics/gsheet_connector.py +1646 -0
- ml_analytics/model_manager.py +1208 -0
- ml_analytics/model_tools.py +990 -0
- ml_analytics/s3_connector.py +1381 -0
- ml_analytics/slack_connector.py +637 -0
- ml_analytics/tunnel_manager.py +277 -0
- ml_analytics/utils.py +673 -0
- ml_analytics_tools-0.2.0.dist-info/METADATA +231 -0
- ml_analytics_tools-0.2.0.dist-info/RECORD +17 -0
- ml_analytics_tools-0.2.0.dist-info/WHEEL +5 -0
- ml_analytics_tools-0.2.0.dist-info/entry_points.txt +4 -0
- ml_analytics_tools-0.2.0.dist-info/licenses/LICENSE +21 -0
- ml_analytics_tools-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1646 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Google Sheets connector for reading and writing data to Google Sheets.
|
|
3
|
+
|
|
4
|
+
Data Cleaning Features:
|
|
5
|
+
- Automatically handles missing values (NaN, None) by converting to empty strings
|
|
6
|
+
- Replaces infinity values (inf, -inf) with empty strings
|
|
7
|
+
- Normalizes null-like string values ('None', 'none', 'null', 'NULL')
|
|
8
|
+
- Converts object columns to strings to avoid type issues
|
|
9
|
+
- Pads rows with missing columns to match header length (handles trailing empty cells)
|
|
10
|
+
- Truncates rows that are longer than headers
|
|
11
|
+
- All cleaning is applied automatically during read/write operations
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import io
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
import pandas as pd
|
|
21
|
+
from google.oauth2 import service_account
|
|
22
|
+
from googleapiclient.discovery import build
|
|
23
|
+
from googleapiclient.errors import HttpError
|
|
24
|
+
|
|
25
|
+
from .utils import get_credential_value, get_logger, log_and_raise_error
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class GSheet:
|
|
29
|
+
"""
|
|
30
|
+
A connector class for interacting with Google Sheets API.
|
|
31
|
+
|
|
32
|
+
This class provides methods to read from and write to Google Sheets,
|
|
33
|
+
with support for both service account and OAuth2 authentication.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
@staticmethod
|
|
37
|
+
def _dataframe_to_values(data: pd.DataFrame, include_headers: bool = True) -> list[list[Any]]:
|
|
38
|
+
"""
|
|
39
|
+
Convert a DataFrame into JSON-serializable rows for the Sheets API.
|
|
40
|
+
|
|
41
|
+
Handles Categorical, NaN/None/inf, datetime/Timestamp (including tz-aware),
|
|
42
|
+
Period, and timedelta columns that would otherwise fail json.dumps.
|
|
43
|
+
"""
|
|
44
|
+
from pandas.api import types as pdt
|
|
45
|
+
|
|
46
|
+
data_clean = data.copy()
|
|
47
|
+
for col in data_clean.columns:
|
|
48
|
+
series = data_clean[col]
|
|
49
|
+
if isinstance(series.dtype, pd.CategoricalDtype):
|
|
50
|
+
data_clean[col] = series.astype(object)
|
|
51
|
+
continue
|
|
52
|
+
if pdt.is_datetime64_any_dtype(series):
|
|
53
|
+
data_clean[col] = series.dt.strftime("%Y-%m-%d %H:%M:%S")
|
|
54
|
+
elif isinstance(series.dtype, pd.PeriodDtype) or pdt.is_timedelta64_dtype(series):
|
|
55
|
+
data_clean[col] = series.astype(str)
|
|
56
|
+
data_clean = data_clean.fillna("")
|
|
57
|
+
data_clean = data_clean.replace([float("inf"), float("-inf")], "")
|
|
58
|
+
for col in data_clean.columns:
|
|
59
|
+
if data_clean[col].dtype == "object":
|
|
60
|
+
data_clean[col] = data_clean[col].astype(str).replace("nan", "").replace("None", "")
|
|
61
|
+
if include_headers:
|
|
62
|
+
return [data_clean.columns.tolist()] + data_clean.values.tolist()
|
|
63
|
+
return data_clean.values.tolist()
|
|
64
|
+
|
|
65
|
+
@staticmethod
|
|
66
|
+
def _format_sheet_name(sheet_name: str) -> str:
|
|
67
|
+
"""
|
|
68
|
+
Format a sheet name for use in A1 notation.
|
|
69
|
+
Adds single quotes around the name if it contains spaces or special characters.
|
|
70
|
+
|
|
71
|
+
Parameters
|
|
72
|
+
----------
|
|
73
|
+
sheet_name : str
|
|
74
|
+
The sheet name to format.
|
|
75
|
+
|
|
76
|
+
Returns
|
|
77
|
+
-------
|
|
78
|
+
str
|
|
79
|
+
Properly formatted sheet name for A1 notation.
|
|
80
|
+
"""
|
|
81
|
+
# If sheet name contains spaces or special characters, wrap in single quotes
|
|
82
|
+
if any(char in sheet_name for char in [" ", "!", "'"]):
|
|
83
|
+
# Escape any single quotes in the sheet name by doubling them
|
|
84
|
+
escaped_name = sheet_name.replace("'", "''")
|
|
85
|
+
return f"'{escaped_name}'"
|
|
86
|
+
return sheet_name
|
|
87
|
+
|
|
88
|
+
@staticmethod
def _find_google_credentials_file() -> Path | None:
    """
    Search for a Google service account credentials JSON file starting from the project root,
    then current directory and parent directory.

    Only ``*.json`` files directly inside each directory are inspected. A file matches when it
    parses to a JSON object whose ``type`` is ``"service_account"`` and which either mentions
    ``googleapis.com`` in ``auth_uri`` / ``token_uri`` or has a ``client_email`` key. Files that
    cannot be read or parsed are skipped.

    Returns
    -------
    Path | None
        Path to the first matching credentials file, None when no file matches.
    """
    from .utils import find_project_root

    directories_to_search = []

    # Try to find project root first
    try:
        directories_to_search.append(find_project_root())
    except FileNotFoundError:
        # No project root could be located; the cwd-based locations below still apply
        pass

    # Then check current directory and parent directory
    current_dir = Path.cwd()
    directories_to_search.extend([current_dir, current_dir.parent])

    # dict.fromkeys drops repeated directories (e.g. project root == cwd) while keeping order
    for search_dir in dict.fromkeys(directories_to_search):
        for json_file in search_dir.glob("*.json"):
            try:
                # JSON text is UTF-8 by specification; do not depend on the platform default
                with open(json_file, encoding="utf-8") as f:
                    data = json.load(f)
            except (OSError, ValueError, RecursionError):
                # Unreadable, undecodable or malformed file: not a usable candidate.
                # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
                continue

            # Check if it's a Google service account file
            if not isinstance(data, dict) or data.get("type") != "service_account":
                continue
            if (
                "googleapis.com" in str(data.get("auth_uri", ""))
                or "googleapis.com" in str(data.get("token_uri", ""))
                or "client_email" in data
            ):
                return json_file

    return None
|
|
135
|
+
|
|
136
|
+
def __init__(
    self,
    credentials_path: str | Path = None,
    credentials_json: dict = None,
    scopes: list[str] = None,
    log_level: str = "INFO",
    scope: str = "ml",
    spreadsheet_id: str = None,
):
    """
    Set up an authenticated Google Sheets client and, when possible, a Drive client.

    Parameters
    ----------
    credentials_path : str | Path, optional
        Location of a service account credentials JSON file.
    credentials_json : dict, optional
        Service account credentials already loaded into a dictionary
        (alternative to ``credentials_path``).
    scopes : list[str], optional
        Google API scopes to request. When omitted, the ``spreadsheets``,
        ``drive.file`` and ``drive`` scopes are requested.
    log_level : str, optional
        Level applied to this connector's logger. Default is "INFO".
    scope : str, optional
        Secret scope forwarded to the credential lookup helper. Default is "ml".
    spreadsheet_id : str, optional
        Spreadsheet ID stored on the instance as the default for later calls.
        When it is None, the ``GSHEET_SPREADSHEET_ID`` environment variable is
        used instead if it is set.

    Examples
    --------
    >>> # From a credentials file
    >>> gsheet = GSheet(credentials_path="path/to/credentials.json")
    >>>
    >>> # From an in-memory credentials dictionary
    >>> gsheet = GSheet(credentials_json=json.loads(os.environ['GOOGLE_CREDENTIALS']))
    >>>
    >>> # With a custom secret scope and a default spreadsheet
    >>> gsheet = GSheet(scope="custom-scope",
    ...                 spreadsheet_id="1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms")
    >>> gsheet.read_sheet()  # uses the default ID
    """
    self._logger = get_logger("GSheet")
    self._logger.setLevel(log_level)
    self._scope = scope

    # Without explicit scopes, request Sheets access plus both Drive scopes
    self.scopes = (
        [
            "https://www.googleapis.com/auth/spreadsheets",
            "https://www.googleapis.com/auth/drive.file",
            "https://www.googleapis.com/auth/drive",
        ]
        if scopes is None
        else scopes
    )

    # Credentials must be resolved before either API client can be built
    self.credentials = self._initialize_credentials(credentials_path, credentials_json)
    self.service_account_email = self.credentials.service_account_email

    # Sheets client: a failure here is handed to log_and_raise_error
    try:
        self.service = build("sheets", "v4", credentials=self.credentials)
    except Exception as e:
        log_and_raise_error(self._logger, f"Failed to initialize Google Sheets API service: {e}")

    # Drive client: a failure is only a warning and leaves drive_service as None
    try:
        drive_client = build("drive", "v3", credentials=self.credentials)
    except Exception as e:
        self._logger.warning(f"Failed to initialize Google Drive API service: {e}")
        drive_client = None
    self.drive_service = drive_client

    self._logger.info("Google API services initialized successfully")

    # An explicit argument wins; otherwise consult the environment
    default_id = spreadsheet_id
    if default_id is None:
        default_id = os.environ.get("GSHEET_SPREADSHEET_ID")
        if default_id:
            self._logger.debug("Using GSHEET_SPREADSHEET_ID from environment")
    self.spreadsheet_id = default_id
|
|
227
|
+
|
|
228
|
+
def _assemble_credentials_from_components(self) -> dict | None:
    """
    Build a service account credentials dictionary from separately stored secrets.

    The following secrets are looked up with the instance's secret scope:

    - GOOGLE_PROJECT_ID
    - GOOGLE_API_PKEY_ID
    - GOOGLE_API_PKEY
    - GOOGLE_CLIENT_EMAIL
    - GOOGLE_CLIENT_ID
    - GOOGLE_CERT_URL

    Returns
    -------
    dict | None
        The assembled credentials dictionary, or None when any step of the
        lookup or assembly raises (the reason is logged at debug level).
    """
    # (credentials field, secret name) pairs, in lookup order
    secret_names = (
        ("project_id", "GOOGLE_PROJECT_ID"),
        ("private_key_id", "GOOGLE_API_PKEY_ID"),
        ("private_key", "GOOGLE_API_PKEY"),
        ("client_email", "GOOGLE_CLIENT_EMAIL"),
        ("client_id", "GOOGLE_CLIENT_ID"),
        ("client_x509_cert_url", "GOOGLE_CERT_URL"),
    )

    try:
        fetched = {}
        for field, secret in secret_names:
            fetched[field] = get_credential_value(secret, scope=self._scope)

        # Secrets may carry the key with literal backslash-n sequences; turn
        # them back into real line breaks
        if "\\n" in fetched["private_key"]:
            fetched["private_key"] = fetched["private_key"].replace("\\n", "\n")

        assembled = {
            "type": "service_account",
            "project_id": fetched["project_id"],
            "private_key_id": fetched["private_key_id"],
            "private_key": fetched["private_key"],
            "client_email": fetched["client_email"],
            "client_id": fetched["client_id"],
            "auth_uri": "https://accounts.google.com/o/oauth2/auth",
            "token_uri": "https://oauth2.googleapis.com/token",
            "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
            "client_x509_cert_url": fetched["client_x509_cert_url"],
        }
    except Exception as e:
        self._logger.debug(f"Could not assemble credentials from individual components: {e}")
        return None

    self._logger.debug("Assembled Google credentials from individual Vault secrets")
    return assembled
|
|
278
|
+
|
|
279
|
+
def _initialize_credentials(
    self, credentials_path: str | Path = None, credentials_json: dict = None
) -> service_account.Credentials:
    """
    Initialize Google API credentials from file or dictionary.

    When neither argument is given, sources are tried in this order:

    1. the ``GOOGLE_CREDENTIALS`` secret, parsed as a single JSON document;
    2. credentials assembled from individual secrets;
    3. ``gsheet_credentials.json`` in the current working directory;
    4. any service account JSON file located by ``_find_google_credentials_file``.

    When both arguments are given, ``credentials_path`` takes precedence.

    Parameters
    ----------
    credentials_path : str | Path, optional
        Path to credentials JSON file.
    credentials_json : dict, optional
        Credentials dictionary.

    Returns
    -------
    service_account.Credentials
        Google service account credentials created with ``self.scopes``.
    """
    # Try to auto-load from default location if no credentials provided
    if credentials_path is None and credentials_json is None:
        # First try to get credentials from environment variable or mounted secret (single JSON)
        try:
            credentials_str = get_credential_value("GOOGLE_CREDENTIALS", scope=self._scope)
            credentials_json = json.loads(credentials_str)
            self._logger.debug("Using GOOGLE_CREDENTIALS from environment or mounted secret")
        except Exception:
            # Lookup or JSON parsing failed.
            # Try assembling from individual Vault secrets
            credentials_json = self._assemble_credentials_from_components()

        if credentials_json is None:
            # Fall back to file-based credential resolution
            # First try the default filename
            default_path = Path.cwd() / "gsheet_credentials.json"
            if default_path.exists():
                credentials_path = default_path
            else:
                # Search for any JSON file containing Google credentials
                found_creds = GSheet._find_google_credentials_file()
                if found_creds:
                    credentials_path = found_creds
                else:
                    self._logger.error(
                        "No credentials provided and no Google credentials JSON file found in current directory."
                    )
                    self._logger.info(
                        "Please provide credentials via 'credentials_path' parameter or place a Google service account JSON file in the current directory."  # noqa: E501
                    )
                    # NOTE(review): presumably log_and_raise_error raises here; if it
                    # returned instead, the code below would run with both sources
                    # still None - confirm against the helper.
                    log_and_raise_error(
                        self._logger,
                        "Either 'credentials_path' or 'credentials_json' must be provided, or a Google service account JSON file must exist in the current directory",  # noqa: E501
                    )

    try:
        # A path (explicit or discovered above) is used in preference to a dictionary
        if credentials_path is not None:
            credentials_path = Path(credentials_path)
            if not credentials_path.exists():
                # NOTE(review): this call sits inside the try, so if the helper raises
                # an Exception subclass it is caught below and reported a second time
                # with the "Failed to initialize credentials" prefix - confirm intended.
                log_and_raise_error(
                    self._logger,
                    f"Credentials file not found at: {credentials_path}",
                )
            credentials = service_account.Credentials.from_service_account_file(
                str(credentials_path), scopes=self.scopes
            )
        else:
            credentials = service_account.Credentials.from_service_account_info(
                credentials_json, scopes=self.scopes
            )

        return credentials

    except Exception as e:
        log_and_raise_error(self._logger, f"Failed to initialize credentials: {e}")
|
|
351
|
+
|
|
352
|
+
def _resolve_spreadsheet_id(self, spreadsheet_id: str | None) -> str | None:
|
|
353
|
+
"""Return the per-call spreadsheet_id if provided, else the instance default."""
|
|
354
|
+
return spreadsheet_id if spreadsheet_id is not None else self.spreadsheet_id
|
|
355
|
+
|
|
356
|
+
def read_sheet(
    self,
    spreadsheet_id: str = None,
    range_name: str = None,
    sheet_name: str = None,
    return_as: str = "dataframe",
) -> pd.DataFrame | list[list[Any]]:
    """
    Fetch cell values from a Google Sheet.

    Parameters
    ----------
    spreadsheet_id : str, optional
        ID of the spreadsheet to read. When omitted, the instance default
        (``GSheet(spreadsheet_id=...)``) is used.
    range_name : str, optional
        A1 notation range to read (e.g., 'Sheet1!A1:D10'). When omitted,
        columns A through ZZ are read, from ``sheet_name`` if one is given.
    sheet_name : str, optional
        Sheet to read from. Only consulted when ``range_name`` is omitted.
    return_as : str, optional
        'dataframe' (default) returns a DataFrame whose columns come from the
        first row; any other value returns the raw list of rows.

    Returns
    -------
    pd.DataFrame | list[list[Any]]
        The sheet contents. An empty DataFrame or empty list is returned when
        the range holds no values.

    Examples
    --------
    >>> gsheet = GSheet(credentials_path="creds.json")
    >>> df = gsheet.read_sheet("1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms")
    >>> rows = gsheet.read_sheet("1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms",
    ...                          range_name="Sheet1!A1:D10", return_as="list")
    """
    spreadsheet_id = self._resolve_spreadsheet_id(spreadsheet_id)
    if spreadsheet_id is None:
        log_and_raise_error(
            self._logger,
            "No spreadsheet_id provided and no default set on the GSheet instance",
        )

    # Default range: columns A..ZZ, optionally qualified by the sheet name
    if range_name is None and sheet_name is None:
        range_name = "A:ZZ"
    elif range_name is None:
        range_name = f"{self._format_sheet_name(sheet_name)}!A:ZZ"

    try:
        request = self.service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_name)
        values = request.execute().get("values", [])

        if not values:
            self._logger.warning(f"No data found in range: {range_name}")
            if return_as == "dataframe":
                return pd.DataFrame()
            return []

        self._logger.info(f"Successfully read {len(values)} rows from spreadsheet {spreadsheet_id}")

        if return_as != "dataframe":
            return values

        # First row supplies the column labels
        headers = values[0]
        width = len(headers)

        def fit_to_header(row):
            # The API omits trailing empty cells, so short rows are padded with
            # empty strings; rows wider than the header are cut down.
            missing = width - len(row)
            if missing > 0:
                return row + [""] * missing
            if missing < 0:
                return row[:width]
            return row

        frame = pd.DataFrame([fit_to_header(row) for row in values[1:]], columns=headers)
        # Blank out missing values and null-like strings
        frame = frame.fillna("")
        return frame.replace([None, "None", "none", "null", "NULL"], "")

    except HttpError as e:
        log_and_raise_error(
            self._logger,
            f"HTTP error reading from Google Sheet: {e}",
        )
    except Exception as e:
        log_and_raise_error(
            self._logger,
            f"Error reading from Google Sheet: {e}",
        )
|
|
456
|
+
|
|
457
|
+
def write_sheet(
    self,
    data: pd.DataFrame | list[list[Any]],
    spreadsheet_id: str = None,
    spreadsheet_title: str = None,
    range_name: str = None,
    sheet_name: str = None,
    value_input_option: str = "USER_ENTERED",
    include_headers: bool = True,
    clear_before_write: bool = False,
    share_with: list[str] | str = None,
    role: str = "writer",
    autofit_columns: bool = True,
    column_padding: int = 30,
) -> dict | tuple[dict, str]:
    """
    Write data to a Google Sheet. Creates a new spreadsheet if no ID is resolved.

    Parameters
    ----------
    data : pd.DataFrame | list[list[Any]]
        The data to write. A DataFrame is converted to rows first; a list of
        lists is sent as given.
    spreadsheet_id : str, optional
        The ID of the spreadsheet to write to. Falls back to the instance
        default set via ``GSheet(spreadsheet_id=...)`` when omitted. If both
        are None, ``spreadsheet_title`` must be provided and a new
        spreadsheet will be created.
    spreadsheet_title : str, optional
        Title for a new spreadsheet. Used only if no spreadsheet_id is resolved.
    range_name : str, optional
        The A1 notation range to write to (e.g., 'Sheet1!A1'). Defaults to
        cell A1 of ``sheet_name``, or 'Sheet1!A1' when no sheet is named.
    sheet_name : str, optional
        The name of the sheet to write to. For an existing spreadsheet the
        sheet is ensured to exist before writing.
    value_input_option : str, optional
        How to interpret the input data. Options: 'RAW' or 'USER_ENTERED' (default).
    include_headers : bool, optional
        Whether to include DataFrame column names as headers. Default is True.
    clear_before_write : bool, optional
        Whether to clear A1:ZZZ100000 of the target sheet before writing.
        Default is False.
    share_with : list[str] | str, optional
        Email address(es) to share the spreadsheet with after writing. Applies
        to newly created and existing spreadsheets alike.
    role : str, optional
        Permission level when sharing: 'reader', 'writer', or 'owner'. Default is 'writer'.
    autofit_columns : bool, optional
        Whether to auto-resize column widths to fit content after writing.
        Default is True.
    column_padding : int, optional
        Extra pixels to add to each column width after auto-resize for readability.
        Default is 30. Set to 0 for a tight fit with no padding.

    Returns
    -------
    dict | tuple[dict, str]
        If spreadsheet exists: returns the API response containing update information.
        If spreadsheet created: returns tuple of (API response, spreadsheet_id).

    Examples
    --------
    >>> gsheet = GSheet(credentials_path="creds.json")
    >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    >>>
    >>> # Write to existing spreadsheet
    >>> gsheet.write_sheet(df, spreadsheet_id="1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms")
    >>>
    >>> # Create new spreadsheet and write data
    >>> result, new_id = gsheet.write_sheet(
    ...     df,
    ...     spreadsheet_title="My Data",
    ...     share_with="user@example.com"
    ... )
    """
    spreadsheet_id = self._resolve_spreadsheet_id(spreadsheet_id)

    # Create new spreadsheet if ID not provided
    created_new = False
    if spreadsheet_id is None:
        if spreadsheet_title is None:
            log_and_raise_error(self._logger, "Either 'spreadsheet_id' or 'spreadsheet_title' must be provided")

        spreadsheet_id = self.create_spreadsheet(
            title=spreadsheet_title,
            sheet_names=[sheet_name] if sheet_name else None,
            share_with=None,  # Don't share here, will be done after writing
            role=role,
        )
        created_new = True
        self._logger.info(f"Created new spreadsheet '{spreadsheet_title}' with ID: {spreadsheet_id}")

    # A freshly created spreadsheet already has the requested sheet; for an
    # existing one, make sure the sheet is there. This is done exactly once,
    # whether or not an explicit range_name was supplied.
    if sheet_name and not created_new:
        self._ensure_sheet_exists(spreadsheet_id, sheet_name)

    # Build the range
    if range_name is None:
        if sheet_name is None:
            range_name = "Sheet1!A1"
        else:
            range_name = f"{self._format_sheet_name(sheet_name)}!A1"

    # Convert DataFrame to list of lists if needed
    if isinstance(data, pd.DataFrame):
        values = self._dataframe_to_values(data, include_headers=include_headers)
    else:
        values = data

    try:
        if clear_before_write:
            # Clear a large fixed block on the sheet named in the range (or on
            # the default sheet when the range carries no sheet part)
            if "!" in range_name:
                sheet_part = range_name.split("!")[0]
                clear_range_full = f"{sheet_part}!A1:ZZZ100000"
            else:
                clear_range_full = "A1:ZZZ100000"
            self.clear_range(spreadsheet_id, clear_range_full)

        # Write the data
        body = {"values": values}
        result = (
            self.service.spreadsheets()
            .values()
            .update(
                spreadsheetId=spreadsheet_id,
                range=range_name,
                valueInputOption=value_input_option,
                body=body,
            )
            .execute()
        )

        updated_cells = result.get("updatedCells", 0)
        self._logger.info(f"Successfully wrote {updated_cells} cells to spreadsheet {spreadsheet_id}")

        # Share the spreadsheet if email addresses are provided (works for both new and existing)
        if share_with:
            self.share_spreadsheet(
                spreadsheet_id=spreadsheet_id,
                email_addresses=share_with,
                role=role,
                send_notification=True,
            )

        # Auto-fit column widths to content
        if autofit_columns:
            num_cols = len(values[0]) if values else None
            # NOTE(review): the sheet is taken from sheet_name, not from an
            # explicit range_name; with only range_name given, 'Sheet1' is resized.
            effective_sheet = sheet_name if sheet_name else "Sheet1"
            self._autofit_columns(spreadsheet_id, effective_sheet, num_cols, padding_pixels=column_padding)

        # Return spreadsheet_id if newly created, otherwise just the result
        if created_new:
            return result, spreadsheet_id
        return result

    except HttpError as e:
        log_and_raise_error(
            self._logger,
            f"HTTP error writing to Google Sheet: {e}",
        )
    except Exception as e:
        log_and_raise_error(
            self._logger,
            f"Error writing to Google Sheet: {e}",
        )
|
|
637
|
+
|
|
638
|
+
def append_sheet(
    self,
    spreadsheet_id: str = None,
    data: pd.DataFrame | list[list[Any]] = None,
    range_name: str = None,
    sheet_name: str = None,
    value_input_option: str = "USER_ENTERED",
    include_headers: bool = False,
    autofit_columns: bool = False,
    column_padding: int = 30,
) -> dict:
    """
    Add rows to a Google Sheet using the API's append operation.

    Parameters
    ----------
    spreadsheet_id : str, optional
        ID of the spreadsheet to append to. When omitted, the instance default
        (``GSheet(spreadsheet_id=...)``) is used.
    data : pd.DataFrame | list[list[Any]]
        Rows to append. Required despite the None default.
    range_name : str, optional
        A1 notation range to append to (e.g., 'Sheet1!A1'). When omitted, cell
        A1 of ``sheet_name`` is used, or 'Sheet1!A1' if no sheet is named.
    sheet_name : str, optional
        Sheet to append to. When ``range_name`` is omitted, the sheet is
        ensured to exist first.
    value_input_option : str, optional
        How the input is interpreted: 'RAW' or 'USER_ENTERED' (default).
    include_headers : bool, optional
        Whether DataFrame column names are sent as a first row. Default is False.
    autofit_columns : bool, optional
        Whether column widths are auto-resized after appending. Default is False.
    column_padding : int, optional
        Extra pixels passed to the auto-resize step. Default is 30.

    Returns
    -------
    dict
        The API response for the append request.

    Examples
    --------
    >>> gsheet = GSheet(credentials_path="creds.json")
    >>> df = pd.DataFrame({'A': [7, 8, 9], 'B': [10, 11, 12]})
    >>> gsheet.append_sheet("1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms", df)
    """
    spreadsheet_id = self._resolve_spreadsheet_id(spreadsheet_id)
    if spreadsheet_id is None:
        log_and_raise_error(
            self._logger,
            "No spreadsheet_id provided and no default set on the GSheet instance",
        )
    if data is None:
        log_and_raise_error(self._logger, "'data' is required for append_sheet")

    # Work out the target range when the caller did not give one
    if range_name is None and sheet_name is None:
        range_name = "Sheet1!A1"
    elif range_name is None:
        # Create the sheet if it is missing before addressing it
        self._ensure_sheet_exists(spreadsheet_id, sheet_name)
        range_name = f"{self._format_sheet_name(sheet_name)}!A1"

    # DataFrames are flattened to rows; lists are sent untouched
    rows = (
        self._dataframe_to_values(data, include_headers=include_headers)
        if isinstance(data, pd.DataFrame)
        else data
    )

    try:
        append_request = (
            self.service.spreadsheets()
            .values()
            .append(
                spreadsheetId=spreadsheet_id,
                range=range_name,
                valueInputOption=value_input_option,
                insertDataOption="INSERT_ROWS",
                body={"values": rows},
            )
        )
        response = append_request.execute()

        cell_count = response.get("updates", {}).get("updatedCells", 0)
        self._logger.info(f"Successfully appended {cell_count} cells to spreadsheet {spreadsheet_id}")

        if autofit_columns:
            # Width comes from the first row; the sheet defaults to 'Sheet1'
            column_count = len(rows[0]) if rows else None
            self._autofit_columns(
                spreadsheet_id,
                sheet_name or "Sheet1",
                column_count,
                padding_pixels=column_padding,
            )

        return response

    except HttpError as e:
        log_and_raise_error(
            self._logger,
            f"HTTP error appending to Google Sheet: {e}",
        )
    except Exception as e:
        log_and_raise_error(
            self._logger,
            f"Error appending to Google Sheet: {e}",
        )
|
|
746
|
+
|
|
747
|
+
def clear_range(self, spreadsheet_id: str = None, range_name: str = None) -> dict:
    """
    Remove the values stored in an A1 range of a Google Sheet.

    Parameters
    ----------
    spreadsheet_id : str, optional
        ID of the spreadsheet to modify. When omitted, whatever
        ``_resolve_spreadsheet_id`` returns (the instance default) is used.
    range_name : str
        A1 notation of the range to clear, e.g. ``"Sheet1!A1:D10"``.
        Required, even though the signature gives it a ``None`` default.

    Returns
    -------
    dict
        The response of the ``values().clear`` API call.

    Notes
    -----
    Missing arguments and API failures are reported through
    ``log_and_raise_error``.

    Examples
    --------
    >>> gsheet = GSheet(credentials_path="creds.json")
    >>> gsheet.clear_range("1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms",
    ...                    "Sheet1!A1:D10")
    """
    target_id = self._resolve_spreadsheet_id(spreadsheet_id)
    if target_id is None:
        log_and_raise_error(
            self._logger,
            "No spreadsheet_id provided and no default set on the GSheet instance",
        )
    if range_name is None:
        log_and_raise_error(self._logger, "'range_name' is required for clear_range")

    try:
        # Build the request step by step, then execute it.
        values_api = self.service.spreadsheets().values()
        clear_call = values_api.clear(spreadsheetId=target_id, range=range_name)
        response = clear_call.execute()

        self._logger.info(f"Successfully cleared range {range_name}")
        return response

    except HttpError as http_err:
        log_and_raise_error(
            self._logger,
            f"HTTP error clearing Google Sheet range: {http_err}",
        )
    except Exception as err:
        log_and_raise_error(
            self._logger,
            f"Error clearing Google Sheet range: {err}",
        )
|
|
797
|
+
|
|
798
|
+
def create_spreadsheet(
|
|
799
|
+
self,
|
|
800
|
+
title: str,
|
|
801
|
+
sheet_names: list[str] = None,
|
|
802
|
+
share_with: list[str] | str = None,
|
|
803
|
+
role: str = "writer",
|
|
804
|
+
send_notification: bool = True,
|
|
805
|
+
) -> str:
|
|
806
|
+
"""
|
|
807
|
+
Create a new Google Spreadsheet and optionally share it with specified email addresses.
|
|
808
|
+
|
|
809
|
+
Parameters
|
|
810
|
+
----------
|
|
811
|
+
title : str
|
|
812
|
+
The title of the new spreadsheet.
|
|
813
|
+
sheet_names : list[str], optional
|
|
814
|
+
List of sheet names to create. If None, creates a single sheet named "Sheet1".
|
|
815
|
+
share_with : list[str] | str, optional
|
|
816
|
+
Email address(es) to share the spreadsheet with.
|
|
817
|
+
Can be a single email string or a list of email strings.
|
|
818
|
+
role : str, optional
|
|
819
|
+
Permission level for shared users: 'reader', 'writer', or 'owner'.
|
|
820
|
+
Default is 'writer'.
|
|
821
|
+
send_notification : bool, optional
|
|
822
|
+
Whether to send email notifications to users when sharing.
|
|
823
|
+
Default is True.
|
|
824
|
+
|
|
825
|
+
Returns
|
|
826
|
+
-------
|
|
827
|
+
str
|
|
828
|
+
The spreadsheet ID of the newly created spreadsheet.
|
|
829
|
+
|
|
830
|
+
Examples
|
|
831
|
+
--------
|
|
832
|
+
>>> gsheet = GSheet(credentials_path="creds.json")
|
|
833
|
+
>>> # Create and share with one person
|
|
834
|
+
>>> spreadsheet_id = gsheet.create_spreadsheet(
|
|
835
|
+
... "My New Spreadsheet",
|
|
836
|
+
... sheet_names=["Data", "Analysis"],
|
|
837
|
+
... share_with="user@example.com"
|
|
838
|
+
... )
|
|
839
|
+
>>>
|
|
840
|
+
>>> # Create and share with multiple people
|
|
841
|
+
>>> spreadsheet_id = gsheet.create_spreadsheet(
|
|
842
|
+
... "Team Dashboard",
|
|
843
|
+
... share_with=["alice@example.com", "bob@example.com"],
|
|
844
|
+
... role="reader"
|
|
845
|
+
... )
|
|
846
|
+
"""
|
|
847
|
+
if self.drive_service is None:
|
|
848
|
+
log_and_raise_error(
|
|
849
|
+
self._logger,
|
|
850
|
+
"Drive API service is not initialized. Cannot create spreadsheet. "
|
|
851
|
+
"Please ensure the Drive API is enabled in your Google Cloud project.",
|
|
852
|
+
)
|
|
853
|
+
|
|
854
|
+
try:
|
|
855
|
+
# Use Drive API to create the spreadsheet file
|
|
856
|
+
file_metadata = {"name": title, "mimeType": "application/vnd.google-apps.spreadsheet"}
|
|
857
|
+
|
|
858
|
+
file = self.drive_service.files().create(body=file_metadata, fields="id").execute()
|
|
859
|
+
|
|
860
|
+
spreadsheet_id = file.get("id")
|
|
861
|
+
self._logger.info(f"Created new spreadsheet '{title}' with ID: {spreadsheet_id}")
|
|
862
|
+
|
|
863
|
+
# If custom sheet names are specified, update the spreadsheet
|
|
864
|
+
if sheet_names:
|
|
865
|
+
try:
|
|
866
|
+
requests = []
|
|
867
|
+
# Delete the default "Sheet1" if we're creating custom sheets
|
|
868
|
+
requests.append(
|
|
869
|
+
{
|
|
870
|
+
"deleteSheet": {
|
|
871
|
+
"sheetId": 0 # Default sheet ID
|
|
872
|
+
}
|
|
873
|
+
}
|
|
874
|
+
)
|
|
875
|
+
# Add custom sheets
|
|
876
|
+
for i, name in enumerate(sheet_names):
|
|
877
|
+
requests.append({"addSheet": {"properties": {"sheetId": i + 1, "title": name}}})
|
|
878
|
+
|
|
879
|
+
batch_update_request = {"requests": requests}
|
|
880
|
+
self.service.spreadsheets().batchUpdate(
|
|
881
|
+
spreadsheetId=spreadsheet_id, body=batch_update_request
|
|
882
|
+
).execute()
|
|
883
|
+
self._logger.info(f"Added custom sheets: {sheet_names}")
|
|
884
|
+
except Exception as e:
|
|
885
|
+
self._logger.warning(f"Could not add custom sheets: {e}")
|
|
886
|
+
|
|
887
|
+
# Share the spreadsheet if email addresses are provided
|
|
888
|
+
if share_with:
|
|
889
|
+
self.share_spreadsheet(
|
|
890
|
+
spreadsheet_id=spreadsheet_id,
|
|
891
|
+
email_addresses=share_with,
|
|
892
|
+
role=role,
|
|
893
|
+
send_notification=send_notification,
|
|
894
|
+
)
|
|
895
|
+
|
|
896
|
+
return spreadsheet_id
|
|
897
|
+
|
|
898
|
+
except HttpError as e:
|
|
899
|
+
log_and_raise_error(
|
|
900
|
+
self._logger,
|
|
901
|
+
f"HTTP error creating Google Spreadsheet: {e}",
|
|
902
|
+
)
|
|
903
|
+
except Exception as e:
|
|
904
|
+
log_and_raise_error(
|
|
905
|
+
self._logger,
|
|
906
|
+
f"Error creating Google Spreadsheet: {e}",
|
|
907
|
+
)
|
|
908
|
+
|
|
909
|
+
def share_spreadsheet(
|
|
910
|
+
self,
|
|
911
|
+
spreadsheet_id: str = None,
|
|
912
|
+
email_addresses: list[str] | str = None,
|
|
913
|
+
role: str = "writer",
|
|
914
|
+
send_notification: bool = True,
|
|
915
|
+
) -> list[dict]:
|
|
916
|
+
"""
|
|
917
|
+
Share a Google Spreadsheet with one or more email addresses.
|
|
918
|
+
|
|
919
|
+
Parameters
|
|
920
|
+
----------
|
|
921
|
+
spreadsheet_id : str, optional
|
|
922
|
+
The ID of the spreadsheet to share. Falls back to the instance
|
|
923
|
+
default set via ``GSheet(spreadsheet_id=...)`` when omitted.
|
|
924
|
+
email_addresses : list[str] | str
|
|
925
|
+
Email address(es) to share the spreadsheet with.
|
|
926
|
+
Can be a single email string or a list of email strings.
|
|
927
|
+
role : str, optional
|
|
928
|
+
Permission level: 'reader', 'writer', or 'owner'. Default is 'writer'.
|
|
929
|
+
send_notification : bool, optional
|
|
930
|
+
Whether to send email notifications. Default is True.
|
|
931
|
+
|
|
932
|
+
Returns
|
|
933
|
+
-------
|
|
934
|
+
list[dict]
|
|
935
|
+
List of permission objects created.
|
|
936
|
+
|
|
937
|
+
Examples
|
|
938
|
+
--------
|
|
939
|
+
>>> gsheet = GSheet(credentials_path="creds.json")
|
|
940
|
+
>>> gsheet.share_spreadsheet(
|
|
941
|
+
... "1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms",
|
|
942
|
+
... "user@example.com",
|
|
943
|
+
... role="writer"
|
|
944
|
+
... )
|
|
945
|
+
>>>
|
|
946
|
+
>>> # Share with multiple users
|
|
947
|
+
>>> gsheet.share_spreadsheet(
|
|
948
|
+
... "1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms",
|
|
949
|
+
... ["alice@example.com", "bob@example.com"],
|
|
950
|
+
... role="reader"
|
|
951
|
+
... )
|
|
952
|
+
"""
|
|
953
|
+
spreadsheet_id = self._resolve_spreadsheet_id(spreadsheet_id)
|
|
954
|
+
if spreadsheet_id is None:
|
|
955
|
+
log_and_raise_error(
|
|
956
|
+
self._logger,
|
|
957
|
+
"No spreadsheet_id provided and no default set on the GSheet instance",
|
|
958
|
+
)
|
|
959
|
+
if email_addresses is None:
|
|
960
|
+
log_and_raise_error(self._logger, "'email_addresses' is required for share_spreadsheet")
|
|
961
|
+
|
|
962
|
+
if self.drive_service is None:
|
|
963
|
+
log_and_raise_error(
|
|
964
|
+
self._logger,
|
|
965
|
+
"Drive API service is not initialized. Cannot share spreadsheet.",
|
|
966
|
+
)
|
|
967
|
+
|
|
968
|
+
# Convert single email to list
|
|
969
|
+
if isinstance(email_addresses, str):
|
|
970
|
+
email_addresses = [email_addresses]
|
|
971
|
+
|
|
972
|
+
permissions = []
|
|
973
|
+
for email in email_addresses:
|
|
974
|
+
try:
|
|
975
|
+
permission = {
|
|
976
|
+
"type": "user",
|
|
977
|
+
"role": role,
|
|
978
|
+
"emailAddress": email,
|
|
979
|
+
}
|
|
980
|
+
|
|
981
|
+
result = (
|
|
982
|
+
self.drive_service.permissions()
|
|
983
|
+
.create(
|
|
984
|
+
fileId=spreadsheet_id,
|
|
985
|
+
body=permission,
|
|
986
|
+
sendNotificationEmail=send_notification,
|
|
987
|
+
)
|
|
988
|
+
.execute()
|
|
989
|
+
)
|
|
990
|
+
|
|
991
|
+
permissions.append(result)
|
|
992
|
+
self._logger.info(f"Shared spreadsheet {spreadsheet_id} with {email} as {role}")
|
|
993
|
+
|
|
994
|
+
except HttpError as e:
|
|
995
|
+
self._logger.error(f"Failed to share with {email}: {e}")
|
|
996
|
+
except Exception as e:
|
|
997
|
+
self._logger.error(f"Error sharing with {email}: {e}")
|
|
998
|
+
|
|
999
|
+
return permissions
|
|
1000
|
+
|
|
1001
|
+
def get_service_account_email(self) -> str:
    """
    Return the email address stored in ``self.service_account_email``.

    Spreadsheets must be shared with this address before they can be
    accessed programmatically through this connector.

    Returns
    -------
    str
        The service account email address.

    Examples
    --------
    >>> gsheet = GSheet(credentials_path="creds.json")
    >>> email = gsheet.get_service_account_email()
    >>> print(f"Share your spreadsheet with: {email}")
    """
    account_email = self.service_account_email
    return account_email
|
|
1019
|
+
|
|
1020
|
+
def _ensure_sheet_exists(self, spreadsheet_id: str, sheet_name: str) -> bool:
    """
    Make sure a sheet with the given title exists, adding it when missing.

    Parameters
    ----------
    spreadsheet_id : str
        The ID of the spreadsheet.
    sheet_name : str
        Title of the sheet to look for (compared with ``in`` against the
        titles reported by ``get_spreadsheet_info``).

    Returns
    -------
    bool
        True only when a new sheet was created. False when the sheet was
        already present, and also when the check or the creation failed
        (failures are logged as warnings, not raised).
    """
    created = False
    try:
        # Collect the titles of every sheet currently in the spreadsheet.
        info = self.get_spreadsheet_info(spreadsheet_id)
        known_titles = tuple(entry["properties"]["title"] for entry in info["sheets"])

        if sheet_name not in known_titles:
            add_sheet = {"addSheet": {"properties": {"title": sheet_name}}}
            self.service.spreadsheets().batchUpdate(
                spreadsheetId=spreadsheet_id, body={"requests": [add_sheet]}
            ).execute()

            self._logger.info(f"Created new sheet '{sheet_name}' in spreadsheet {spreadsheet_id}")
            created = True

    except HttpError as http_err:
        self._logger.warning(f"Could not check/create sheet '{sheet_name}': {http_err}")
    except Exception as err:
        self._logger.warning(f"Error checking/creating sheet '{sheet_name}': {err}")

    return created
|
|
1059
|
+
|
|
1060
|
+
def _autofit_columns(
    self,
    spreadsheet_id: str,
    sheet_name: str = None,
    num_columns: int = None,
    padding_pixels: int = 30,
) -> None:
    """
    Auto-resize a sheet's columns to their content and optionally pad them.

    Sends an ``autoResizeDimensions`` request for the columns, then, when
    ``padding_pixels`` is positive, delegates to ``_add_column_padding`` to
    widen each column. Any exception is logged as a warning and swallowed,
    so a failed resize never aborts the caller.

    Parameters
    ----------
    spreadsheet_id : str
        The ID of the spreadsheet.
    sheet_name : str, optional
        The name of the sheet. Defaults to "Sheet1".
    num_columns : int, optional
        Number of columns to resize, counted from the first column. If None,
        no end index is sent with the request.
    padding_pixels : int, optional
        Extra pixels to add to each column width after auto-resize.
        Default is 30. Values of 0 or less skip the padding step.
    """
    try:
        target_name = sheet_name or "Sheet1"
        metadata = self.get_spreadsheet_info(spreadsheet_id)

        # Translate the sheet title into its numeric sheetId.
        sheet_id = next(
            (
                entry["properties"]["sheetId"]
                for entry in metadata["sheets"]
                if entry["properties"]["title"] == target_name
            ),
            None,
        )
        if sheet_id is None:
            self._logger.warning(f"Sheet '{target_name}' not found for auto-fit; skipping column resize")
            return

        # Step 1: Auto-resize columns to fit content
        column_span = {"sheetId": sheet_id, "dimension": "COLUMNS", "startIndex": 0}
        if num_columns is not None:
            column_span["endIndex"] = num_columns

        resize_body = {"requests": [{"autoResizeDimensions": {"dimensions": column_span}}]}
        self.service.spreadsheets().batchUpdate(spreadsheetId=spreadsheet_id, body=resize_body).execute()
        self._logger.debug(f"Auto-fit columns for sheet '{target_name}'")

        # Step 2: Add padding to each column for better readability
        if padding_pixels > 0:
            self._add_column_padding(spreadsheet_id, sheet_id, target_name, num_columns, padding_pixels)

    except Exception as err:
        self._logger.warning(f"Could not auto-fit columns: {err}")
|
|
1120
|
+
|
|
1121
|
+
def _add_column_padding(
    self,
    spreadsheet_id: str,
    sheet_id: int,
    sheet_name: str,
    num_columns: int | None,
    padding_pixels: int,
) -> None:
    """
    Widen columns by a fixed number of pixels.

    Fetches the current ``pixelSize`` of every column of the sheet and sends
    one ``updateDimensionProperties`` request per padded column in a single
    ``batchUpdate``. Any exception is logged as a warning and swallowed.

    Parameters
    ----------
    spreadsheet_id : str
        The ID of the spreadsheet.
    sheet_id : int
        The numeric sheet ID within the spreadsheet.
    sheet_name : str
        The sheet name (used for logging only).
    num_columns : int | None
        Number of leading columns to pad. If None, or larger than the number
        of columns with metadata, all columns with metadata are padded.
    padding_pixels : int
        Extra pixels to add to each column width.
    """
    try:
        fields = "sheets(properties(sheetId,title),data(columnMetadata(pixelSize)))"
        spreadsheet = self.service.spreadsheets().get(spreadsheetId=spreadsheet_id, fields=fields).execute()

        # Locate the requested sheet, then flatten its column metadata.
        matching_sheet = next(
            (entry for entry in spreadsheet.get("sheets", []) if entry["properties"]["sheetId"] == sheet_id),
            None,
        )
        if matching_sheet is None:
            current_widths = []
        else:
            current_widths = [
                meta.get("pixelSize", 100)  # fall back to 100 when no size is reported
                for section in matching_sheet.get("data", [])
                for meta in section.get("columnMetadata", [])
            ]

        if not current_widths:
            self._logger.debug("No column metadata found; skipping padding")
            return

        available = len(current_widths)
        end_col = available if num_columns is None else min(num_columns, available)

        padding_requests = [
            {
                "updateDimensionProperties": {
                    "range": {
                        "sheetId": sheet_id,
                        "dimension": "COLUMNS",
                        "startIndex": col_idx,
                        "endIndex": col_idx + 1,
                    },
                    "properties": {"pixelSize": width + padding_pixels},
                    "fields": "pixelSize",
                }
            }
            for col_idx, width in zip(range(end_col), current_widths)
        ]

        if padding_requests:
            self.service.spreadsheets().batchUpdate(
                spreadsheetId=spreadsheet_id, body={"requests": padding_requests}
            ).execute()
            self._logger.debug(f"Added {padding_pixels}px padding to {end_col} columns in sheet '{sheet_name}'")

    except Exception as err:
        self._logger.warning(f"Could not add column padding: {err}")
|
|
1193
|
+
|
|
1194
|
+
def format_columns_as_percent(
|
|
1195
|
+
self,
|
|
1196
|
+
spreadsheet_id: str = None,
|
|
1197
|
+
columns: list[str | int] = None,
|
|
1198
|
+
sheet_name: str = None,
|
|
1199
|
+
pattern: str = "0.0%",
|
|
1200
|
+
has_header: bool = True,
|
|
1201
|
+
) -> dict:
|
|
1202
|
+
"""
|
|
1203
|
+
Apply percent number formatting to one or more columns in a sheet.
|
|
1204
|
+
|
|
1205
|
+
Values should be written as raw ratios (e.g. 0.143, not "14.3%"). Users
|
|
1206
|
+
will see "14.3%" in the UI while sorting and filtering remain numeric.
|
|
1207
|
+
|
|
1208
|
+
Parameters
|
|
1209
|
+
----------
|
|
1210
|
+
spreadsheet_id : str
|
|
1211
|
+
The ID of the spreadsheet to format.
|
|
1212
|
+
columns : list[str | int]
|
|
1213
|
+
Columns to format. Each entry is either a column name (matched
|
|
1214
|
+
against the header row) or a 0-based column index.
|
|
1215
|
+
sheet_name : str, optional
|
|
1216
|
+
The name of the sheet. Defaults to "Sheet1".
|
|
1217
|
+
pattern : str, optional
|
|
1218
|
+
Google Sheets number format pattern. Default is "0.0%".
|
|
1219
|
+
Examples: "0%", "0.00%", "0.0%;[red]-0.0%".
|
|
1220
|
+
has_header : bool, optional
|
|
1221
|
+
If True (default), the first row is treated as a header and left
|
|
1222
|
+
unformatted; formatting starts at row 2. If False, formatting
|
|
1223
|
+
starts at row 1.
|
|
1224
|
+
|
|
1225
|
+
Returns
|
|
1226
|
+
-------
|
|
1227
|
+
dict
|
|
1228
|
+
The batchUpdate API response.
|
|
1229
|
+
|
|
1230
|
+
Examples
|
|
1231
|
+
--------
|
|
1232
|
+
>>> gsheet = GSheet(credentials_path="creds.json")
|
|
1233
|
+
>>> gsheet.write_sheet(df, spreadsheet_id=sid, sheet_name="Summary")
|
|
1234
|
+
>>> gsheet.format_columns_as_percent(
|
|
1235
|
+
... spreadsheet_id=sid,
|
|
1236
|
+
... columns=["conversion_rate", "bounce_rate"],
|
|
1237
|
+
... sheet_name="Summary",
|
|
1238
|
+
... )
|
|
1239
|
+
"""
|
|
1240
|
+
return self._apply_number_format(
|
|
1241
|
+
spreadsheet_id=spreadsheet_id,
|
|
1242
|
+
columns=columns,
|
|
1243
|
+
number_format={"type": "PERCENT", "pattern": pattern},
|
|
1244
|
+
sheet_name=sheet_name,
|
|
1245
|
+
has_header=has_header,
|
|
1246
|
+
)
|
|
1247
|
+
|
|
1248
|
+
def format_columns_as_number(
|
|
1249
|
+
self,
|
|
1250
|
+
spreadsheet_id: str = None,
|
|
1251
|
+
columns: list[str | int] = None,
|
|
1252
|
+
sheet_name: str = None,
|
|
1253
|
+
pattern: str = "#,##0.00",
|
|
1254
|
+
has_header: bool = True,
|
|
1255
|
+
) -> dict:
|
|
1256
|
+
"""
|
|
1257
|
+
Apply numeric formatting (e.g. thousands separators) to one or more columns.
|
|
1258
|
+
|
|
1259
|
+
Values should be written as raw numbers (e.g. 6302320.01). With the
|
|
1260
|
+
default pattern, users will see "6,302,320.01" in the UI while sorting
|
|
1261
|
+
and filtering remain numeric.
|
|
1262
|
+
|
|
1263
|
+
Parameters
|
|
1264
|
+
----------
|
|
1265
|
+
spreadsheet_id : str
|
|
1266
|
+
The ID of the spreadsheet to format.
|
|
1267
|
+
columns : list[str | int]
|
|
1268
|
+
Columns to format. Each entry is either a column name (matched
|
|
1269
|
+
against the header row) or a 0-based column index.
|
|
1270
|
+
sheet_name : str, optional
|
|
1271
|
+
The name of the sheet. Defaults to "Sheet1".
|
|
1272
|
+
pattern : str, optional
|
|
1273
|
+
Google Sheets number format pattern. Default is "#,##0.00".
|
|
1274
|
+
Examples: "#,##0" (integer with thousands), "#,##0.00" (two decimals),
|
|
1275
|
+
"$#,##0.00" (currency-style prefix), "#,##0.00;[red]-#,##0.00".
|
|
1276
|
+
has_header : bool, optional
|
|
1277
|
+
If True (default), the first row is treated as a header and left
|
|
1278
|
+
unformatted; formatting starts at row 2. If False, formatting
|
|
1279
|
+
starts at row 1.
|
|
1280
|
+
|
|
1281
|
+
Returns
|
|
1282
|
+
-------
|
|
1283
|
+
dict
|
|
1284
|
+
The batchUpdate API response.
|
|
1285
|
+
|
|
1286
|
+
Examples
|
|
1287
|
+
--------
|
|
1288
|
+
>>> gsheet = GSheet(credentials_path="creds.json")
|
|
1289
|
+
>>> gsheet.write_sheet(df, spreadsheet_id=sid, sheet_name="Summary")
|
|
1290
|
+
>>> gsheet.format_columns_as_number(
|
|
1291
|
+
... spreadsheet_id=sid,
|
|
1292
|
+
... columns=["revenue", "cost"],
|
|
1293
|
+
... sheet_name="Summary",
|
|
1294
|
+
... pattern="#,##0.00",
|
|
1295
|
+
... )
|
|
1296
|
+
"""
|
|
1297
|
+
return self._apply_number_format(
|
|
1298
|
+
spreadsheet_id=spreadsheet_id,
|
|
1299
|
+
columns=columns,
|
|
1300
|
+
number_format={"type": "NUMBER", "pattern": pattern},
|
|
1301
|
+
sheet_name=sheet_name,
|
|
1302
|
+
has_header=has_header,
|
|
1303
|
+
)
|
|
1304
|
+
|
|
1305
|
+
def format_columns_as_date(
|
|
1306
|
+
self,
|
|
1307
|
+
spreadsheet_id: str = None,
|
|
1308
|
+
columns: list[str | int] = None,
|
|
1309
|
+
sheet_name: str = None,
|
|
1310
|
+
pattern: str = "yyyy-mm-dd",
|
|
1311
|
+
has_header: bool = True,
|
|
1312
|
+
include_time: bool = False,
|
|
1313
|
+
) -> dict:
|
|
1314
|
+
"""
|
|
1315
|
+
Apply date (or date-time) formatting to one or more columns.
|
|
1316
|
+
|
|
1317
|
+
Works with cells that Sheets has parsed as dates. When writing via
|
|
1318
|
+
``write_sheet``/``append_sheet`` with ``value_input_option="USER_ENTERED"``
|
|
1319
|
+
(the default), the helper ``_dataframe_to_values`` converts pandas
|
|
1320
|
+
datetime columns to ``"YYYY-MM-DD HH:MM:SS"`` strings, which Sheets
|
|
1321
|
+
parses back into serial date values. Applying this format controls
|
|
1322
|
+
how they are displayed while preserving sort/filter semantics.
|
|
1323
|
+
|
|
1324
|
+
Parameters
|
|
1325
|
+
----------
|
|
1326
|
+
spreadsheet_id : str
|
|
1327
|
+
The ID of the spreadsheet to format.
|
|
1328
|
+
columns : list[str | int]
|
|
1329
|
+
Columns to format. Each entry is either a column name (matched
|
|
1330
|
+
against the header row) or a 0-based column index.
|
|
1331
|
+
sheet_name : str, optional
|
|
1332
|
+
The name of the sheet. Defaults to "Sheet1".
|
|
1333
|
+
pattern : str, optional
|
|
1334
|
+
Google Sheets date/time format pattern. Default is "yyyy-mm-dd".
|
|
1335
|
+
Examples: "yyyy-mm-dd", "dd/mm/yyyy", "mmm d, yyyy",
|
|
1336
|
+
"yyyy-mm-dd hh:mm:ss".
|
|
1337
|
+
has_header : bool, optional
|
|
1338
|
+
If True (default), the first row is treated as a header and left
|
|
1339
|
+
unformatted; formatting starts at row 2. If False, formatting
|
|
1340
|
+
starts at row 1.
|
|
1341
|
+
include_time : bool, optional
|
|
1342
|
+
If True, use the DATE_TIME number format type (so the cell is
|
|
1343
|
+
treated as a timestamp). If False (default), use the DATE type.
|
|
1344
|
+
The ``pattern`` still controls the exact display either way.
|
|
1345
|
+
|
|
1346
|
+
Returns
|
|
1347
|
+
-------
|
|
1348
|
+
dict
|
|
1349
|
+
The batchUpdate API response.
|
|
1350
|
+
|
|
1351
|
+
Examples
|
|
1352
|
+
--------
|
|
1353
|
+
>>> gsheet = GSheet(credentials_path="creds.json")
|
|
1354
|
+
>>> gsheet.write_sheet(df, spreadsheet_id=sid, sheet_name="Summary")
|
|
1355
|
+
>>> gsheet.format_columns_as_date(
|
|
1356
|
+
... spreadsheet_id=sid,
|
|
1357
|
+
... columns=["created_at", "updated_at"],
|
|
1358
|
+
... sheet_name="Summary",
|
|
1359
|
+
... pattern="yyyy-mm-dd hh:mm:ss",
|
|
1360
|
+
... include_time=True,
|
|
1361
|
+
... )
|
|
1362
|
+
"""
|
|
1363
|
+
fmt_type = "DATE_TIME" if include_time else "DATE"
|
|
1364
|
+
return self._apply_number_format(
|
|
1365
|
+
spreadsheet_id=spreadsheet_id,
|
|
1366
|
+
columns=columns,
|
|
1367
|
+
number_format={"type": fmt_type, "pattern": pattern},
|
|
1368
|
+
sheet_name=sheet_name,
|
|
1369
|
+
has_header=has_header,
|
|
1370
|
+
)
|
|
1371
|
+
|
|
1372
|
+
def _apply_number_format(
    self,
    spreadsheet_id: str,
    columns: list[str | int],
    number_format: dict,
    sheet_name: str = None,
    has_header: bool = True,
) -> dict:
    """
    Apply a Google Sheets ``numberFormat`` dict to a set of columns.

    Shared implementation behind ``format_columns_as_percent``,
    ``format_columns_as_number`` and ``format_columns_as_date``.

    Parameters
    ----------
    spreadsheet_id : str
        The ID of the spreadsheet; run through ``_resolve_spreadsheet_id``.
    columns : list[str | int]
        Column names (looked up in the first row of the sheet) and/or
        0-based column indices. Booleans and any other type are rejected.
    number_format : dict
        Must contain ``"type"`` and ``"pattern"`` keys; sent as the
        ``numberFormat`` of each targeted cell.
    sheet_name : str, optional
        The name of the sheet. Defaults to "Sheet1".
    has_header : bool, optional
        If True (default) formatting starts at the second row, otherwise at
        the first.

    Returns
    -------
    dict
        The batchUpdate API response.

    Notes
    -----
    The header row is fetched at most once, and only when a column is given
    by name. One ``repeatCell`` request is produced per column and all of
    them are sent in a single ``batchUpdate``. Validation failures and API
    errors from the final update are reported via ``log_and_raise_error``.
    """
    spreadsheet_id = self._resolve_spreadsheet_id(spreadsheet_id)
    if spreadsheet_id is None:
        log_and_raise_error(
            self._logger,
            "No spreadsheet_id provided and no default set on the GSheet instance",
        )
    if columns is None:
        log_and_raise_error(self._logger, "'columns' is required for column formatting")

    target_sheet = sheet_name or "Sheet1"

    # Translate the sheet title into its numeric sheetId.
    metadata = self.get_spreadsheet_info(spreadsheet_id)
    sheet_id = next(
        (
            entry["properties"]["sheetId"]
            for entry in metadata.get("sheets", [])
            if entry["properties"]["title"] == target_sheet
        ),
        None,
    )
    if sheet_id is None:
        log_and_raise_error(
            self._logger,
            f"Sheet '{target_sheet}' not found in spreadsheet {spreadsheet_id}",
        )

    header_row = None  # lazily loaded on the first name-based column
    resolved_indices = []
    for column in columns:
        # bool is a subclass of int, so it has to be rejected explicitly.
        if isinstance(column, bool) or not isinstance(column, int | str):
            log_and_raise_error(self._logger, f"Invalid column identifier: {column!r}")
        if isinstance(column, int):
            resolved_indices.append(column)
            continue
        if header_row is None:
            quoted_sheet = self._format_sheet_name(target_sheet)
            header_response = (
                self.service.spreadsheets()
                .values()
                .get(spreadsheetId=spreadsheet_id, range=f"{quoted_sheet}!1:1")
                .execute()
            )
            fetched_rows = header_response.get("values", [])
            header_row = fetched_rows[0] if fetched_rows else []
        try:
            resolved_indices.append(header_row.index(column))
        except ValueError:
            log_and_raise_error(
                self._logger,
                f"Column '{column}' not found in header row of sheet '{target_sheet}'",
            )

    # Row indices are 0-based: skip row 0 when it holds the header.
    first_row = 1 if has_header else 0
    requests = []
    for column_index in resolved_indices:
        cell_range = {
            "sheetId": sheet_id,
            "startRowIndex": first_row,
            "startColumnIndex": column_index,
            "endColumnIndex": column_index + 1,
        }
        requests.append(
            {
                "repeatCell": {
                    "range": cell_range,
                    "cell": {"userEnteredFormat": {"numberFormat": number_format}},
                    "fields": "userEnteredFormat.numberFormat",
                }
            }
        )

    try:
        update_call = self.service.spreadsheets().batchUpdate(
            spreadsheetId=spreadsheet_id, body={"requests": requests}
        )
        response = update_call.execute()
        self._logger.info(
            f"Applied {number_format['type']} format '{number_format['pattern']}' "
            f"to {len(requests)} column(s) in sheet '{target_sheet}'"
        )
        return response

    except HttpError as http_err:
        log_and_raise_error(
            self._logger,
            f"HTTP error applying number format: {http_err}",
        )
    except Exception as err:
        log_and_raise_error(
            self._logger,
            f"Error applying number format: {err}",
        )
|
|
1474
|
+
|
|
1475
|
+
def get_spreadsheet_info(self, spreadsheet_id: str = None) -> dict:
    """
    Fetch the metadata of a spreadsheet.

    Parameters
    ----------
    spreadsheet_id : str, optional
        The ID of the spreadsheet. When omitted, whatever
        ``_resolve_spreadsheet_id`` returns (the instance default) is used.

    Returns
    -------
    dict
        The response of the ``spreadsheets().get`` API call (spreadsheet
        metadata including sheets, properties, etc.).

    Notes
    -----
    A missing spreadsheet ID and API failures are reported through
    ``log_and_raise_error``.

    Examples
    --------
    >>> gsheet = GSheet(credentials_path="creds.json")
    >>> info = gsheet.get_spreadsheet_info("1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms")
    >>> print(info['properties']['title'])
    """
    target_id = self._resolve_spreadsheet_id(spreadsheet_id)
    if target_id is None:
        log_and_raise_error(
            self._logger,
            "No spreadsheet_id provided and no default set on the GSheet instance",
        )
    try:
        get_call = self.service.spreadsheets().get(spreadsheetId=target_id)
        return get_call.execute()

    except HttpError as http_err:
        log_and_raise_error(
            self._logger,
            f"HTTP error getting spreadsheet info: {http_err}",
        )
    except Exception as err:
        log_and_raise_error(
            self._logger,
            f"Error getting spreadsheet info: {err}",
        )
|
|
1517
|
+
|
|
1518
|
+
def to_csv(self, spreadsheet_id: str = None, range_name: str = None, sheet_name: str = None) -> str:
    """
    Read a Google Sheet and return its contents as CSV text.

    The data is loaded through ``read_sheet`` with ``return_as="dataframe"``
    and serialised with ``DataFrame.to_csv(index=False)``, so the index is
    not part of the output.

    Parameters
    ----------
    spreadsheet_id : str, optional
        The ID of the spreadsheet to read from. Passed through to
        ``read_sheet`` unchanged.
    range_name : str, optional
        The A1 notation range to read.
    sheet_name : str, optional
        The name of the sheet to read from.

    Returns
    -------
    str
        CSV formatted string.

    Examples
    --------
    >>> gsheet = GSheet(credentials_path="creds.json")
    >>> csv_data = gsheet.to_csv("1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms")
    """
    frame = self.read_sheet(spreadsheet_id, range_name, sheet_name, return_as="dataframe")
    csv_text = frame.to_csv(index=False)
    return csv_text
|
|
1544
|
+
|
|
1545
|
+
def gsheet_to_s3(
    self,
    spreadsheet_id: str = None,
    file_name: str = None,
    directory: str = None,
    range_name: str = None,
    sheet_name: str = None,
    file_format: str = "csv",
    s3_connector=None,
    bucket: str = None,
) -> None:
    """
    Transfer data from a Google Sheet to S3 as a CSV or Parquet object.

    All cheap argument checks (``file_name``, ``file_format``, bucket /
    connector presence) run before the sheet is read, so a bad call fails
    without touching Google Sheets or AWS.

    Parameters
    ----------
    spreadsheet_id : str, optional
        The ID of the spreadsheet to read from. Falls back to the instance
        default set via ``GSheet(spreadsheet_id=...)`` when omitted
        (resolved by ``read_sheet``).
    file_name : str
        The name of the file. The extension matching ``file_format`` is
        appended unless the name already ends with it.
    directory : str, optional
        The directory path (key prefix below the connector's ``s3_root``)
        where the file will be saved.
    range_name : str, optional
        The A1 notation range to read.
    sheet_name : str, optional
        The name of the sheet to read from.
    file_format : str, optional
        File format to save: 'csv' or 'parquet' (case-insensitive).
        Default is 'csv'.
    s3_connector : S3Connector, optional
        Existing S3Connector instance. If provided, the ``bucket``
        parameter is ignored and the bucket is taken from the connector.
    bucket : str, optional
        S3 bucket name. Only used if ``s3_connector`` is None, in which
        case a new ``S3Connector(bucket=bucket, auto_sso_login=True)`` is
        created.

    Raises
    ------
    Exception
        Via ``log_and_raise_error`` when ``file_name`` is missing, when
        ``file_format`` is not 'csv' or 'parquet', or when neither
        ``s3_connector`` nor ``bucket`` is given. The concrete exception
        type is decided by that helper (not visible in this block).

    Examples
    --------
    >>> gsheet = GSheet(credentials_path="creds.json")
    >>> s3 = S3Connector(bucket="my-bucket", s3_root="my-project")
    >>> gsheet.gsheet_to_s3(
    ...     spreadsheet_id="1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms",
    ...     s3_connector=s3,
    ...     directory="data",
    ...     file_name="output"
    ... )
    """
    from .s3_connector import S3Connector

    if file_name is None:
        log_and_raise_error(self._logger, "'file_name' is required for gsheet_to_s3")

    # Validate the format before any network I/O. ``str()`` keeps a
    # non-string value (e.g. None) on the "unsupported format" path instead
    # of crashing with AttributeError on ``.lower()``.
    normalized_format = str(file_format).lower()
    if normalized_format not in ("csv", "parquet"):
        log_and_raise_error(
            self._logger,
            f"Unsupported file format: {file_format}. Use 'csv' or 'parquet'.",
        )

    # Same fail-fast reasoning: no point reading the sheet when there is
    # nowhere to upload it.
    if s3_connector is None and bucket is None:
        log_and_raise_error(
            self._logger,
            "Either 's3_connector' or 'bucket' parameter must be provided.",
        )

    # Read data from Google Sheet (read_sheet resolves the spreadsheet_id)
    df = self.read_sheet(spreadsheet_id, range_name, sheet_name, return_as="dataframe")

    # Additional cleaning for S3 transfer: blank out missing and infinite values
    df = df.fillna("")
    df = df.replace([float("inf"), float("-inf")], "")

    # Initialize S3 connector only now, so the SSO login is not triggered
    # when reading the sheet fails.
    if s3_connector is None:
        s3_connector = S3Connector(bucket=bucket, auto_sso_login=True)

    # Bucket and key prefix always come from the connector
    target_bucket = s3_connector.bucket
    s3_root = s3_connector.s3_root

    # Add file extension if not present
    file_extension = f".{normalized_format}"
    if file_name.endswith(file_extension):
        file_name_with_ext = file_name
    else:
        file_name_with_ext = f"{file_name}{file_extension}"

    # Construct full S3 key from s3_root, directory and file name. Parts are
    # filtered again AFTER stripping slashes so that a part such as "/" does
    # not leave an empty segment (and thus "//") in the key.
    stripped_parts = (
        part.strip("/") for part in (s3_root, directory, file_name_with_ext) if part
    )
    full_s3_key = "/".join(segment for segment in stripped_parts if segment)

    # Serialize to the requested format (already validated above)
    if normalized_format == "csv":
        buffer = io.StringIO()
        df.to_csv(buffer, index=False)
    else:
        buffer = io.BytesIO()
        df.to_parquet(buffer, index=False)
    body = buffer.getvalue()

    # Upload to S3
    s3_connector.s3.put_object(Bucket=target_bucket, Key=full_s3_key, Body=body)
    self._logger.info(f"Successfully transferred data to s3://{target_bucket}/{full_s3_key}")
|