airbyte-source-google-sheets 0.8.5__py3-none-any.whl → 0.9.0__py3-none-any.whl

@@ -1,279 +1,22 @@
  #
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
  #

+ from typing import Any, Mapping, Optional

- import json
- import logging
- import socket
- from typing import Any, Generator, List, Mapping, MutableMapping, Optional, Union
+ from airbyte_cdk.models import ConfiguredAirbyteCatalog
+ from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource
+ from airbyte_cdk.sources.source import TState

- from apiclient import errors
- from google.auth import exceptions as google_exceptions
- from requests.status_codes import codes as status_codes

- from airbyte_cdk.models import FailureType
- from airbyte_cdk.models.airbyte_protocol import (
-     AirbyteCatalog,
-     AirbyteConnectionStatus,
-     AirbyteMessage,
-     AirbyteStateMessage,
-     AirbyteStreamStatus,
-     ConfiguredAirbyteCatalog,
-     Status,
-     Type,
- )
- from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
- from airbyte_cdk.sources.source import Source
- from airbyte_cdk.sources.streams.checkpoint import FullRefreshCheckpointReader
- from airbyte_cdk.utils import AirbyteTracedException
- from airbyte_cdk.utils.stream_status_utils import as_airbyte_message
+ """
+ This file provides the necessary constructs to interpret a provided declarative YAML configuration file into
+ source connector.
+ WARNING: Do not modify this file.
+ """

- from .client import GoogleSheetsClient
- from .helpers import Helpers
- from .models.spreadsheet import Spreadsheet
- from .models.spreadsheet_values import SpreadsheetValues
- from .utils import exception_description_by_status_code, safe_name_conversion

-
- # override default socket timeout to be 10 mins instead of 60 sec.
- # on behalf of https://github.com/airbytehq/oncall/issues/242
- DEFAULT_SOCKET_TIMEOUT: int = 600
- socket.setdefaulttimeout(DEFAULT_SOCKET_TIMEOUT)
-
-
- class SourceGoogleSheets(Source):
-     """
-     Spreadsheets API Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets
-     """
-
-     def check(self, logger: logging.Logger, config: json) -> AirbyteConnectionStatus:
-         # Check involves verifying that the specified spreadsheet is reachable with our credentials.
-         try:
-             client = GoogleSheetsClient(self.get_credentials(config))
-         except Exception as e:
-             return AirbyteConnectionStatus(status=Status.FAILED, message=f"Please use valid credentials json file. Error: {e}")
-
-         spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-
-         try:
-             spreadsheet = client.get(spreadsheetId=spreadsheet_id, includeGridData=False)
-         except errors.HttpError as err:
-             message = "Config error: "
-             # Give a clearer message if it's a common error like 404.
-             if err.resp.status == status_codes.NOT_FOUND:
-                 message += "The spreadsheet link is not valid. Enter the URL of the Google spreadsheet you want to sync."
-             raise AirbyteTracedException(
-                 message=message,
-                 internal_message=message,
-                 failure_type=FailureType.config_error,
-             ) from err
-         except google_exceptions.GoogleAuthError as err:
-             message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
-             raise AirbyteTracedException(
-                 message=message,
-                 internal_message=message,
-                 failure_type=FailureType.config_error,
-             ) from err
-
-         # Check for duplicate headers
-         spreadsheet_metadata = Spreadsheet.parse_obj(spreadsheet)
-         grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
-
-         duplicate_headers_in_sheet = {}
-         for sheet_name in grid_sheets:
-             try:
-                 header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
-                 if config.get("names_conversion"):
-                     header_row_data = [safe_name_conversion(h) for h in header_row_data]
-                 _, duplicate_headers = Helpers.get_valid_headers_and_duplicates(header_row_data)
-                 if duplicate_headers:
-                     duplicate_headers_in_sheet[sheet_name] = duplicate_headers
-             except Exception as err:
-                 if str(err).startswith("Expected data for exactly one row for sheet"):
-                     logger.warn(f"Skip empty sheet: {sheet_name}")
-                 else:
-                     logger.error(str(err))
-                     return AirbyteConnectionStatus(
-                         status=Status.FAILED, message=f"Unable to read the schema of sheet {sheet_name}. Error: {str(err)}"
-                     )
-         if duplicate_headers_in_sheet:
-             duplicate_headers_error_message = ", ".join(
-                 [
-                     f"[sheet:{sheet_name}, headers:{duplicate_sheet_headers}]"
-                     for sheet_name, duplicate_sheet_headers in duplicate_headers_in_sheet.items()
-                 ]
-             )
-             return AirbyteConnectionStatus(
-                 status=Status.FAILED,
-                 message="The following duplicate headers were found in the following sheets. Please fix them to continue: "
-                 + duplicate_headers_error_message,
-             )
-
-         return AirbyteConnectionStatus(status=Status.SUCCEEDED)
-
-     def discover(self, logger: logging.Logger, config: json) -> AirbyteCatalog:
-         client = GoogleSheetsClient(self.get_credentials(config))
-         spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-         try:
-             logger.info(f"Running discovery on sheet {spreadsheet_id}")
-             spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
-             grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
-             streams = []
-             for sheet_name in grid_sheets:
-                 try:
-                     header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
-                     if config.get("names_conversion"):
-                         header_row_data = [safe_name_conversion(h) for h in header_row_data]
-                     stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_row_data)
-                     streams.append(stream)
-                 except Exception as err:
-                     if str(err).startswith("Expected data for exactly one row for sheet"):
-                         logger.warn(f"Skip empty sheet: {sheet_name}")
-                     else:
-                         logger.error(str(err))
-             return AirbyteCatalog(streams=streams)
-
-         except errors.HttpError as err:
-             error_description = exception_description_by_status_code(err.resp.status, spreadsheet_id)
-             config_error_status_codes = [status_codes.NOT_FOUND, status_codes.FORBIDDEN]
-             if err.resp.status in config_error_status_codes:
-                 message = f"{error_description}. {err.reason}."
-                 raise AirbyteTracedException(
-                     message=message,
-                     internal_message=message,
-                     failure_type=FailureType.config_error,
-                 ) from err
-             raise Exception(f"Could not discover the schema of your spreadsheet. {error_description}. {err.reason}.")
-         except google_exceptions.GoogleAuthError as err:
-             message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
-             raise AirbyteTracedException(
-                 message=message,
-                 internal_message=message,
-                 failure_type=FailureType.config_error,
-             ) from err
-
-     def _read(
-         self,
-         logger: logging.Logger,
-         config: json,
-         catalog: ConfiguredAirbyteCatalog,
-         state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
-     ) -> Generator[AirbyteMessage, None, None]:
-         client = GoogleSheetsClient(self.get_credentials(config))
-         client.Backoff.row_batch_size = config.get("batch_size", 200)
-
-         sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
-         stream_instances = {s.stream.name: s.stream for s in catalog.streams}
-         state_manager = ConnectorStateManager(stream_instance_map=stream_instances, state=state or {})
-         spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-
-         logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
-         # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
-         # a blank row, emit the row batch
-         sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
-             client, spreadsheet_id, sheet_to_column_name, config.get("names_conversion")
-         )
-         sheet_row_counts = Helpers.get_sheet_row_count(client, spreadsheet_id)
-         logger.info(f"Row counts: {sheet_row_counts}")
-         for sheet in sheet_to_column_index_to_name.keys():
-             logger.info(f"Syncing sheet {sheet}")
-             stream = stream_instances.get(sheet)
-             yield as_airbyte_message(stream, AirbyteStreamStatus.STARTED)
-             checkpoint_reader = FullRefreshCheckpointReader([])
-             _ = checkpoint_reader.next()
-             # We revalidate the sheet here to avoid errors in case the sheet was changed after the sync started
-             is_valid, reason = Helpers.check_sheet_is_valid(client, spreadsheet_id, sheet)
-             if not is_valid:
-                 logger.info(f"Skipping syncing sheet {sheet}: {reason}")
-                 yield self._checkpoint_state(checkpoint_reader.get_checkpoint(), state_manager, sheet, None)
-                 yield as_airbyte_message(stream, AirbyteStreamStatus.INCOMPLETE)
-                 continue
-
-             column_index_to_name = sheet_to_column_index_to_name[sheet]
-             row_cursor = 2  # we start syncing past the header row
-             # For the loop, it is necessary that the initial row exists when we send a request to the API,
-             # if the last row of the interval goes outside the sheet - this is normal, we will return
-             # only the real data of the sheet and in the next iteration we will loop out.
-             while row_cursor <= sheet_row_counts[sheet]:
-                 row_batch = SpreadsheetValues.parse_obj(
-                     client.get_values(
-                         sheet=sheet,
-                         row_cursor=row_cursor,
-                         spreadsheetId=spreadsheet_id,
-                         majorDimension="ROWS",
-                     )
-                 )
-
-                 row_cursor += client.Backoff.row_batch_size + 1
-                 # there should always be one range since we requested only one
-                 value_ranges = row_batch.valueRanges[0]
-
-                 if not value_ranges.values:
-                     break
-
-                 row_values = value_ranges.values
-                 if len(row_values) == 0:
-                     break
-
-                 yield as_airbyte_message(stream, AirbyteStreamStatus.RUNNING)
-                 for row in row_values:
-                     if not Helpers.is_row_empty(row) and Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
-                         yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
-
-             yield self._checkpoint_state(checkpoint_reader.get_checkpoint(), state_manager, sheet, None)
-             yield as_airbyte_message(stream, AirbyteStreamStatus.COMPLETE)
-
-     def _checkpoint_state(
-         self,
-         stream_state: Mapping[str, Any],
-         state_manager,
-         stream_name: str,
-         stream_namespace: Optional[str],
-     ) -> AirbyteMessage:
-         state_manager.update_state_for_stream(stream_name, stream_namespace, stream_state)
-         return state_manager.create_state_message(stream_name, stream_namespace)
-
-     def read(
-         self,
-         logger: logging.Logger,
-         config: json,
-         catalog: ConfiguredAirbyteCatalog,
-         state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
-     ) -> Generator[AirbyteMessage, None, None]:
-         spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-         try:
-             yield from self._read(logger, config, catalog, state)
-         except errors.HttpError as e:
-             error_description = exception_description_by_status_code(e.status_code, spreadsheet_id)
-
-             if e.status_code == status_codes.FORBIDDEN:
-                 raise AirbyteTracedException(
-                     message=f"Stopped syncing process. {error_description}",
-                     internal_message=error_description,
-                     failure_type=FailureType.config_error,
-                 ) from e
-             if e.status_code == status_codes.TOO_MANY_REQUESTS:
-                 raise AirbyteTracedException(
-                     message=f"Stopped syncing process due to rate limits. {error_description}",
-                     internal_message=error_description,
-                     failure_type=FailureType.transient_error,
-                 ) from e
-             else:
-                 logger.info(f"{e.status_code}: {e.reason}. {error_description}")
-                 raise AirbyteTracedException(
-                     message=f"Stopped syncing process. {error_description}",
-                     internal_message=error_description,
-                     failure_type=FailureType.transient_error,
-                 ) from e
-         finally:
-             logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
-
-     @staticmethod
-     def get_credentials(config):
-         # backward compatible with old style config
-         if config.get("credentials_json"):
-             credentials = {"auth_type": "Service", "service_account_info": config.get("credentials_json")}
-             return credentials
-
-         return config.get("credentials")
+ # Declarative Source
+ class SourceGoogleSheets(YamlDeclarativeSource):
+     def __init__(self, catalog: Optional[ConfiguredAirbyteCatalog], config: Optional[Mapping[str, Any]], state: TState, **kwargs):
+         super().__init__(catalog=catalog, config=config, state=state, **{"path_to_yaml": "manifest.yaml"})
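
Note on the new shape of the connector: `SourceGoogleSheets` is now a thin wrapper over the CDK's `YamlDeclarativeSource`, and the check/discover/read logic deleted above moves into the bundled `manifest.yaml`, which the CDK interprets at runtime. As a minimal, illustrative sketch of how such a declarative class is typically launched (the `None` arguments model a `spec`/`check` style invocation; this is an assumption for illustration, not the wheel's actual `run.py`):

```python
# Minimal sketch of launching the new declarative source via the standard
# Airbyte CDK entrypoint. Passing catalog=None, config=None, state=None is
# an assumption that suits spec/check invocations; read/discover commands
# would be given a real config and catalog on the command line.
import sys

from airbyte_cdk.entrypoint import launch

from source_google_sheets import SourceGoogleSheets


def run() -> None:
    source = SourceGoogleSheets(catalog=None, config=None, state=None)
    launch(source, sys.argv[1:])
```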
@@ -12,17 +12,15 @@ connectionSpecification:
        type: integer
        title: Row Batch Size
        description: >-
-         Default value is 200.
+         Default value is 1000000.
          An integer representing row batch size for each sent request to Google Sheets API.
-         Row batch size means how many rows are processed from the google sheet, for example default value 200
-         would process rows 1-201, then 201-401 and so on.
+         Row batch size means how many rows are processed from the google sheet, for example default value 1000000
+         would process rows 2-1000002, then 1000003-2000003 and so on.
          Based on <a href='https://developers.google.com/sheets/api/limits'>Google Sheets API limits documentation</a>,
          it is possible to send up to 300 requests per minute, but each individual request has to be processed under 180 seconds,
          otherwise the request returns a timeout error. In regards to this information, consider network speed and
          number of columns of the google sheet when deciding a batch_size value.
-         Default value should cover most of the cases, but if a google sheet has over 100,000 records or more,
-         consider increasing batch_size value.
-       default: 200
+       default: 1000000
      spreadsheet_id:
        type: string
        title: Spreadsheet Link
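
The window arithmetic in the new description can be sanity-checked in a few lines. This sketch mirrors the batching of the removed `_read` loop, which started at row 2 (past the header row), requested the range `{cursor}:{cursor + batch_size}`, and advanced the cursor by `batch_size + 1`:

```python
# Illustrative check of the batching described above: each request spans
# batch_size + 1 rows, so with the new default a single request covers
# rows 2-1000002, the next 1000003-2000003, and so on.
def row_windows(row_count: int, batch_size: int = 1_000_000):
    cursor = 2  # first data row, past the header
    while cursor <= row_count:
        yield cursor, cursor + batch_size
        cursor += batch_size + 1


print(list(row_windows(2_500_000)))
# [(2, 1000002), (1000003, 2000003), (2000004, 3000004)]
```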
@@ -1,5 +1,5 @@
  #
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
  #


@@ -1,14 +0,0 @@
- source_google_sheets/__init__.py,sha256=-aGVMRfrgWjYad3_cHofIptEEa5WMQzTvFD92HevQfw,73
- source_google_sheets/client.py,sha256=A2BynDswjJ4naSyfjJr9G8fAfGEqLlNXP5vlbAsI3_s,1917
- source_google_sheets/helpers.py,sha256=CbXNlEfC3sNMDTGNPb22DcalWoXfv7kAYs7LmNM76Ec,10644
- source_google_sheets/models/__init__.py,sha256=Z-4MTpxG5t2jGhXzs4PPoIOa83zw3jRnUDx0N9Puv3s,61
- source_google_sheets/models/spreadsheet.py,sha256=fsHREpPEN36wCzGdqgfJ2EVW40UDZ_lS863A4XT2pGo,1112
- source_google_sheets/models/spreadsheet_values.py,sha256=y8ytuTqwpziJ2ICl0xhlRWgjMkxTfxOalRd414PMHZM,440
- source_google_sheets/run.py,sha256=_f5-LNqMzBuHtCD1YoUBxnA0fszgqmdNGcN7y_AmXU0,237
- source_google_sheets/source.py,sha256=kuHugJM9VOEPsArkP4lo88nBrS8ipk1yAoUi7cVSYZY,13506
- source_google_sheets/spec.yaml,sha256=WrPdH2xLCdyM-kY-pRqbwICcNPhv8nqnb2gdbslTsaQ,5141
- source_google_sheets/utils.py,sha256=DI53ARcKln77ekvuzsb3x35O9aMgZ_9OY9ets0FtI24,2290
- airbyte_source_google_sheets-0.8.5.dist-info/METADATA,sha256=LkVsOKPv_UsgFioRkUDxOiDG8kO6jFF_F7MYl3zkXy8,5551
- airbyte_source_google_sheets-0.8.5.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
- airbyte_source_google_sheets-0.8.5.dist-info/entry_points.txt,sha256=Dtsfjohe5IPUFyqojk49SIoP7CifCTlNLG_pgivzppo,69
- airbyte_source_google_sheets-0.8.5.dist-info/RECORD,,
@@ -1,49 +0,0 @@
- #
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
- #
-
- import logging
- from typing import Dict, List
-
- import backoff
- from googleapiclient import errors
- from requests import codes as status_codes
-
- from .helpers import SCOPES, Helpers
-
-
- logger = logging.getLogger("airbyte")
-
-
- class GoogleSheetsClient:
-     class Backoff:
-         row_batch_size = 200
-
-         @classmethod
-         def increase_row_batch_size(cls, details):
-             if details["exception"].status_code == status_codes.TOO_MANY_REQUESTS and cls.row_batch_size < 1000:
-                 cls.row_batch_size = cls.row_batch_size + 100
-                 logger.info(f"Increasing number of records fetching due to rate limits. Current value: {cls.row_batch_size}")
-
-         @staticmethod
-         def give_up(error):
-             code = error.resp.status
-             # Stop retrying if it's not a problem with the rate limit or on the server end
-             return not (code == status_codes.TOO_MANY_REQUESTS or 500 <= code < 600)
-
-     def __init__(self, credentials: Dict[str, str], scopes: List[str] = SCOPES):
-         self.client = Helpers.get_authenticated_sheets_client(credentials, scopes)
-
-     @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
-     def get(self, **kwargs):
-         return self.client.get(**kwargs).execute()
-
-     @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
-     def get_values(self, **kwargs):
-         range = self._create_range(kwargs.pop("sheet"), kwargs.pop("row_cursor"))
-         logger.info(f"Fetching range {range}")
-         return self.client.values().batchGet(ranges=range, **kwargs).execute()
-
-     def _create_range(self, sheet, row_cursor):
-         range = f"{sheet}!{row_cursor}:{row_cursor + self.Backoff.row_batch_size}"
-         return range
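
For reference, the retry pattern the deleted client relied on is the stock `backoff` decorator: exponential backoff on `HttpError`, giving up unless the failure is a rate limit (429) or a 5xx. A self-contained sketch under those assumptions (`fetch` and `_give_up` are illustrative names, not part of the wheel):

```python
# Rough sketch of the retry policy shown in the removed client: retry with
# exponential backoff for up to 120 seconds, but only for 429s and 5xx
# responses; any other HttpError is raised immediately.
import backoff
from googleapiclient import errors


def _give_up(error: errors.HttpError) -> bool:
    code = error.resp.status
    # Returning True stops retrying: anything except rate limits / server errors.
    return not (code == 429 or 500 <= code < 600)


@backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=_give_up)
def fetch(request):
    # `request` is any googleapiclient HttpRequest, e.g. spreadsheets().get(...)
    return request.execute()
```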
@@ -1,234 +0,0 @@
- #
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
- #
-
- import json
- import logging
- import re
- from collections import defaultdict
- from datetime import datetime
- from typing import Dict, FrozenSet, Iterable, List, Tuple
-
- from google.oauth2 import credentials as client_account
- from google.oauth2 import service_account
- from googleapiclient import discovery
-
- from airbyte_cdk.models.airbyte_protocol import AirbyteRecordMessage, AirbyteStream, ConfiguredAirbyteCatalog, SyncMode
-
- from .models.spreadsheet import RowData, Spreadsheet
- from .utils import safe_name_conversion
-
-
- SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly", "https://www.googleapis.com/auth/drive.readonly"]
-
- logger = logging.getLogger("airbyte")
-
-
- class Helpers(object):
-     @staticmethod
-     def get_authenticated_sheets_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
-         creds = Helpers.get_authenticated_google_credentials(credentials, scopes)
-         return discovery.build("sheets", "v4", credentials=creds).spreadsheets()
-
-     @staticmethod
-     def get_authenticated_drive_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
-         creds = Helpers.get_authenticated_google_credentials(credentials, scopes)
-         return discovery.build("drive", "v3", credentials=creds)
-
-     @staticmethod
-     def get_authenticated_google_credentials(credentials: Dict[str, str], scopes: List[str] = SCOPES):
-         auth_type = credentials.pop("auth_type")
-         if auth_type == "Service":
-             return service_account.Credentials.from_service_account_info(json.loads(credentials["service_account_info"]), scopes=scopes)
-         elif auth_type == "Client":
-             return client_account.Credentials.from_authorized_user_info(info=credentials)
-
-     @staticmethod
-     def headers_to_airbyte_stream(logger: logging.Logger, sheet_name: str, header_row_values: List[str]) -> AirbyteStream:
-         """
-         Parses sheet headers from the provided row. This method assumes that data is contiguous
-         i.e: every cell contains a value and the first cell which does not contain a value denotes the end
-         of the headers. For example, if the first row contains "One | Two | | Three" then this method
-         will parse the headers as ["One", "Two"]. This assumption is made for simplicity and can be modified later.
-         """
-         fields, duplicate_fields = Helpers.get_valid_headers_and_duplicates(header_row_values)
-         if duplicate_fields:
-             logger.warn(f"Duplicate headers found in {sheet_name}. Ignoring them: {duplicate_fields}")
-
-         sheet_json_schema = {
-             "$schema": "http://json-schema.org/draft-07/schema#",
-             "type": "object",
-             # For simplicity, the type of every cell is a string
-             "properties": {field: {"type": "string"} for field in fields},
-         }
-
-         return AirbyteStream(name=sheet_name, json_schema=sheet_json_schema, supported_sync_modes=[SyncMode.full_refresh])
-
-     @staticmethod
-     def get_valid_headers_and_duplicates(header_row_values: List[str]) -> (List[str], List[str]):
-         fields = []
-         duplicate_fields = set()
-         for cell_value in header_row_values:
-             if cell_value:
-                 if cell_value in fields:
-                     duplicate_fields.add(cell_value)
-                 else:
-                     fields.append(cell_value)
-             else:
-                 break
-
-         # Removing all duplicate fields
-         if duplicate_fields:
-             fields = [field for field in fields if field not in duplicate_fields]
-
-         return fields, list(duplicate_fields)
-
-     @staticmethod
-     def get_formatted_row_values(row_data: RowData) -> List[str]:
-         """
-         Gets the formatted values of all cell data in this row. A formatted value is the final value a user sees in a spreadsheet.
-         It can be a raw string input by the user, or the result of a sheets function call.
-         """
-         return [value.formattedValue for value in row_data.values]
-
-     @staticmethod
-     def get_first_row(client, spreadsheet_id: str, sheet_name: str) -> List[str]:
-         spreadsheet = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=True, ranges=f"{sheet_name}!1:1"))
-
-         # There is only one sheet since we are specifying the sheet in the requested ranges.
-         returned_sheets = spreadsheet.sheets
-         if len(returned_sheets) != 1:
-             raise Exception(f"Unexpected return result: Sheet {sheet_name} was expected to contain data on exactly 1 sheet. ")
-
-         range_data = returned_sheets[0].data
-         if len(range_data) != 1:
-             raise Exception(f"Expected data for exactly one range for sheet {sheet_name}")
-
-         all_row_data = range_data[0].rowData
-         if not all_row_data:
-             # the sheet is empty
-             logger.warning(f"The sheet {sheet_name} (ID {spreadsheet_id}) is empty!")
-             return []
-
-         if len(all_row_data) != 1:
-             raise Exception(f"Expected data for exactly one row for sheet {sheet_name}")
-
-         first_row_data = all_row_data[0]
-
-         return Helpers.get_formatted_row_values(first_row_data)
-
-     @staticmethod
-     def parse_sheet_and_column_names_from_catalog(catalog: ConfiguredAirbyteCatalog) -> Dict[str, FrozenSet[str]]:
-         sheet_to_column_name = {}
-         for configured_stream in catalog.streams:
-             stream = configured_stream.stream
-             sheet_name = stream.name
-             sheet_to_column_name[sheet_name] = frozenset(stream.json_schema["properties"].keys())
-
-         return sheet_to_column_name
-
-     @staticmethod
-     def row_data_to_record_message(sheet_name: str, cell_values: List[str], column_index_to_name: Dict[int, str]) -> AirbyteRecordMessage:
-         data = {}
-         for relevant_index in sorted(column_index_to_name.keys()):
-             if relevant_index >= len(cell_values):
-                 break
-
-             cell_value = cell_values[relevant_index]
-             if cell_value.strip() != "":
-                 data[column_index_to_name[relevant_index]] = cell_value
-
-         return AirbyteRecordMessage(stream=sheet_name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000)
-
-     @staticmethod
-     def get_available_sheets_to_column_index_to_name(
-         client, spreadsheet_id: str, requested_sheets_and_columns: Dict[str, FrozenSet[str]], names_conversion: bool = False
-     ) -> Dict[str, Dict[int, str]]:
-         available_sheets = Helpers.get_sheets_in_spreadsheet(client, spreadsheet_id)
-         logger.info(f"Available sheets: {available_sheets}")
-         available_sheets_to_column_index_to_name = defaultdict(dict)
-         for sheet, columns in requested_sheets_and_columns.items():
-             if sheet in available_sheets:
-                 first_row = Helpers.get_first_row(client, spreadsheet_id, sheet)
-                 if names_conversion:
-                     first_row = [safe_name_conversion(h) for h in first_row]
-                     # When performing names conversion, they won't match what is listed in catalog for the majority of cases,
-                     # so they should be cast here in order to have them in records
-                     columns = {safe_name_conversion(c) for c in columns}
-                 # Find the column index of each header value
-                 idx = 0
-                 for cell_value in first_row:
-                     if cell_value in columns:
-                         available_sheets_to_column_index_to_name[sheet][idx] = cell_value
-                     idx += 1
-         return available_sheets_to_column_index_to_name
-
-     @staticmethod
-     def get_sheets_in_spreadsheet(client, spreadsheet_id: str) -> List[str]:
-         spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
-         return [sheet.properties.title for sheet in spreadsheet_metadata.sheets]
-
-     @staticmethod
-     def get_sheet_row_count(client, spreadsheet_id: str) -> Dict[str, int]:
-         spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
-         # filter out sheets without gridProperties (like in diagram sheets)
-         data_sheets = [sheet for sheet in spreadsheet_metadata.sheets if hasattr(sheet.properties, "gridProperties")]
-         return {sheet.properties.title: sheet.properties.gridProperties["rowCount"] for sheet in data_sheets}
-
-     @staticmethod
-     def get_grid_sheets(spreadsheet_metadata) -> List[str]:
-         """Return grid only diagram, filter out sheets with image/diagram only
-
-         https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/sheets#sheetproperties
-         """
-         grid_sheets = []
-         non_grid_sheets = []
-         for sheet in spreadsheet_metadata.sheets:
-             sheet_title = sheet.properties.title
-             if (
-                 hasattr(sheet.properties, "gridProperties")
-                 and hasattr(sheet.properties, "sheetType")
-                 and sheet.properties.sheetType == "GRID"
-             ):
-                 grid_sheets.append(sheet_title)
-             else:
-                 non_grid_sheets.append(sheet_title)
-
-         if non_grid_sheets:
-             # logging.getLogger(...).log() expects an integer level. The level for WARN is 30
-             # Reference: https://docs.python.org/3.10/library/logging.html#levels
-             logging.getLogger("airbyte").log(30, "Skip non-grid sheets: " + ", ".join(non_grid_sheets))
-
-         return grid_sheets
-
-     @staticmethod
-     def is_row_empty(cell_values: List[str]) -> bool:
-         for cell in cell_values:
-             if cell.strip() != "":
-                 return False
-         return True
-
-     @staticmethod
-     def row_contains_relevant_data(cell_values: List[str], relevant_indices: Iterable[int]) -> bool:
-         for idx in relevant_indices:
-             if len(cell_values) > idx and cell_values[idx].strip() != "":
-                 return True
-         return False
-
-     @staticmethod
-     def get_spreadsheet_id(id_or_url: str) -> str:
-         if re.match(r"(https://)", id_or_url):
-             # This is a URL
-             m = re.search(r"(/)([-\w]{20,})([/]?)", id_or_url)
-             if m is not None and m.group(2):
-                 return m.group(2)
-         else:
-             return id_or_url
-
-     @staticmethod
-     def check_sheet_is_valid(client, spreadsheet_id: str, sheet_name: str) -> Tuple[bool, str]:
-         try:
-             Helpers.get_first_row(client, spreadsheet_id, sheet_name)
-             return True, ""
-         except Exception as e:
-             return False, str(e)
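
To make the removed header handling concrete, here is an illustrative run of the duplicate-header rule from `get_valid_headers_and_duplicates`: parsing stops at the first empty header cell, and any header seen more than once is dropped entirely (the sample values are invented):

```python
# Walk-through of the duplicate-header logic shown above, inlined for
# illustration. A blank cell ends the header row; duplicated names are
# removed from the resulting field list.
headers = ["id", "name", "id", "email", "", "ignored"]

fields, duplicates = [], set()
for cell in headers:
    if not cell:
        break  # first blank cell ends the header row
    if cell in fields:
        duplicates.add(cell)
    else:
        fields.append(cell)
fields = [f for f in fields if f not in duplicates]

print(fields)      # ['name', 'email']
print(duplicates)  # {'id'}
```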