airbyte-source-google-sheets 0.8.5__py3-none-any.whl → 0.9.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,279 +1,22 @@
  #
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
  #

+ from typing import Any, Mapping, Optional

- import json
- import logging
- import socket
- from typing import Any, Generator, List, Mapping, MutableMapping, Optional, Union
+ from airbyte_cdk.models import ConfiguredAirbyteCatalog
+ from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource
+ from airbyte_cdk.sources.source import TState

- from apiclient import errors
- from google.auth import exceptions as google_exceptions
- from requests.status_codes import codes as status_codes

- from airbyte_cdk.models import FailureType
- from airbyte_cdk.models.airbyte_protocol import (
-     AirbyteCatalog,
-     AirbyteConnectionStatus,
-     AirbyteMessage,
-     AirbyteStateMessage,
-     AirbyteStreamStatus,
-     ConfiguredAirbyteCatalog,
-     Status,
-     Type,
- )
- from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
- from airbyte_cdk.sources.source import Source
- from airbyte_cdk.sources.streams.checkpoint import FullRefreshCheckpointReader
- from airbyte_cdk.utils import AirbyteTracedException
- from airbyte_cdk.utils.stream_status_utils import as_airbyte_message
+ """
+ This file provides the necessary constructs to interpret a provided declarative YAML configuration file into
+ source connector.
+ WARNING: Do not modify this file.
+ """

- from .client import GoogleSheetsClient
- from .helpers import Helpers
- from .models.spreadsheet import Spreadsheet
- from .models.spreadsheet_values import SpreadsheetValues
- from .utils import exception_description_by_status_code, safe_name_conversion

-
- # override default socket timeout to be 10 mins instead of 60 sec.
- # on behalf of https://github.com/airbytehq/oncall/issues/242
- DEFAULT_SOCKET_TIMEOUT: int = 600
- socket.setdefaulttimeout(DEFAULT_SOCKET_TIMEOUT)
-
-
- class SourceGoogleSheets(Source):
-     """
-     Spreadsheets API Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets
-     """
-
-     def check(self, logger: logging.Logger, config: json) -> AirbyteConnectionStatus:
-         # Check involves verifying that the specified spreadsheet is reachable with our credentials.
-         try:
-             client = GoogleSheetsClient(self.get_credentials(config))
-         except Exception as e:
-             return AirbyteConnectionStatus(status=Status.FAILED, message=f"Please use valid credentials json file. Error: {e}")
-
-         spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-
-         try:
-             spreadsheet = client.get(spreadsheetId=spreadsheet_id, includeGridData=False)
-         except errors.HttpError as err:
-             message = "Config error: "
-             # Give a clearer message if it's a common error like 404.
-             if err.resp.status == status_codes.NOT_FOUND:
-                 message += "The spreadsheet link is not valid. Enter the URL of the Google spreadsheet you want to sync."
-             raise AirbyteTracedException(
-                 message=message,
-                 internal_message=message,
-                 failure_type=FailureType.config_error,
-             ) from err
-         except google_exceptions.GoogleAuthError as err:
-             message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
-             raise AirbyteTracedException(
-                 message=message,
-                 internal_message=message,
-                 failure_type=FailureType.config_error,
-             ) from err
-
-         # Check for duplicate headers
-         spreadsheet_metadata = Spreadsheet.parse_obj(spreadsheet)
-         grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
-
-         duplicate_headers_in_sheet = {}
-         for sheet_name in grid_sheets:
-             try:
-                 header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
-                 if config.get("names_conversion"):
-                     header_row_data = [safe_name_conversion(h) for h in header_row_data]
-                 _, duplicate_headers = Helpers.get_valid_headers_and_duplicates(header_row_data)
-                 if duplicate_headers:
-                     duplicate_headers_in_sheet[sheet_name] = duplicate_headers
-             except Exception as err:
-                 if str(err).startswith("Expected data for exactly one row for sheet"):
-                     logger.warn(f"Skip empty sheet: {sheet_name}")
-                 else:
-                     logger.error(str(err))
-                     return AirbyteConnectionStatus(
-                         status=Status.FAILED, message=f"Unable to read the schema of sheet {sheet_name}. Error: {str(err)}"
-                     )
-         if duplicate_headers_in_sheet:
-             duplicate_headers_error_message = ", ".join(
-                 [
-                     f"[sheet:{sheet_name}, headers:{duplicate_sheet_headers}]"
-                     for sheet_name, duplicate_sheet_headers in duplicate_headers_in_sheet.items()
-                 ]
-             )
-             return AirbyteConnectionStatus(
-                 status=Status.FAILED,
-                 message="The following duplicate headers were found in the following sheets. Please fix them to continue: "
-                 + duplicate_headers_error_message,
-             )
-
-         return AirbyteConnectionStatus(status=Status.SUCCEEDED)
-
-     def discover(self, logger: logging.Logger, config: json) -> AirbyteCatalog:
-         client = GoogleSheetsClient(self.get_credentials(config))
-         spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-         try:
-             logger.info(f"Running discovery on sheet {spreadsheet_id}")
-             spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
-             grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
-             streams = []
-             for sheet_name in grid_sheets:
-                 try:
-                     header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
-                     if config.get("names_conversion"):
-                         header_row_data = [safe_name_conversion(h) for h in header_row_data]
-                     stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_row_data)
-                     streams.append(stream)
-                 except Exception as err:
-                     if str(err).startswith("Expected data for exactly one row for sheet"):
-                         logger.warn(f"Skip empty sheet: {sheet_name}")
-                     else:
-                         logger.error(str(err))
-             return AirbyteCatalog(streams=streams)
-
-         except errors.HttpError as err:
-             error_description = exception_description_by_status_code(err.resp.status, spreadsheet_id)
-             config_error_status_codes = [status_codes.NOT_FOUND, status_codes.FORBIDDEN]
-             if err.resp.status in config_error_status_codes:
-                 message = f"{error_description}. {err.reason}."
-                 raise AirbyteTracedException(
-                     message=message,
-                     internal_message=message,
-                     failure_type=FailureType.config_error,
-                 ) from err
-             raise Exception(f"Could not discover the schema of your spreadsheet. {error_description}. {err.reason}.")
-         except google_exceptions.GoogleAuthError as err:
-             message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
-             raise AirbyteTracedException(
-                 message=message,
-                 internal_message=message,
-                 failure_type=FailureType.config_error,
-             ) from err
-
-     def _read(
-         self,
-         logger: logging.Logger,
-         config: json,
-         catalog: ConfiguredAirbyteCatalog,
-         state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
-     ) -> Generator[AirbyteMessage, None, None]:
-         client = GoogleSheetsClient(self.get_credentials(config))
-         client.Backoff.row_batch_size = config.get("batch_size", 200)
-
-         sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
-         stream_instances = {s.stream.name: s.stream for s in catalog.streams}
-         state_manager = ConnectorStateManager(stream_instance_map=stream_instances, state=state or {})
-         spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-
-         logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
-         # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
-         # a blank row, emit the row batch
-         sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
-             client, spreadsheet_id, sheet_to_column_name, config.get("names_conversion")
-         )
-         sheet_row_counts = Helpers.get_sheet_row_count(client, spreadsheet_id)
-         logger.info(f"Row counts: {sheet_row_counts}")
-         for sheet in sheet_to_column_index_to_name.keys():
-             logger.info(f"Syncing sheet {sheet}")
-             stream = stream_instances.get(sheet)
-             yield as_airbyte_message(stream, AirbyteStreamStatus.STARTED)
-             checkpoint_reader = FullRefreshCheckpointReader([])
-             _ = checkpoint_reader.next()
-             # We revalidate the sheet here to avoid errors in case the sheet was changed after the sync started
-             is_valid, reason = Helpers.check_sheet_is_valid(client, spreadsheet_id, sheet)
-             if not is_valid:
-                 logger.info(f"Skipping syncing sheet {sheet}: {reason}")
-                 yield self._checkpoint_state(checkpoint_reader.get_checkpoint(), state_manager, sheet, None)
-                 yield as_airbyte_message(stream, AirbyteStreamStatus.INCOMPLETE)
-                 continue
-
-             column_index_to_name = sheet_to_column_index_to_name[sheet]
-             row_cursor = 2  # we start syncing past the header row
-             # For the loop, it is necessary that the initial row exists when we send a request to the API,
-             # if the last row of the interval goes outside the sheet - this is normal, we will return
-             # only the real data of the sheet and in the next iteration we will loop out.
-             while row_cursor <= sheet_row_counts[sheet]:
-                 row_batch = SpreadsheetValues.parse_obj(
-                     client.get_values(
-                         sheet=sheet,
-                         row_cursor=row_cursor,
-                         spreadsheetId=spreadsheet_id,
-                         majorDimension="ROWS",
-                     )
-                 )
-
-                 row_cursor += client.Backoff.row_batch_size + 1
-                 # there should always be one range since we requested only one
-                 value_ranges = row_batch.valueRanges[0]
-
-                 if not value_ranges.values:
-                     break
-
-                 row_values = value_ranges.values
-                 if len(row_values) == 0:
-                     break
-
-                 yield as_airbyte_message(stream, AirbyteStreamStatus.RUNNING)
-                 for row in row_values:
-                     if not Helpers.is_row_empty(row) and Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
-                         yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
-
-             yield self._checkpoint_state(checkpoint_reader.get_checkpoint(), state_manager, sheet, None)
-             yield as_airbyte_message(stream, AirbyteStreamStatus.COMPLETE)
-
-     def _checkpoint_state(
-         self,
-         stream_state: Mapping[str, Any],
-         state_manager,
-         stream_name: str,
-         stream_namespace: Optional[str],
-     ) -> AirbyteMessage:
-         state_manager.update_state_for_stream(stream_name, stream_namespace, stream_state)
-         return state_manager.create_state_message(stream_name, stream_namespace)
-
-     def read(
-         self,
-         logger: logging.Logger,
-         config: json,
-         catalog: ConfiguredAirbyteCatalog,
-         state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
-     ) -> Generator[AirbyteMessage, None, None]:
-         spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-         try:
-             yield from self._read(logger, config, catalog, state)
-         except errors.HttpError as e:
-             error_description = exception_description_by_status_code(e.status_code, spreadsheet_id)
-
-             if e.status_code == status_codes.FORBIDDEN:
-                 raise AirbyteTracedException(
-                     message=f"Stopped syncing process. {error_description}",
-                     internal_message=error_description,
-                     failure_type=FailureType.config_error,
-                 ) from e
-             if e.status_code == status_codes.TOO_MANY_REQUESTS:
-                 raise AirbyteTracedException(
-                     message=f"Stopped syncing process due to rate limits. {error_description}",
-                     internal_message=error_description,
-                     failure_type=FailureType.transient_error,
-                 ) from e
-             else:
-                 logger.info(f"{e.status_code}: {e.reason}. {error_description}")
-                 raise AirbyteTracedException(
-                     message=f"Stopped syncing process. {error_description}",
-                     internal_message=error_description,
-                     failure_type=FailureType.transient_error,
-                 ) from e
-         finally:
-             logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
-
-     @staticmethod
-     def get_credentials(config):
-         # backward compatible with old style config
-         if config.get("credentials_json"):
-             credentials = {"auth_type": "Service", "service_account_info": config.get("credentials_json")}
-             return credentials
-
-         return config.get("credentials")
+ # Declarative Source
+ class SourceGoogleSheets(YamlDeclarativeSource):
+     def __init__(self, catalog: Optional[ConfiguredAirbyteCatalog], config: Optional[Mapping[str, Any]], state: TState, **kwargs):
+         super().__init__(catalog=catalog, config=config, state=state, **{"path_to_yaml": "manifest.yaml"})
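Note: the hunk above replaces the hand-written Source implementation with a manifest-driven YamlDeclarativeSource whose behavior lives in manifest.yaml. As a rough sketch (not part of this diff; the run() wiring shown here is a hypothetical illustration of the CDK's usual entrypoint pattern), a declarative source like this is typically launched as follows, with the None placeholders filled in from the CLI arguments by the entrypoint:

import sys

from airbyte_cdk.entrypoint import launch

from source_google_sheets import SourceGoogleSheets


def run() -> None:
    # catalog/config/state are parsed from the spec/check/discover/read
    # CLI invocation at runtime; None placeholders suffice for this sketch.
    source = SourceGoogleSheets(catalog=None, config=None, state=None)
    launch(source, sys.argv[1:])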
@@ -12,17 +12,15 @@ connectionSpecification:
        type: integer
        title: Row Batch Size
        description: >-
-         Default value is 200.
+         Default value is 1000000.
          An integer representing row batch size for each sent request to Google Sheets API.
-         Row batch size means how many rows are processed from the google sheet, for example default value 200
-         would process rows 1-201, then 201-401 and so on.
+         Row batch size means how many rows are processed from the google sheet, for example default value 1000000
+         would process rows 2-1000002, then 1000003-2000003 and so on.
          Based on <a href='https://developers.google.com/sheets/api/limits'>Google Sheets API limits documentation</a>,
          it is possible to send up to 300 requests per minute, but each individual request has to be processed under 180 seconds,
          otherwise the request returns a timeout error. In regards to this information, consider network speed and
          number of columns of the google sheet when deciding a batch_size value.
-         Default value should cover most of the cases, but if a google sheet has over 100,000 records or more,
-         consider increasing batch_size value.
-       default: 200
+       default: 1000000
      spreadsheet_id:
        type: string
        title: Spreadsheet Link
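The spec hunk above raises the default batch_size from 200 to 1,000,000 and rewrites the example row windows accordingly. A small sketch of that window arithmetic, assuming the same cursor stepping the 0.8.x client used (each request spans batch_size + 1 rows, starting below the header row):

batch_size = 1_000_000
row_cursor = 2  # row 1 is the header row
for _ in range(2):
    print(f"rows {row_cursor}-{row_cursor + batch_size}")
    row_cursor += batch_size + 1
# prints "rows 2-1000002" then "rows 1000003-2000003", matching the description text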
@@ -1,5 +1,5 @@
  #
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
  #


@@ -1,14 +0,0 @@
- source_google_sheets/__init__.py,sha256=-aGVMRfrgWjYad3_cHofIptEEa5WMQzTvFD92HevQfw,73
- source_google_sheets/client.py,sha256=A2BynDswjJ4naSyfjJr9G8fAfGEqLlNXP5vlbAsI3_s,1917
- source_google_sheets/helpers.py,sha256=CbXNlEfC3sNMDTGNPb22DcalWoXfv7kAYs7LmNM76Ec,10644
- source_google_sheets/models/__init__.py,sha256=Z-4MTpxG5t2jGhXzs4PPoIOa83zw3jRnUDx0N9Puv3s,61
- source_google_sheets/models/spreadsheet.py,sha256=fsHREpPEN36wCzGdqgfJ2EVW40UDZ_lS863A4XT2pGo,1112
- source_google_sheets/models/spreadsheet_values.py,sha256=y8ytuTqwpziJ2ICl0xhlRWgjMkxTfxOalRd414PMHZM,440
- source_google_sheets/run.py,sha256=_f5-LNqMzBuHtCD1YoUBxnA0fszgqmdNGcN7y_AmXU0,237
- source_google_sheets/source.py,sha256=kuHugJM9VOEPsArkP4lo88nBrS8ipk1yAoUi7cVSYZY,13506
- source_google_sheets/spec.yaml,sha256=WrPdH2xLCdyM-kY-pRqbwICcNPhv8nqnb2gdbslTsaQ,5141
- source_google_sheets/utils.py,sha256=DI53ARcKln77ekvuzsb3x35O9aMgZ_9OY9ets0FtI24,2290
- airbyte_source_google_sheets-0.8.5.dist-info/METADATA,sha256=LkVsOKPv_UsgFioRkUDxOiDG8kO6jFF_F7MYl3zkXy8,5551
- airbyte_source_google_sheets-0.8.5.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
- airbyte_source_google_sheets-0.8.5.dist-info/entry_points.txt,sha256=Dtsfjohe5IPUFyqojk49SIoP7CifCTlNLG_pgivzppo,69
- airbyte_source_google_sheets-0.8.5.dist-info/RECORD,,
@@ -1,49 +0,0 @@
- #
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
- #
-
- import logging
- from typing import Dict, List
-
- import backoff
- from googleapiclient import errors
- from requests import codes as status_codes
-
- from .helpers import SCOPES, Helpers
-
-
- logger = logging.getLogger("airbyte")
-
-
- class GoogleSheetsClient:
-     class Backoff:
-         row_batch_size = 200
-
-         @classmethod
-         def increase_row_batch_size(cls, details):
-             if details["exception"].status_code == status_codes.TOO_MANY_REQUESTS and cls.row_batch_size < 1000:
-                 cls.row_batch_size = cls.row_batch_size + 100
-                 logger.info(f"Increasing number of records fetching due to rate limits. Current value: {cls.row_batch_size}")
-
-         @staticmethod
-         def give_up(error):
-             code = error.resp.status
-             # Stop retrying if it's not a problem with the rate limit or on the server end
-             return not (code == status_codes.TOO_MANY_REQUESTS or 500 <= code < 600)
-
-     def __init__(self, credentials: Dict[str, str], scopes: List[str] = SCOPES):
-         self.client = Helpers.get_authenticated_sheets_client(credentials, scopes)
-
-     @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
-     def get(self, **kwargs):
-         return self.client.get(**kwargs).execute()
-
-     @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
-     def get_values(self, **kwargs):
-         range = self._create_range(kwargs.pop("sheet"), kwargs.pop("row_cursor"))
-         logger.info(f"Fetching range {range}")
-         return self.client.values().batchGet(ranges=range, **kwargs).execute()
-
-     def _create_range(self, sheet, row_cursor):
-         range = f"{sheet}!{row_cursor}:{row_cursor + self.Backoff.row_batch_size}"
-         return range
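For reference, the client removed above paged through a sheet with A1 ranges whose width tracked Backoff.row_batch_size, and each rate-limited retry widened the batch by 100 rows (up to 1000). A standalone sketch of that arithmetic, extracted from the code above for illustration:

row_batch_size = 200  # the 0.8.x default

def create_range(sheet: str, row_cursor: int) -> str:
    # Mirrors the removed GoogleSheetsClient._create_range
    return f"{sheet}!{row_cursor}:{row_cursor + row_batch_size}"

print(create_range("Sheet1", 2))  # Sheet1!2:202
row_batch_size += 100             # what increase_row_batch_size did after a 429
print(create_range("Sheet1", 2))  # Sheet1!2:302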
@@ -1,234 +0,0 @@
- #
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
- #
-
- import json
- import logging
- import re
- from collections import defaultdict
- from datetime import datetime
- from typing import Dict, FrozenSet, Iterable, List, Tuple
-
- from google.oauth2 import credentials as client_account
- from google.oauth2 import service_account
- from googleapiclient import discovery
-
- from airbyte_cdk.models.airbyte_protocol import AirbyteRecordMessage, AirbyteStream, ConfiguredAirbyteCatalog, SyncMode
-
- from .models.spreadsheet import RowData, Spreadsheet
- from .utils import safe_name_conversion
-
-
- SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly", "https://www.googleapis.com/auth/drive.readonly"]
-
- logger = logging.getLogger("airbyte")
-
-
- class Helpers(object):
-     @staticmethod
-     def get_authenticated_sheets_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
-         creds = Helpers.get_authenticated_google_credentials(credentials, scopes)
-         return discovery.build("sheets", "v4", credentials=creds).spreadsheets()
-
-     @staticmethod
-     def get_authenticated_drive_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
-         creds = Helpers.get_authenticated_google_credentials(credentials, scopes)
-         return discovery.build("drive", "v3", credentials=creds)
-
-     @staticmethod
-     def get_authenticated_google_credentials(credentials: Dict[str, str], scopes: List[str] = SCOPES):
-         auth_type = credentials.pop("auth_type")
-         if auth_type == "Service":
-             return service_account.Credentials.from_service_account_info(json.loads(credentials["service_account_info"]), scopes=scopes)
-         elif auth_type == "Client":
-             return client_account.Credentials.from_authorized_user_info(info=credentials)
-
-     @staticmethod
-     def headers_to_airbyte_stream(logger: logging.Logger, sheet_name: str, header_row_values: List[str]) -> AirbyteStream:
-         """
-         Parses sheet headers from the provided row. This method assumes that data is contiguous
-         i.e: every cell contains a value and the first cell which does not contain a value denotes the end
-         of the headers. For example, if the first row contains "One | Two | | Three" then this method
-         will parse the headers as ["One", "Two"]. This assumption is made for simplicity and can be modified later.
-         """
-         fields, duplicate_fields = Helpers.get_valid_headers_and_duplicates(header_row_values)
-         if duplicate_fields:
-             logger.warn(f"Duplicate headers found in {sheet_name}. Ignoring them: {duplicate_fields}")
-
-         sheet_json_schema = {
-             "$schema": "http://json-schema.org/draft-07/schema#",
-             "type": "object",
-             # For simplicity, the type of every cell is a string
-             "properties": {field: {"type": "string"} for field in fields},
-         }
-
-         return AirbyteStream(name=sheet_name, json_schema=sheet_json_schema, supported_sync_modes=[SyncMode.full_refresh])
-
-     @staticmethod
-     def get_valid_headers_and_duplicates(header_row_values: List[str]) -> (List[str], List[str]):
-         fields = []
-         duplicate_fields = set()
-         for cell_value in header_row_values:
-             if cell_value:
-                 if cell_value in fields:
-                     duplicate_fields.add(cell_value)
-                 else:
-                     fields.append(cell_value)
-             else:
-                 break
-
-         # Removing all duplicate fields
-         if duplicate_fields:
-             fields = [field for field in fields if field not in duplicate_fields]
-
-         return fields, list(duplicate_fields)
-
-     @staticmethod
-     def get_formatted_row_values(row_data: RowData) -> List[str]:
-         """
-         Gets the formatted values of all cell data in this row. A formatted value is the final value a user sees in a spreadsheet.
-         It can be a raw string input by the user, or the result of a sheets function call.
-         """
-         return [value.formattedValue for value in row_data.values]
-
-     @staticmethod
-     def get_first_row(client, spreadsheet_id: str, sheet_name: str) -> List[str]:
-         spreadsheet = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=True, ranges=f"{sheet_name}!1:1"))
-
-         # There is only one sheet since we are specifying the sheet in the requested ranges.
-         returned_sheets = spreadsheet.sheets
-         if len(returned_sheets) != 1:
-             raise Exception(f"Unexpected return result: Sheet {sheet_name} was expected to contain data on exactly 1 sheet. ")
-
-         range_data = returned_sheets[0].data
-         if len(range_data) != 1:
-             raise Exception(f"Expected data for exactly one range for sheet {sheet_name}")
-
-         all_row_data = range_data[0].rowData
-         if not all_row_data:
-             # the sheet is empty
-             logger.warning(f"The sheet {sheet_name} (ID {spreadsheet_id}) is empty!")
-             return []
-
-         if len(all_row_data) != 1:
-             raise Exception(f"Expected data for exactly one row for sheet {sheet_name}")
-
-         first_row_data = all_row_data[0]
-
-         return Helpers.get_formatted_row_values(first_row_data)
-
-     @staticmethod
-     def parse_sheet_and_column_names_from_catalog(catalog: ConfiguredAirbyteCatalog) -> Dict[str, FrozenSet[str]]:
-         sheet_to_column_name = {}
-         for configured_stream in catalog.streams:
-             stream = configured_stream.stream
-             sheet_name = stream.name
-             sheet_to_column_name[sheet_name] = frozenset(stream.json_schema["properties"].keys())
-
-         return sheet_to_column_name
-
-     @staticmethod
-     def row_data_to_record_message(sheet_name: str, cell_values: List[str], column_index_to_name: Dict[int, str]) -> AirbyteRecordMessage:
-         data = {}
-         for relevant_index in sorted(column_index_to_name.keys()):
-             if relevant_index >= len(cell_values):
-                 break
-
-             cell_value = cell_values[relevant_index]
-             if cell_value.strip() != "":
-                 data[column_index_to_name[relevant_index]] = cell_value
-
-         return AirbyteRecordMessage(stream=sheet_name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000)
-
-     @staticmethod
-     def get_available_sheets_to_column_index_to_name(
-         client, spreadsheet_id: str, requested_sheets_and_columns: Dict[str, FrozenSet[str]], names_conversion: bool = False
-     ) -> Dict[str, Dict[int, str]]:
-         available_sheets = Helpers.get_sheets_in_spreadsheet(client, spreadsheet_id)
-         logger.info(f"Available sheets: {available_sheets}")
-         available_sheets_to_column_index_to_name = defaultdict(dict)
-         for sheet, columns in requested_sheets_and_columns.items():
-             if sheet in available_sheets:
-                 first_row = Helpers.get_first_row(client, spreadsheet_id, sheet)
-                 if names_conversion:
-                     first_row = [safe_name_conversion(h) for h in first_row]
-                     # When performing names conversion, they won't match what is listed in catalog for the majority of cases,
-                     # so they should be cast here in order to have them in records
-                     columns = {safe_name_conversion(c) for c in columns}
-                 # Find the column index of each header value
-                 idx = 0
-                 for cell_value in first_row:
-                     if cell_value in columns:
-                         available_sheets_to_column_index_to_name[sheet][idx] = cell_value
-                     idx += 1
-         return available_sheets_to_column_index_to_name
-
-     @staticmethod
-     def get_sheets_in_spreadsheet(client, spreadsheet_id: str) -> List[str]:
-         spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
-         return [sheet.properties.title for sheet in spreadsheet_metadata.sheets]
-
-     @staticmethod
-     def get_sheet_row_count(client, spreadsheet_id: str) -> Dict[str, int]:
-         spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
-         # filter out sheets without gridProperties (like in diagram sheets)
-         data_sheets = [sheet for sheet in spreadsheet_metadata.sheets if hasattr(sheet.properties, "gridProperties")]
-         return {sheet.properties.title: sheet.properties.gridProperties["rowCount"] for sheet in data_sheets}
-
-     @staticmethod
-     def get_grid_sheets(spreadsheet_metadata) -> List[str]:
-         """Return grid only diagram, filter out sheets with image/diagram only
-
-         https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/sheets#sheetproperties
-         """
-         grid_sheets = []
-         non_grid_sheets = []
-         for sheet in spreadsheet_metadata.sheets:
-             sheet_title = sheet.properties.title
-             if (
-                 hasattr(sheet.properties, "gridProperties")
-                 and hasattr(sheet.properties, "sheetType")
-                 and sheet.properties.sheetType == "GRID"
-             ):
-                 grid_sheets.append(sheet_title)
-             else:
-                 non_grid_sheets.append(sheet_title)
-
-         if non_grid_sheets:
-             # logging.getLogger(...).log() expects an integer level. The level for WARN is 30
-             # Reference: https://docs.python.org/3.10/library/logging.html#levels
-             logging.getLogger("airbyte").log(30, "Skip non-grid sheets: " + ", ".join(non_grid_sheets))
-
-         return grid_sheets
-
-     @staticmethod
-     def is_row_empty(cell_values: List[str]) -> bool:
-         for cell in cell_values:
-             if cell.strip() != "":
-                 return False
-         return True
-
-     @staticmethod
-     def row_contains_relevant_data(cell_values: List[str], relevant_indices: Iterable[int]) -> bool:
-         for idx in relevant_indices:
-             if len(cell_values) > idx and cell_values[idx].strip() != "":
-                 return True
-         return False
-
-     @staticmethod
-     def get_spreadsheet_id(id_or_url: str) -> str:
-         if re.match(r"(https://)", id_or_url):
-             # This is a URL
-             m = re.search(r"(/)([-\w]{20,})([/]?)", id_or_url)
-             if m is not None and m.group(2):
-                 return m.group(2)
-         else:
-             return id_or_url
-
-     @staticmethod
-     def check_sheet_is_valid(client, spreadsheet_id: str, sheet_name: str) -> Tuple[bool, str]:
-         try:
-             Helpers.get_first_row(client, spreadsheet_id, sheet_name)
-             return True, ""
-         except Exception as e:
-             return False, str(e)
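Among the helpers removed above, get_valid_headers_and_duplicates carried the header-validation rules used by both check and discover: header parsing stops at the first blank cell, and any duplicated name is dropped from the schema entirely. A standalone rendition with a worked example, extracted from the code above for illustration:

from typing import List, Tuple


def get_valid_headers_and_duplicates(header_row_values: List[str]) -> Tuple[List[str], List[str]]:
    # Same logic as the removed Helpers method.
    fields: List[str] = []
    duplicate_fields = set()
    for cell_value in header_row_values:
        if not cell_value:
            break  # a blank cell ends the header row
        if cell_value in fields:
            duplicate_fields.add(cell_value)
        else:
            fields.append(cell_value)
    if duplicate_fields:
        fields = [field for field in fields if field not in duplicate_fields]
    return fields, list(duplicate_fields)


# "email" is ignored (it follows a blank cell) and "id" is dropped as a duplicate.
assert get_valid_headers_and_duplicates(["id", "name", "id", "", "email"]) == (["name"], ["id"])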