airbyte-source-google-sheets 0.8.4__py3-none-any.whl → 0.9.0rc1__py3-none-any.whl

@@ -1,277 +1,22 @@
  #
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
  #

+ from typing import Any, Mapping, Optional

- import json
- import logging
- import socket
- from typing import Any, Generator, List, Mapping, MutableMapping, Optional, Union
+ from airbyte_cdk.models import ConfiguredAirbyteCatalog
+ from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource
+ from airbyte_cdk.sources.source import TState

- from airbyte_cdk.models import FailureType
- from airbyte_cdk.models.airbyte_protocol import (
-     AirbyteCatalog,
-     AirbyteConnectionStatus,
-     AirbyteMessage,
-     AirbyteStateMessage,
-     AirbyteStreamStatus,
-     ConfiguredAirbyteCatalog,
-     Status,
-     Type,
- )
- from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
- from airbyte_cdk.sources.source import Source
- from airbyte_cdk.sources.streams.checkpoint import FullRefreshCheckpointReader
- from airbyte_cdk.utils import AirbyteTracedException
- from airbyte_cdk.utils.stream_status_utils import as_airbyte_message
- from apiclient import errors
- from google.auth import exceptions as google_exceptions
- from requests.status_codes import codes as status_codes

- from .client import GoogleSheetsClient
- from .helpers import Helpers
- from .models.spreadsheet import Spreadsheet
- from .models.spreadsheet_values import SpreadsheetValues
- from .utils import exception_description_by_status_code, safe_name_conversion
+ """
13
+ This file provides the necessary constructs to interpret a provided declarative YAML configuration file into
14
+ source connector.
15
+ WARNING: Do not modify this file.
16
+ """

- # override default socket timeout to be 10 mins instead of 60 sec.
- # on behalf of https://github.com/airbytehq/oncall/issues/242
- DEFAULT_SOCKET_TIMEOUT: int = 600
- socket.setdefaulttimeout(DEFAULT_SOCKET_TIMEOUT)

-
- class SourceGoogleSheets(Source):
-     """
-     Spreadsheets API Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets
-     """
-
-     def check(self, logger: logging.Logger, config: json) -> AirbyteConnectionStatus:
-         # Check involves verifying that the specified spreadsheet is reachable with our credentials.
-         try:
-             client = GoogleSheetsClient(self.get_credentials(config))
-         except Exception as e:
-             return AirbyteConnectionStatus(status=Status.FAILED, message=f"Please use valid credentials json file. Error: {e}")
-
-         spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-
-         try:
-             spreadsheet = client.get(spreadsheetId=spreadsheet_id, includeGridData=False)
-         except errors.HttpError as err:
-             message = "Config error: "
-             # Give a clearer message if it's a common error like 404.
-             if err.resp.status == status_codes.NOT_FOUND:
-                 message += "The spreadsheet link is not valid. Enter the URL of the Google spreadsheet you want to sync."
-             raise AirbyteTracedException(
-                 message=message,
-                 internal_message=message,
-                 failure_type=FailureType.config_error,
-             ) from err
-         except google_exceptions.GoogleAuthError as err:
-             message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
-             raise AirbyteTracedException(
-                 message=message,
-                 internal_message=message,
-                 failure_type=FailureType.config_error,
-             ) from err
-
-         # Check for duplicate headers
-         spreadsheet_metadata = Spreadsheet.parse_obj(spreadsheet)
-         grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
-
-         duplicate_headers_in_sheet = {}
-         for sheet_name in grid_sheets:
-             try:
-                 header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
-                 if config.get("names_conversion"):
-                     header_row_data = [safe_name_conversion(h) for h in header_row_data]
-                 _, duplicate_headers = Helpers.get_valid_headers_and_duplicates(header_row_data)
-                 if duplicate_headers:
-                     duplicate_headers_in_sheet[sheet_name] = duplicate_headers
-             except Exception as err:
-                 if str(err).startswith("Expected data for exactly one row for sheet"):
-                     logger.warn(f"Skip empty sheet: {sheet_name}")
-                 else:
-                     logger.error(str(err))
-                     return AirbyteConnectionStatus(
-                         status=Status.FAILED, message=f"Unable to read the schema of sheet {sheet_name}. Error: {str(err)}"
-                     )
-         if duplicate_headers_in_sheet:
-             duplicate_headers_error_message = ", ".join(
-                 [
-                     f"[sheet:{sheet_name}, headers:{duplicate_sheet_headers}]"
-                     for sheet_name, duplicate_sheet_headers in duplicate_headers_in_sheet.items()
-                 ]
-             )
-             return AirbyteConnectionStatus(
-                 status=Status.FAILED,
-                 message="The following duplicate headers were found in the following sheets. Please fix them to continue: "
-                 + duplicate_headers_error_message,
-             )
-
-         return AirbyteConnectionStatus(status=Status.SUCCEEDED)
-
-     def discover(self, logger: logging.Logger, config: json) -> AirbyteCatalog:
-         client = GoogleSheetsClient(self.get_credentials(config))
-         spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-         try:
-             logger.info(f"Running discovery on sheet {spreadsheet_id}")
-             spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
-             grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
-             streams = []
-             for sheet_name in grid_sheets:
-                 try:
-                     header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
-                     if config.get("names_conversion"):
-                         header_row_data = [safe_name_conversion(h) for h in header_row_data]
-                     stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_row_data)
-                     streams.append(stream)
-                 except Exception as err:
-                     if str(err).startswith("Expected data for exactly one row for sheet"):
-                         logger.warn(f"Skip empty sheet: {sheet_name}")
-                     else:
-                         logger.error(str(err))
-             return AirbyteCatalog(streams=streams)
-
-         except errors.HttpError as err:
-             error_description = exception_description_by_status_code(err.resp.status, spreadsheet_id)
-             config_error_status_codes = [status_codes.NOT_FOUND, status_codes.FORBIDDEN]
-             if err.resp.status in config_error_status_codes:
-                 message = f"{error_description}. {err.reason}."
-                 raise AirbyteTracedException(
-                     message=message,
-                     internal_message=message,
-                     failure_type=FailureType.config_error,
-                 ) from err
-             raise Exception(f"Could not discover the schema of your spreadsheet. {error_description}. {err.reason}.")
-         except google_exceptions.GoogleAuthError as err:
-             message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
-             raise AirbyteTracedException(
-                 message=message,
-                 internal_message=message,
-                 failure_type=FailureType.config_error,
-             ) from err
-
-     def _read(
-         self,
-         logger: logging.Logger,
-         config: json,
-         catalog: ConfiguredAirbyteCatalog,
-         state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
-     ) -> Generator[AirbyteMessage, None, None]:
-         client = GoogleSheetsClient(self.get_credentials(config))
-         client.Backoff.row_batch_size = config.get("batch_size", 200)
-
-         sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
-         stream_instances = {s.stream.name: s.stream for s in catalog.streams}
-         state_manager = ConnectorStateManager(stream_instance_map=stream_instances, state=state or {})
-         spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-
-         logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
-         # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
-         # a blank row, emit the row batch
-         sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
-             client, spreadsheet_id, sheet_to_column_name, config.get("names_conversion")
-         )
-         sheet_row_counts = Helpers.get_sheet_row_count(client, spreadsheet_id)
-         logger.info(f"Row counts: {sheet_row_counts}")
-         for sheet in sheet_to_column_index_to_name.keys():
-             logger.info(f"Syncing sheet {sheet}")
-             stream = stream_instances.get(sheet)
-             yield as_airbyte_message(stream, AirbyteStreamStatus.STARTED)
-             checkpoint_reader = FullRefreshCheckpointReader([])
-             _ = checkpoint_reader.next()
-             # We revalidate the sheet here to avoid errors in case the sheet was changed after the sync started
-             is_valid, reason = Helpers.check_sheet_is_valid(client, spreadsheet_id, sheet)
-             if not is_valid:
-                 logger.info(f"Skipping syncing sheet {sheet}: {reason}")
-                 yield self._checkpoint_state(checkpoint_reader.get_checkpoint(), state_manager, sheet, None)
-                 yield as_airbyte_message(stream, AirbyteStreamStatus.INCOMPLETE)
-                 continue
-
-             column_index_to_name = sheet_to_column_index_to_name[sheet]
-             row_cursor = 2  # we start syncing past the header row
-             # For the loop, it is necessary that the initial row exists when we send a request to the API,
-             # if the last row of the interval goes outside the sheet - this is normal, we will return
-             # only the real data of the sheet and in the next iteration we will loop out.
-             while row_cursor <= sheet_row_counts[sheet]:
-                 row_batch = SpreadsheetValues.parse_obj(
-                     client.get_values(
-                         sheet=sheet,
-                         row_cursor=row_cursor,
-                         spreadsheetId=spreadsheet_id,
-                         majorDimension="ROWS",
-                     )
-                 )
-
-                 row_cursor += client.Backoff.row_batch_size + 1
-                 # there should always be one range since we requested only one
-                 value_ranges = row_batch.valueRanges[0]
-
-                 if not value_ranges.values:
-                     break
-
-                 row_values = value_ranges.values
-                 if len(row_values) == 0:
-                     break
-
-                 yield as_airbyte_message(stream, AirbyteStreamStatus.RUNNING)
-                 for row in row_values:
-                     if not Helpers.is_row_empty(row) and Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
-                         yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
-
-             yield self._checkpoint_state(checkpoint_reader.get_checkpoint(), state_manager, sheet, None)
-             yield as_airbyte_message(stream, AirbyteStreamStatus.COMPLETE)
-
-     def _checkpoint_state(
-         self,
-         stream_state: Mapping[str, Any],
-         state_manager,
-         stream_name: str,
-         stream_namespace: Optional[str],
-     ) -> AirbyteMessage:
-         state_manager.update_state_for_stream(stream_name, stream_namespace, stream_state)
-         return state_manager.create_state_message(stream_name, stream_namespace)
-
-     def read(
-         self,
-         logger: logging.Logger,
-         config: json,
-         catalog: ConfiguredAirbyteCatalog,
-         state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
-     ) -> Generator[AirbyteMessage, None, None]:
-         spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-         try:
-             yield from self._read(logger, config, catalog, state)
-         except errors.HttpError as e:
-             error_description = exception_description_by_status_code(e.status_code, spreadsheet_id)
-
-             if e.status_code == status_codes.FORBIDDEN:
-                 raise AirbyteTracedException(
-                     message=f"Stopped syncing process. {error_description}",
-                     internal_message=error_description,
-                     failure_type=FailureType.config_error,
-                 ) from e
-             if e.status_code == status_codes.TOO_MANY_REQUESTS:
-                 raise AirbyteTracedException(
-                     message=f"Stopped syncing process due to rate limits. {error_description}",
-                     internal_message=error_description,
-                     failure_type=FailureType.transient_error,
-                 ) from e
-             else:
-                 logger.info(f"{e.status_code}: {e.reason}. {error_description}")
-                 raise AirbyteTracedException(
-                     message=f"Stopped syncing process. {error_description}",
-                     internal_message=error_description,
-                     failure_type=FailureType.transient_error,
-                 ) from e
-         finally:
-             logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
-
-     @staticmethod
-     def get_credentials(config):
-         # backward compatible with old style config
-         if config.get("credentials_json"):
-             credentials = {"auth_type": "Service", "service_account_info": config.get("credentials_json")}
-             return credentials
-
-         return config.get("credentials")
+ # Declarative Source
+ class SourceGoogleSheets(YamlDeclarativeSource):
+     def __init__(self, catalog: Optional[ConfiguredAirbyteCatalog], config: Optional[Mapping[str, Any]], state: TState, **kwargs):
+         super().__init__(catalog=catalog, config=config, state=state, **{"path_to_yaml": "manifest.yaml"})
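In 0.9.0rc1 the connector becomes a thin wrapper over the CDK's YamlDeclarativeSource: the check/discover/read logic above moves into manifest.yaml, and the Python class only forwards catalog, config, and state. Below is a minimal sketch of how such a declarative source is typically launched; the run() wiring and the use of airbyte_cdk.entrypoint.launch are illustrative assumptions, not part of this diff.

import sys

from airbyte_cdk.entrypoint import launch

from source_google_sheets import SourceGoogleSheets


def run() -> None:
    # The entrypoint parses catalog/config/state from the CLI args,
    # so the constructor can be seeded with None here.
    source = SourceGoogleSheets(catalog=None, config=None, state=None)
    launch(source, sys.argv[1:])


if __name__ == "__main__":
    run()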
@@ -12,17 +12,15 @@ connectionSpecification:
        type: integer
        title: Row Batch Size
        description: >-
-         Default value is 200.
+         Default value is 1000000.
          An integer representing row batch size for each sent request to Google Sheets API.
-         Row batch size means how many rows are processed from the google sheet, for example default value 200
-         would process rows 1-201, then 201-401 and so on.
+         Row batch size means how many rows are processed from the google sheet, for example default value 1000000
+         would process rows 2-1000002, then 1000003-2000003 and so on.
          Based on <a href='https://developers.google.com/sheets/api/limits'>Google Sheets API limits documentation</a>,
          it is possible to send up to 300 requests per minute, but each individual request has to be processed under 180 seconds,
          otherwise the request returns a timeout error. In regards to this information, consider network speed and
          number of columns of the google sheet when deciding a batch_size value.
-         Default value should cover most of the cases, but if a google sheet has over 100,000 records or more,
-         consider increasing batch_size value.
-       default: 200
+       default: 1000000
      spreadsheet_id:
        type: string
        title: Spreadsheet Link
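With the new default, each request covers one million rows: the header lives on row 1, so the first read spans rows 2-1000002, the next 1000003-2000003, and so on. A quick sketch of that interval arithmetic (batch_row_ranges is an illustrative helper, not part of the connector):

from typing import Iterator, Tuple


def batch_row_ranges(total_rows: int, batch_size: int = 1_000_000) -> Iterator[Tuple[int, int]]:
    # Data reads start at row 2, just past the header row.
    cursor = 2
    while cursor <= total_rows:
        yield cursor, cursor + batch_size  # e.g. (2, 1000002)
        cursor += batch_size + 1


# list(batch_row_ranges(2_500_000))
# -> [(2, 1000002), (1000003, 2000003), (2000004, 3000004)]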
@@ -1,5 +1,5 @@
  #
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
  #


@@ -8,6 +8,7 @@ import re
  import unidecode
  from requests.status_codes import codes as status_codes

+
  TOKEN_PATTERN = re.compile(r"[A-Z]+[a-z]*|[a-z]+|\d+|(?P<NoToken>[^a-zA-Z\d]+)")
  DEFAULT_SEPARATOR = "_"

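TOKEN_PATTERN drives the optional names_conversion feature in utils.py: a header is transliterated with unidecode, split into case and digit tokens, and the tokens are rejoined with DEFAULT_SEPARATOR. A rough sketch of the tokenization step; tokenize is an illustrative name, and the real safe_name_conversion applies further rules (lowercasing, handling leading digits, validating the result):

import re
from typing import List

import unidecode

TOKEN_PATTERN = re.compile(r"[A-Z]+[a-z]*|[a-z]+|\d+|(?P<NoToken>[^a-zA-Z\d]+)")
DEFAULT_SEPARATOR = "_"


def tokenize(name: str) -> List[str]:
    # Keep case/digit tokens; runs of any other characters match the
    # NoToken group and act as separators.
    return [m.group(0) for m in TOKEN_PATTERN.finditer(unidecode.unidecode(name)) if m.group("NoToken") is None]


# DEFAULT_SEPARATOR.join(tokenize("Überschrift 2024!")).lower() -> "uberschrift_2024"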
@@ -1,14 +0,0 @@
- source_google_sheets/__init__.py,sha256=-aGVMRfrgWjYad3_cHofIptEEa5WMQzTvFD92HevQfw,73
- source_google_sheets/client.py,sha256=dLujGTU2CPVV8WJwGHEBg3zJZK6xtzuyGTJm7TjAv9I,1916
- source_google_sheets/helpers.py,sha256=kKXop3YyQ3jPYlWgWW3GmRPxDmnuoHZ4joa3rQLCxUQ,10642
- source_google_sheets/models/__init__.py,sha256=Z-4MTpxG5t2jGhXzs4PPoIOa83zw3jRnUDx0N9Puv3s,61
- source_google_sheets/models/spreadsheet.py,sha256=fsHREpPEN36wCzGdqgfJ2EVW40UDZ_lS863A4XT2pGo,1112
- source_google_sheets/models/spreadsheet_values.py,sha256=y8ytuTqwpziJ2ICl0xhlRWgjMkxTfxOalRd414PMHZM,440
- source_google_sheets/run.py,sha256=_f5-LNqMzBuHtCD1YoUBxnA0fszgqmdNGcN7y_AmXU0,237
- source_google_sheets/source.py,sha256=KIcjUH_-vRNTwHd6rNv8c_jgPBJjXMMmcDvvVyng_OA,13504
- source_google_sheets/spec.yaml,sha256=WrPdH2xLCdyM-kY-pRqbwICcNPhv8nqnb2gdbslTsaQ,5141
- source_google_sheets/utils.py,sha256=ZB5lboyffiuuQdSarqe8AqBGEyiQpxiOfxqcU7Ght8A,2289
- airbyte_source_google_sheets-0.8.4.dist-info/METADATA,sha256=mIdbrQljTBznf31l7Xrr5Zy63tsNCI3LctSDj51bRhE,5539
- airbyte_source_google_sheets-0.8.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- airbyte_source_google_sheets-0.8.4.dist-info/entry_points.txt,sha256=Dtsfjohe5IPUFyqojk49SIoP7CifCTlNLG_pgivzppo,69
- airbyte_source_google_sheets-0.8.4.dist-info/RECORD,,
@@ -1,48 +0,0 @@
- #
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
- #
-
- import logging
- from typing import Dict, List
-
- import backoff
- from googleapiclient import errors
- from requests import codes as status_codes
-
- from .helpers import SCOPES, Helpers
-
- logger = logging.getLogger("airbyte")
-
-
- class GoogleSheetsClient:
-     class Backoff:
-         row_batch_size = 200
-
-         @classmethod
-         def increase_row_batch_size(cls, details):
-             if details["exception"].status_code == status_codes.TOO_MANY_REQUESTS and cls.row_batch_size < 1000:
-                 cls.row_batch_size = cls.row_batch_size + 100
-                 logger.info(f"Increasing number of records fetching due to rate limits. Current value: {cls.row_batch_size}")
-
-         @staticmethod
-         def give_up(error):
-             code = error.resp.status
-             # Stop retrying if it's not a problem with the rate limit or on the server end
-             return not (code == status_codes.TOO_MANY_REQUESTS or 500 <= code < 600)
-
-     def __init__(self, credentials: Dict[str, str], scopes: List[str] = SCOPES):
-         self.client = Helpers.get_authenticated_sheets_client(credentials, scopes)
-
-     @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
-     def get(self, **kwargs):
-         return self.client.get(**kwargs).execute()
-
-     @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
-     def get_values(self, **kwargs):
-         range = self._create_range(kwargs.pop("sheet"), kwargs.pop("row_cursor"))
-         logger.info(f"Fetching range {range}")
-         return self.client.values().batchGet(ranges=range, **kwargs).execute()
-
-     def _create_range(self, sheet, row_cursor):
-         range = f"{sheet}!{row_cursor}:{row_cursor + self.Backoff.row_batch_size}"
-         return range
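For reference, the removed client (source_google_sheets/client.py, per the RECORD listing) was driven from _read like this: get_values builds an A1 range from the sheet name and row cursor, retries on 429 and 5xx responses, and grows Backoff.row_batch_size by 100 (capped at 1000) on each rate-limit hit. An illustrative call with placeholder credentials and IDs:

client = GoogleSheetsClient({"auth_type": "Service", "service_account_info": "<service account json>"})

# With the default batch size of 200 this builds the range "Sheet1!2:202"
# (row_cursor .. row_cursor + row_batch_size) before delegating to
# spreadsheets().values().batchGet().
response = client.get_values(
    sheet="Sheet1",
    row_cursor=2,
    spreadsheetId="<spreadsheet id>",
    majorDimension="ROWS",
)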
@@ -1,232 +0,0 @@
- #
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
- #
-
- import json
- import logging
- import re
- from collections import defaultdict
- from datetime import datetime
- from typing import Dict, FrozenSet, Iterable, List, Tuple
-
- from airbyte_cdk.models.airbyte_protocol import AirbyteRecordMessage, AirbyteStream, ConfiguredAirbyteCatalog, SyncMode
- from google.oauth2 import credentials as client_account
- from google.oauth2 import service_account
- from googleapiclient import discovery
-
- from .models.spreadsheet import RowData, Spreadsheet
- from .utils import safe_name_conversion
-
- SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly", "https://www.googleapis.com/auth/drive.readonly"]
-
- logger = logging.getLogger("airbyte")
-
-
- class Helpers(object):
-     @staticmethod
-     def get_authenticated_sheets_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
-         creds = Helpers.get_authenticated_google_credentials(credentials, scopes)
-         return discovery.build("sheets", "v4", credentials=creds).spreadsheets()
-
-     @staticmethod
-     def get_authenticated_drive_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
-         creds = Helpers.get_authenticated_google_credentials(credentials, scopes)
-         return discovery.build("drive", "v3", credentials=creds)
-
-     @staticmethod
-     def get_authenticated_google_credentials(credentials: Dict[str, str], scopes: List[str] = SCOPES):
-         auth_type = credentials.pop("auth_type")
-         if auth_type == "Service":
-             return service_account.Credentials.from_service_account_info(json.loads(credentials["service_account_info"]), scopes=scopes)
-         elif auth_type == "Client":
-             return client_account.Credentials.from_authorized_user_info(info=credentials)
-
-     @staticmethod
-     def headers_to_airbyte_stream(logger: logging.Logger, sheet_name: str, header_row_values: List[str]) -> AirbyteStream:
-         """
-         Parses sheet headers from the provided row. This method assumes that data is contiguous
-         i.e: every cell contains a value and the first cell which does not contain a value denotes the end
-         of the headers. For example, if the first row contains "One | Two | | Three" then this method
-         will parse the headers as ["One", "Two"]. This assumption is made for simplicity and can be modified later.
-         """
-         fields, duplicate_fields = Helpers.get_valid_headers_and_duplicates(header_row_values)
-         if duplicate_fields:
-             logger.warn(f"Duplicate headers found in {sheet_name}. Ignoring them: {duplicate_fields}")
-
-         sheet_json_schema = {
-             "$schema": "http://json-schema.org/draft-07/schema#",
-             "type": "object",
-             # For simplicity, the type of every cell is a string
-             "properties": {field: {"type": "string"} for field in fields},
-         }
-
-         return AirbyteStream(name=sheet_name, json_schema=sheet_json_schema, supported_sync_modes=[SyncMode.full_refresh])
-
-     @staticmethod
-     def get_valid_headers_and_duplicates(header_row_values: List[str]) -> (List[str], List[str]):
-         fields = []
-         duplicate_fields = set()
-         for cell_value in header_row_values:
-             if cell_value:
-                 if cell_value in fields:
-                     duplicate_fields.add(cell_value)
-                 else:
-                     fields.append(cell_value)
-             else:
-                 break
-
-         # Removing all duplicate fields
-         if duplicate_fields:
-             fields = [field for field in fields if field not in duplicate_fields]
-
-         return fields, list(duplicate_fields)
-
-     @staticmethod
-     def get_formatted_row_values(row_data: RowData) -> List[str]:
-         """
-         Gets the formatted values of all cell data in this row. A formatted value is the final value a user sees in a spreadsheet.
-         It can be a raw string input by the user, or the result of a sheets function call.
-         """
-         return [value.formattedValue for value in row_data.values]
-
-     @staticmethod
-     def get_first_row(client, spreadsheet_id: str, sheet_name: str) -> List[str]:
-         spreadsheet = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=True, ranges=f"{sheet_name}!1:1"))
-
-         # There is only one sheet since we are specifying the sheet in the requested ranges.
-         returned_sheets = spreadsheet.sheets
-         if len(returned_sheets) != 1:
-             raise Exception(f"Unexpected return result: Sheet {sheet_name} was expected to contain data on exactly 1 sheet. ")
-
-         range_data = returned_sheets[0].data
-         if len(range_data) != 1:
-             raise Exception(f"Expected data for exactly one range for sheet {sheet_name}")
-
-         all_row_data = range_data[0].rowData
-         if not all_row_data:
-             # the sheet is empty
-             logger.warning(f"The sheet {sheet_name} (ID {spreadsheet_id}) is empty!")
-             return []
-
-         if len(all_row_data) != 1:
-             raise Exception(f"Expected data for exactly one row for sheet {sheet_name}")
-
-         first_row_data = all_row_data[0]
-
-         return Helpers.get_formatted_row_values(first_row_data)
-
-     @staticmethod
-     def parse_sheet_and_column_names_from_catalog(catalog: ConfiguredAirbyteCatalog) -> Dict[str, FrozenSet[str]]:
-         sheet_to_column_name = {}
-         for configured_stream in catalog.streams:
-             stream = configured_stream.stream
-             sheet_name = stream.name
-             sheet_to_column_name[sheet_name] = frozenset(stream.json_schema["properties"].keys())
-
-         return sheet_to_column_name
-
-     @staticmethod
-     def row_data_to_record_message(sheet_name: str, cell_values: List[str], column_index_to_name: Dict[int, str]) -> AirbyteRecordMessage:
-         data = {}
-         for relevant_index in sorted(column_index_to_name.keys()):
-             if relevant_index >= len(cell_values):
-                 break
-
-             cell_value = cell_values[relevant_index]
-             if cell_value.strip() != "":
-                 data[column_index_to_name[relevant_index]] = cell_value
-
-         return AirbyteRecordMessage(stream=sheet_name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000)
-
-     @staticmethod
-     def get_available_sheets_to_column_index_to_name(
-         client, spreadsheet_id: str, requested_sheets_and_columns: Dict[str, FrozenSet[str]], names_conversion: bool = False
-     ) -> Dict[str, Dict[int, str]]:
-         available_sheets = Helpers.get_sheets_in_spreadsheet(client, spreadsheet_id)
-         logger.info(f"Available sheets: {available_sheets}")
-         available_sheets_to_column_index_to_name = defaultdict(dict)
-         for sheet, columns in requested_sheets_and_columns.items():
-             if sheet in available_sheets:
-                 first_row = Helpers.get_first_row(client, spreadsheet_id, sheet)
-                 if names_conversion:
-                     first_row = [safe_name_conversion(h) for h in first_row]
-                     # When performing names conversion, they won't match what is listed in catalog for the majority of cases,
-                     # so they should be cast here in order to have them in records
-                     columns = {safe_name_conversion(c) for c in columns}
-                 # Find the column index of each header value
-                 idx = 0
-                 for cell_value in first_row:
-                     if cell_value in columns:
-                         available_sheets_to_column_index_to_name[sheet][idx] = cell_value
-                     idx += 1
-         return available_sheets_to_column_index_to_name
-
-     @staticmethod
-     def get_sheets_in_spreadsheet(client, spreadsheet_id: str) -> List[str]:
-         spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
-         return [sheet.properties.title for sheet in spreadsheet_metadata.sheets]
-
-     @staticmethod
-     def get_sheet_row_count(client, spreadsheet_id: str) -> Dict[str, int]:
-         spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
-         # filter out sheets without gridProperties (like in diagram sheets)
-         data_sheets = [sheet for sheet in spreadsheet_metadata.sheets if hasattr(sheet.properties, "gridProperties")]
-         return {sheet.properties.title: sheet.properties.gridProperties["rowCount"] for sheet in data_sheets}
-
-     @staticmethod
-     def get_grid_sheets(spreadsheet_metadata) -> List[str]:
-         """Return grid only diagram, filter out sheets with image/diagram only
-
-         https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/sheets#sheetproperties
-         """
-         grid_sheets = []
-         non_grid_sheets = []
-         for sheet in spreadsheet_metadata.sheets:
-             sheet_title = sheet.properties.title
-             if (
-                 hasattr(sheet.properties, "gridProperties")
-                 and hasattr(sheet.properties, "sheetType")
-                 and sheet.properties.sheetType == "GRID"
-             ):
-                 grid_sheets.append(sheet_title)
-             else:
-                 non_grid_sheets.append(sheet_title)
-
-         if non_grid_sheets:
-             # logging.getLogger(...).log() expects an integer level. The level for WARN is 30
-             # Reference: https://docs.python.org/3.10/library/logging.html#levels
-             logging.getLogger("airbyte").log(30, "Skip non-grid sheets: " + ", ".join(non_grid_sheets))
-
-         return grid_sheets
-
-     @staticmethod
-     def is_row_empty(cell_values: List[str]) -> bool:
-         for cell in cell_values:
-             if cell.strip() != "":
-                 return False
-         return True
-
-     @staticmethod
-     def row_contains_relevant_data(cell_values: List[str], relevant_indices: Iterable[int]) -> bool:
-         for idx in relevant_indices:
-             if len(cell_values) > idx and cell_values[idx].strip() != "":
-                 return True
-         return False
-
-     @staticmethod
-     def get_spreadsheet_id(id_or_url: str) -> str:
-         if re.match(r"(https://)", id_or_url):
-             # This is a URL
-             m = re.search(r"(/)([-\w]{20,})([/]?)", id_or_url)
-             if m is not None and m.group(2):
-                 return m.group(2)
-         else:
-             return id_or_url
-
-     @staticmethod
-     def check_sheet_is_valid(client, spreadsheet_id: str, sheet_name: str) -> Tuple[bool, str]:
-         try:
-             Helpers.get_first_row(client, spreadsheet_id, sheet_name)
-             return True, ""
-         except Exception as e:
-             return False, str(e)
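Two of the removed helpers in action, with illustrative values:

# Duplicate headers are dropped entirely, and the first empty cell ends the header row.
fields, dupes = Helpers.get_valid_headers_and_duplicates(["id", "name", "name", "", "ignored"])
# fields -> ["id"], dupes -> ["name"]

# get_spreadsheet_id accepts either a bare ID or a full spreadsheet URL.
Helpers.get_spreadsheet_id("https://docs.google.com/spreadsheets/d/1Abc-DefGhiJklMnoPqrStuVwxYz0123456789/edit#gid=0")
# -> "1Abc-DefGhiJklMnoPqrStuVwxYz0123456789"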