airbyte-source-google-sheets 0.8.4__py3-none-any.whl → 0.9.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,277 +1,22 @@
1
1
  #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
+ from typing import Any, Mapping, Optional
5
6
 
6
- import json
7
- import logging
8
- import socket
9
- from typing import Any, Generator, List, Mapping, MutableMapping, Optional, Union
7
+ from airbyte_cdk.models import ConfiguredAirbyteCatalog
8
+ from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource
9
+ from airbyte_cdk.sources.source import TState
10
10
 
11
- from airbyte_cdk.models import FailureType
12
- from airbyte_cdk.models.airbyte_protocol import (
13
- AirbyteCatalog,
14
- AirbyteConnectionStatus,
15
- AirbyteMessage,
16
- AirbyteStateMessage,
17
- AirbyteStreamStatus,
18
- ConfiguredAirbyteCatalog,
19
- Status,
20
- Type,
21
- )
22
- from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
23
- from airbyte_cdk.sources.source import Source
24
- from airbyte_cdk.sources.streams.checkpoint import FullRefreshCheckpointReader
25
- from airbyte_cdk.utils import AirbyteTracedException
26
- from airbyte_cdk.utils.stream_status_utils import as_airbyte_message
27
- from apiclient import errors
28
- from google.auth import exceptions as google_exceptions
29
- from requests.status_codes import codes as status_codes
30
11
 
31
- from .client import GoogleSheetsClient
32
- from .helpers import Helpers
33
- from .models.spreadsheet import Spreadsheet
34
- from .models.spreadsheet_values import SpreadsheetValues
35
- from .utils import exception_description_by_status_code, safe_name_conversion
12
+ """
13
+ This file provides the necessary constructs to interpret a provided declarative YAML configuration file into
14
+ source connector.
15
+ WARNING: Do not modify this file.
16
+ """
36
17
 
37
- # override default socket timeout to be 10 mins instead of 60 sec.
38
- # on behalf of https://github.com/airbytehq/oncall/issues/242
39
- DEFAULT_SOCKET_TIMEOUT: int = 600
40
- socket.setdefaulttimeout(DEFAULT_SOCKET_TIMEOUT)
41
18
 
42
-
43
- class SourceGoogleSheets(Source):
44
- """
45
- Spreadsheets API Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets
46
- """
47
-
48
- def check(self, logger: logging.Logger, config: json) -> AirbyteConnectionStatus:
49
- # Check involves verifying that the specified spreadsheet is reachable with our credentials.
50
- try:
51
- client = GoogleSheetsClient(self.get_credentials(config))
52
- except Exception as e:
53
- return AirbyteConnectionStatus(status=Status.FAILED, message=f"Please use valid credentials json file. Error: {e}")
54
-
55
- spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
56
-
57
- try:
58
- spreadsheet = client.get(spreadsheetId=spreadsheet_id, includeGridData=False)
59
- except errors.HttpError as err:
60
- message = "Config error: "
61
- # Give a clearer message if it's a common error like 404.
62
- if err.resp.status == status_codes.NOT_FOUND:
63
- message += "The spreadsheet link is not valid. Enter the URL of the Google spreadsheet you want to sync."
64
- raise AirbyteTracedException(
65
- message=message,
66
- internal_message=message,
67
- failure_type=FailureType.config_error,
68
- ) from err
69
- except google_exceptions.GoogleAuthError as err:
70
- message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
71
- raise AirbyteTracedException(
72
- message=message,
73
- internal_message=message,
74
- failure_type=FailureType.config_error,
75
- ) from err
76
-
77
- # Check for duplicate headers
78
- spreadsheet_metadata = Spreadsheet.parse_obj(spreadsheet)
79
- grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
80
-
81
- duplicate_headers_in_sheet = {}
82
- for sheet_name in grid_sheets:
83
- try:
84
- header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
85
- if config.get("names_conversion"):
86
- header_row_data = [safe_name_conversion(h) for h in header_row_data]
87
- _, duplicate_headers = Helpers.get_valid_headers_and_duplicates(header_row_data)
88
- if duplicate_headers:
89
- duplicate_headers_in_sheet[sheet_name] = duplicate_headers
90
- except Exception as err:
91
- if str(err).startswith("Expected data for exactly one row for sheet"):
92
- logger.warn(f"Skip empty sheet: {sheet_name}")
93
- else:
94
- logger.error(str(err))
95
- return AirbyteConnectionStatus(
96
- status=Status.FAILED, message=f"Unable to read the schema of sheet {sheet_name}. Error: {str(err)}"
97
- )
98
- if duplicate_headers_in_sheet:
99
- duplicate_headers_error_message = ", ".join(
100
- [
101
- f"[sheet:{sheet_name}, headers:{duplicate_sheet_headers}]"
102
- for sheet_name, duplicate_sheet_headers in duplicate_headers_in_sheet.items()
103
- ]
104
- )
105
- return AirbyteConnectionStatus(
106
- status=Status.FAILED,
107
- message="The following duplicate headers were found in the following sheets. Please fix them to continue: "
108
- + duplicate_headers_error_message,
109
- )
110
-
111
- return AirbyteConnectionStatus(status=Status.SUCCEEDED)
112
-
113
- def discover(self, logger: logging.Logger, config: json) -> AirbyteCatalog:
114
- client = GoogleSheetsClient(self.get_credentials(config))
115
- spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
116
- try:
117
- logger.info(f"Running discovery on sheet {spreadsheet_id}")
118
- spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
119
- grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
120
- streams = []
121
- for sheet_name in grid_sheets:
122
- try:
123
- header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
124
- if config.get("names_conversion"):
125
- header_row_data = [safe_name_conversion(h) for h in header_row_data]
126
- stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_row_data)
127
- streams.append(stream)
128
- except Exception as err:
129
- if str(err).startswith("Expected data for exactly one row for sheet"):
130
- logger.warn(f"Skip empty sheet: {sheet_name}")
131
- else:
132
- logger.error(str(err))
133
- return AirbyteCatalog(streams=streams)
134
-
135
- except errors.HttpError as err:
136
- error_description = exception_description_by_status_code(err.resp.status, spreadsheet_id)
137
- config_error_status_codes = [status_codes.NOT_FOUND, status_codes.FORBIDDEN]
138
- if err.resp.status in config_error_status_codes:
139
- message = f"{error_description}. {err.reason}."
140
- raise AirbyteTracedException(
141
- message=message,
142
- internal_message=message,
143
- failure_type=FailureType.config_error,
144
- ) from err
145
- raise Exception(f"Could not discover the schema of your spreadsheet. {error_description}. {err.reason}.")
146
- except google_exceptions.GoogleAuthError as err:
147
- message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
148
- raise AirbyteTracedException(
149
- message=message,
150
- internal_message=message,
151
- failure_type=FailureType.config_error,
152
- ) from err
153
-
154
- def _read(
155
- self,
156
- logger: logging.Logger,
157
- config: json,
158
- catalog: ConfiguredAirbyteCatalog,
159
- state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
160
- ) -> Generator[AirbyteMessage, None, None]:
161
- client = GoogleSheetsClient(self.get_credentials(config))
162
- client.Backoff.row_batch_size = config.get("batch_size", 200)
163
-
164
- sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
165
- stream_instances = {s.stream.name: s.stream for s in catalog.streams}
166
- state_manager = ConnectorStateManager(stream_instance_map=stream_instances, state=state or {})
167
- spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
168
-
169
- logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
170
- # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
171
- # a blank row, emit the row batch
172
- sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
173
- client, spreadsheet_id, sheet_to_column_name, config.get("names_conversion")
174
- )
175
- sheet_row_counts = Helpers.get_sheet_row_count(client, spreadsheet_id)
176
- logger.info(f"Row counts: {sheet_row_counts}")
177
- for sheet in sheet_to_column_index_to_name.keys():
178
- logger.info(f"Syncing sheet {sheet}")
179
- stream = stream_instances.get(sheet)
180
- yield as_airbyte_message(stream, AirbyteStreamStatus.STARTED)
181
- checkpoint_reader = FullRefreshCheckpointReader([])
182
- _ = checkpoint_reader.next()
183
- # We revalidate the sheet here to avoid errors in case the sheet was changed after the sync started
184
- is_valid, reason = Helpers.check_sheet_is_valid(client, spreadsheet_id, sheet)
185
- if not is_valid:
186
- logger.info(f"Skipping syncing sheet {sheet}: {reason}")
187
- yield self._checkpoint_state(checkpoint_reader.get_checkpoint(), state_manager, sheet, None)
188
- yield as_airbyte_message(stream, AirbyteStreamStatus.INCOMPLETE)
189
- continue
190
-
191
- column_index_to_name = sheet_to_column_index_to_name[sheet]
192
- row_cursor = 2 # we start syncing past the header row
193
- # For the loop, it is necessary that the initial row exists when we send a request to the API,
194
- # if the last row of the interval goes outside the sheet - this is normal, we will return
195
- # only the real data of the sheet and in the next iteration we will loop out.
196
- while row_cursor <= sheet_row_counts[sheet]:
197
- row_batch = SpreadsheetValues.parse_obj(
198
- client.get_values(
199
- sheet=sheet,
200
- row_cursor=row_cursor,
201
- spreadsheetId=spreadsheet_id,
202
- majorDimension="ROWS",
203
- )
204
- )
205
-
206
- row_cursor += client.Backoff.row_batch_size + 1
207
- # there should always be one range since we requested only one
208
- value_ranges = row_batch.valueRanges[0]
209
-
210
- if not value_ranges.values:
211
- break
212
-
213
- row_values = value_ranges.values
214
- if len(row_values) == 0:
215
- break
216
-
217
- yield as_airbyte_message(stream, AirbyteStreamStatus.RUNNING)
218
- for row in row_values:
219
- if not Helpers.is_row_empty(row) and Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
220
- yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
221
-
222
- yield self._checkpoint_state(checkpoint_reader.get_checkpoint(), state_manager, sheet, None)
223
- yield as_airbyte_message(stream, AirbyteStreamStatus.COMPLETE)
224
-
225
- def _checkpoint_state(
226
- self,
227
- stream_state: Mapping[str, Any],
228
- state_manager,
229
- stream_name: str,
230
- stream_namespace: Optional[str],
231
- ) -> AirbyteMessage:
232
- state_manager.update_state_for_stream(stream_name, stream_namespace, stream_state)
233
- return state_manager.create_state_message(stream_name, stream_namespace)
234
-
235
- def read(
236
- self,
237
- logger: logging.Logger,
238
- config: json,
239
- catalog: ConfiguredAirbyteCatalog,
240
- state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
241
- ) -> Generator[AirbyteMessage, None, None]:
242
- spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
243
- try:
244
- yield from self._read(logger, config, catalog, state)
245
- except errors.HttpError as e:
246
- error_description = exception_description_by_status_code(e.status_code, spreadsheet_id)
247
-
248
- if e.status_code == status_codes.FORBIDDEN:
249
- raise AirbyteTracedException(
250
- message=f"Stopped syncing process. {error_description}",
251
- internal_message=error_description,
252
- failure_type=FailureType.config_error,
253
- ) from e
254
- if e.status_code == status_codes.TOO_MANY_REQUESTS:
255
- raise AirbyteTracedException(
256
- message=f"Stopped syncing process due to rate limits. {error_description}",
257
- internal_message=error_description,
258
- failure_type=FailureType.transient_error,
259
- ) from e
260
- else:
261
- logger.info(f"{e.status_code}: {e.reason}. {error_description}")
262
- raise AirbyteTracedException(
263
- message=f"Stopped syncing process. {error_description}",
264
- internal_message=error_description,
265
- failure_type=FailureType.transient_error,
266
- ) from e
267
- finally:
268
- logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
269
-
270
- @staticmethod
271
- def get_credentials(config):
272
- # backward compatible with old style config
273
- if config.get("credentials_json"):
274
- credentials = {"auth_type": "Service", "service_account_info": config.get("credentials_json")}
275
- return credentials
276
-
277
- return config.get("credentials")
19
+ # Declarative Source
20
+ class SourceGoogleSheets(YamlDeclarativeSource):
21
+ def __init__(self, catalog: Optional[ConfiguredAirbyteCatalog], config: Optional[Mapping[str, Any]], state: TState, **kwargs):
22
+ super().__init__(catalog=catalog, config=config, state=state, **{"path_to_yaml": "manifest.yaml"})
@@ -12,17 +12,15 @@ connectionSpecification:
12
12
  type: integer
13
13
  title: Row Batch Size
14
14
  description: >-
15
- Default value is 200.
15
+ Default value is 1000000.
16
16
  An integer representing row batch size for each sent request to Google Sheets API.
17
- Row batch size means how many rows are processed from the google sheet, for example default value 200
18
- would process rows 1-201, then 201-401 and so on.
17
+ Row batch size means how many rows are processed from the google sheet, for example default value 1000000
18
+ would process rows 2-1000002, then 1000003-2000003 and so on.
19
19
  Based on <a href='https://developers.google.com/sheets/api/limits'>Google Sheets API limits documentation</a>,
20
20
  it is possible to send up to 300 requests per minute, but each individual request has to be processed under 180 seconds,
21
21
  otherwise the request returns a timeout error. In regards to this information, consider network speed and
22
22
  number of columns of the google sheet when deciding a batch_size value.
23
- Default value should cover most of the cases, but if a google sheet has over 100,000 records or more,
24
- consider increasing batch_size value.
25
- default: 200
23
+ default: 1000000
26
24
  spreadsheet_id:
27
25
  type: string
28
26
  title: Spreadsheet Link
@@ -1,5 +1,5 @@
1
1
  #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
5
 
@@ -8,6 +8,7 @@ import re
8
8
  import unidecode
9
9
  from requests.status_codes import codes as status_codes
10
10
 
11
+
11
12
  TOKEN_PATTERN = re.compile(r"[A-Z]+[a-z]*|[a-z]+|\d+|(?P<NoToken>[^a-zA-Z\d]+)")
12
13
  DEFAULT_SEPARATOR = "_"
13
14
 
@@ -1,14 +0,0 @@
1
- source_google_sheets/__init__.py,sha256=-aGVMRfrgWjYad3_cHofIptEEa5WMQzTvFD92HevQfw,73
2
- source_google_sheets/client.py,sha256=dLujGTU2CPVV8WJwGHEBg3zJZK6xtzuyGTJm7TjAv9I,1916
3
- source_google_sheets/helpers.py,sha256=kKXop3YyQ3jPYlWgWW3GmRPxDmnuoHZ4joa3rQLCxUQ,10642
4
- source_google_sheets/models/__init__.py,sha256=Z-4MTpxG5t2jGhXzs4PPoIOa83zw3jRnUDx0N9Puv3s,61
5
- source_google_sheets/models/spreadsheet.py,sha256=fsHREpPEN36wCzGdqgfJ2EVW40UDZ_lS863A4XT2pGo,1112
6
- source_google_sheets/models/spreadsheet_values.py,sha256=y8ytuTqwpziJ2ICl0xhlRWgjMkxTfxOalRd414PMHZM,440
7
- source_google_sheets/run.py,sha256=_f5-LNqMzBuHtCD1YoUBxnA0fszgqmdNGcN7y_AmXU0,237
8
- source_google_sheets/source.py,sha256=KIcjUH_-vRNTwHd6rNv8c_jgPBJjXMMmcDvvVyng_OA,13504
9
- source_google_sheets/spec.yaml,sha256=WrPdH2xLCdyM-kY-pRqbwICcNPhv8nqnb2gdbslTsaQ,5141
10
- source_google_sheets/utils.py,sha256=ZB5lboyffiuuQdSarqe8AqBGEyiQpxiOfxqcU7Ght8A,2289
11
- airbyte_source_google_sheets-0.8.4.dist-info/METADATA,sha256=mIdbrQljTBznf31l7Xrr5Zy63tsNCI3LctSDj51bRhE,5539
12
- airbyte_source_google_sheets-0.8.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
13
- airbyte_source_google_sheets-0.8.4.dist-info/entry_points.txt,sha256=Dtsfjohe5IPUFyqojk49SIoP7CifCTlNLG_pgivzppo,69
14
- airbyte_source_google_sheets-0.8.4.dist-info/RECORD,,
@@ -1,48 +0,0 @@
1
- #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
- #
4
-
5
- import logging
6
- from typing import Dict, List
7
-
8
- import backoff
9
- from googleapiclient import errors
10
- from requests import codes as status_codes
11
-
12
- from .helpers import SCOPES, Helpers
13
-
14
- logger = logging.getLogger("airbyte")
15
-
16
-
17
- class GoogleSheetsClient:
18
- class Backoff:
19
- row_batch_size = 200
20
-
21
- @classmethod
22
- def increase_row_batch_size(cls, details):
23
- if details["exception"].status_code == status_codes.TOO_MANY_REQUESTS and cls.row_batch_size < 1000:
24
- cls.row_batch_size = cls.row_batch_size + 100
25
- logger.info(f"Increasing number of records fetching due to rate limits. Current value: {cls.row_batch_size}")
26
-
27
- @staticmethod
28
- def give_up(error):
29
- code = error.resp.status
30
- # Stop retrying if it's not a problem with the rate limit or on the server end
31
- return not (code == status_codes.TOO_MANY_REQUESTS or 500 <= code < 600)
32
-
33
- def __init__(self, credentials: Dict[str, str], scopes: List[str] = SCOPES):
34
- self.client = Helpers.get_authenticated_sheets_client(credentials, scopes)
35
-
36
- @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
37
- def get(self, **kwargs):
38
- return self.client.get(**kwargs).execute()
39
-
40
- @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
41
- def get_values(self, **kwargs):
42
- range = self._create_range(kwargs.pop("sheet"), kwargs.pop("row_cursor"))
43
- logger.info(f"Fetching range {range}")
44
- return self.client.values().batchGet(ranges=range, **kwargs).execute()
45
-
46
- def _create_range(self, sheet, row_cursor):
47
- range = f"{sheet}!{row_cursor}:{row_cursor + self.Backoff.row_batch_size}"
48
- return range
@@ -1,232 +0,0 @@
1
- #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
- #
4
-
5
- import json
6
- import logging
7
- import re
8
- from collections import defaultdict
9
- from datetime import datetime
10
- from typing import Dict, FrozenSet, Iterable, List, Tuple
11
-
12
- from airbyte_cdk.models.airbyte_protocol import AirbyteRecordMessage, AirbyteStream, ConfiguredAirbyteCatalog, SyncMode
13
- from google.oauth2 import credentials as client_account
14
- from google.oauth2 import service_account
15
- from googleapiclient import discovery
16
-
17
- from .models.spreadsheet import RowData, Spreadsheet
18
- from .utils import safe_name_conversion
19
-
20
- SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly", "https://www.googleapis.com/auth/drive.readonly"]
21
-
22
- logger = logging.getLogger("airbyte")
23
-
24
-
25
- class Helpers(object):
26
- @staticmethod
27
- def get_authenticated_sheets_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
28
- creds = Helpers.get_authenticated_google_credentials(credentials, scopes)
29
- return discovery.build("sheets", "v4", credentials=creds).spreadsheets()
30
-
31
- @staticmethod
32
- def get_authenticated_drive_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
33
- creds = Helpers.get_authenticated_google_credentials(credentials, scopes)
34
- return discovery.build("drive", "v3", credentials=creds)
35
-
36
- @staticmethod
37
- def get_authenticated_google_credentials(credentials: Dict[str, str], scopes: List[str] = SCOPES):
38
- auth_type = credentials.pop("auth_type")
39
- if auth_type == "Service":
40
- return service_account.Credentials.from_service_account_info(json.loads(credentials["service_account_info"]), scopes=scopes)
41
- elif auth_type == "Client":
42
- return client_account.Credentials.from_authorized_user_info(info=credentials)
43
-
44
- @staticmethod
45
- def headers_to_airbyte_stream(logger: logging.Logger, sheet_name: str, header_row_values: List[str]) -> AirbyteStream:
46
- """
47
- Parses sheet headers from the provided row. This method assumes that data is contiguous
48
- i.e: every cell contains a value and the first cell which does not contain a value denotes the end
49
- of the headers. For example, if the first row contains "One | Two | | Three" then this method
50
- will parse the headers as ["One", "Two"]. This assumption is made for simplicity and can be modified later.
51
- """
52
- fields, duplicate_fields = Helpers.get_valid_headers_and_duplicates(header_row_values)
53
- if duplicate_fields:
54
- logger.warn(f"Duplicate headers found in {sheet_name}. Ignoring them: {duplicate_fields}")
55
-
56
- sheet_json_schema = {
57
- "$schema": "http://json-schema.org/draft-07/schema#",
58
- "type": "object",
59
- # For simplicity, the type of every cell is a string
60
- "properties": {field: {"type": "string"} for field in fields},
61
- }
62
-
63
- return AirbyteStream(name=sheet_name, json_schema=sheet_json_schema, supported_sync_modes=[SyncMode.full_refresh])
64
-
65
- @staticmethod
66
- def get_valid_headers_and_duplicates(header_row_values: List[str]) -> (List[str], List[str]):
67
- fields = []
68
- duplicate_fields = set()
69
- for cell_value in header_row_values:
70
- if cell_value:
71
- if cell_value in fields:
72
- duplicate_fields.add(cell_value)
73
- else:
74
- fields.append(cell_value)
75
- else:
76
- break
77
-
78
- # Removing all duplicate fields
79
- if duplicate_fields:
80
- fields = [field for field in fields if field not in duplicate_fields]
81
-
82
- return fields, list(duplicate_fields)
83
-
84
- @staticmethod
85
- def get_formatted_row_values(row_data: RowData) -> List[str]:
86
- """
87
- Gets the formatted values of all cell data in this row. A formatted value is the final value a user sees in a spreadsheet.
88
- It can be a raw string input by the user, or the result of a sheets function call.
89
- """
90
- return [value.formattedValue for value in row_data.values]
91
-
92
- @staticmethod
93
- def get_first_row(client, spreadsheet_id: str, sheet_name: str) -> List[str]:
94
- spreadsheet = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=True, ranges=f"{sheet_name}!1:1"))
95
-
96
- # There is only one sheet since we are specifying the sheet in the requested ranges.
97
- returned_sheets = spreadsheet.sheets
98
- if len(returned_sheets) != 1:
99
- raise Exception(f"Unexpected return result: Sheet {sheet_name} was expected to contain data on exactly 1 sheet. ")
100
-
101
- range_data = returned_sheets[0].data
102
- if len(range_data) != 1:
103
- raise Exception(f"Expected data for exactly one range for sheet {sheet_name}")
104
-
105
- all_row_data = range_data[0].rowData
106
- if not all_row_data:
107
- # the sheet is empty
108
- logger.warning(f"The sheet {sheet_name} (ID {spreadsheet_id}) is empty!")
109
- return []
110
-
111
- if len(all_row_data) != 1:
112
- raise Exception(f"Expected data for exactly one row for sheet {sheet_name}")
113
-
114
- first_row_data = all_row_data[0]
115
-
116
- return Helpers.get_formatted_row_values(first_row_data)
117
-
118
- @staticmethod
119
- def parse_sheet_and_column_names_from_catalog(catalog: ConfiguredAirbyteCatalog) -> Dict[str, FrozenSet[str]]:
120
- sheet_to_column_name = {}
121
- for configured_stream in catalog.streams:
122
- stream = configured_stream.stream
123
- sheet_name = stream.name
124
- sheet_to_column_name[sheet_name] = frozenset(stream.json_schema["properties"].keys())
125
-
126
- return sheet_to_column_name
127
-
128
- @staticmethod
129
- def row_data_to_record_message(sheet_name: str, cell_values: List[str], column_index_to_name: Dict[int, str]) -> AirbyteRecordMessage:
130
- data = {}
131
- for relevant_index in sorted(column_index_to_name.keys()):
132
- if relevant_index >= len(cell_values):
133
- break
134
-
135
- cell_value = cell_values[relevant_index]
136
- if cell_value.strip() != "":
137
- data[column_index_to_name[relevant_index]] = cell_value
138
-
139
- return AirbyteRecordMessage(stream=sheet_name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000)
140
-
141
- @staticmethod
142
- def get_available_sheets_to_column_index_to_name(
143
- client, spreadsheet_id: str, requested_sheets_and_columns: Dict[str, FrozenSet[str]], names_conversion: bool = False
144
- ) -> Dict[str, Dict[int, str]]:
145
- available_sheets = Helpers.get_sheets_in_spreadsheet(client, spreadsheet_id)
146
- logger.info(f"Available sheets: {available_sheets}")
147
- available_sheets_to_column_index_to_name = defaultdict(dict)
148
- for sheet, columns in requested_sheets_and_columns.items():
149
- if sheet in available_sheets:
150
- first_row = Helpers.get_first_row(client, spreadsheet_id, sheet)
151
- if names_conversion:
152
- first_row = [safe_name_conversion(h) for h in first_row]
153
- # When performing names conversion, they won't match what is listed in catalog for the majority of cases,
154
- # so they should be cast here in order to have them in records
155
- columns = {safe_name_conversion(c) for c in columns}
156
- # Find the column index of each header value
157
- idx = 0
158
- for cell_value in first_row:
159
- if cell_value in columns:
160
- available_sheets_to_column_index_to_name[sheet][idx] = cell_value
161
- idx += 1
162
- return available_sheets_to_column_index_to_name
163
-
164
- @staticmethod
165
- def get_sheets_in_spreadsheet(client, spreadsheet_id: str) -> List[str]:
166
- spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
167
- return [sheet.properties.title for sheet in spreadsheet_metadata.sheets]
168
-
169
- @staticmethod
170
- def get_sheet_row_count(client, spreadsheet_id: str) -> Dict[str, int]:
171
- spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
172
- # filter out sheets without gridProperties (like in diagram sheets)
173
- data_sheets = [sheet for sheet in spreadsheet_metadata.sheets if hasattr(sheet.properties, "gridProperties")]
174
- return {sheet.properties.title: sheet.properties.gridProperties["rowCount"] for sheet in data_sheets}
175
-
176
- @staticmethod
177
- def get_grid_sheets(spreadsheet_metadata) -> List[str]:
178
- """Return grid only diagram, filter out sheets with image/diagram only
179
-
180
- https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/sheets#sheetproperties
181
- """
182
- grid_sheets = []
183
- non_grid_sheets = []
184
- for sheet in spreadsheet_metadata.sheets:
185
- sheet_title = sheet.properties.title
186
- if (
187
- hasattr(sheet.properties, "gridProperties")
188
- and hasattr(sheet.properties, "sheetType")
189
- and sheet.properties.sheetType == "GRID"
190
- ):
191
- grid_sheets.append(sheet_title)
192
- else:
193
- non_grid_sheets.append(sheet_title)
194
-
195
- if non_grid_sheets:
196
- # logging.getLogger(...).log() expects an integer level. The level for WARN is 30
197
- # Reference: https://docs.python.org/3.10/library/logging.html#levels
198
- logging.getLogger("airbyte").log(30, "Skip non-grid sheets: " + ", ".join(non_grid_sheets))
199
-
200
- return grid_sheets
201
-
202
- @staticmethod
203
- def is_row_empty(cell_values: List[str]) -> bool:
204
- for cell in cell_values:
205
- if cell.strip() != "":
206
- return False
207
- return True
208
-
209
- @staticmethod
210
- def row_contains_relevant_data(cell_values: List[str], relevant_indices: Iterable[int]) -> bool:
211
- for idx in relevant_indices:
212
- if len(cell_values) > idx and cell_values[idx].strip() != "":
213
- return True
214
- return False
215
-
216
- @staticmethod
217
- def get_spreadsheet_id(id_or_url: str) -> str:
218
- if re.match(r"(https://)", id_or_url):
219
- # This is a URL
220
- m = re.search(r"(/)([-\w]{20,})([/]?)", id_or_url)
221
- if m is not None and m.group(2):
222
- return m.group(2)
223
- else:
224
- return id_or_url
225
-
226
- @staticmethod
227
- def check_sheet_is_valid(client, spreadsheet_id: str, sheet_name: str) -> Tuple[bool, str]:
228
- try:
229
- Helpers.get_first_row(client, spreadsheet_id, sheet_name)
230
- return True, ""
231
- except Exception as e:
232
- return False, str(e)