airbyte-source-google-sheets 0.8.5__py3-none-any.whl → 0.9.0rc1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- {airbyte_source_google_sheets-0.8.5.dist-info → airbyte_source_google_sheets-0.9.0rc1.dist-info}/METADATA +3 -7
- airbyte_source_google_sheets-0.9.0rc1.dist-info/RECORD +16 -0
- source_google_sheets/__init__.py +4 -0
- source_google_sheets/components/__init__.py +8 -0
- source_google_sheets/components/extractors.py +207 -0
- source_google_sheets/components/partition_routers.py +36 -0
- source_google_sheets/manifest.yaml +407 -0
- source_google_sheets/models/spreadsheet.py +1 -1
- source_google_sheets/models/spreadsheet_values.py +1 -1
- source_google_sheets/run.py +43 -5
- source_google_sheets/source.py +14 -271
- source_google_sheets/spec.yaml +4 -6
- source_google_sheets/utils.py +1 -1
- airbyte_source_google_sheets-0.8.5.dist-info/RECORD +0 -14
- source_google_sheets/client.py +0 -49
- source_google_sheets/helpers.py +0 -234
- {airbyte_source_google_sheets-0.8.5.dist-info → airbyte_source_google_sheets-0.9.0rc1.dist-info}/WHEEL +0 -0
- {airbyte_source_google_sheets-0.8.5.dist-info → airbyte_source_google_sheets-0.9.0rc1.dist-info}/entry_points.txt +0 -0
source_google_sheets/source.py
CHANGED
@@ -1,279 +1,22 @@
|
|
1
1
|
#
|
2
|
-
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
+
from typing import Any, Mapping, Optional
|
5
6
|
|
6
|
-
import json
|
7
|
-
import logging
|
8
|
-
import socket
|
9
|
-
from typing import Any, Generator, List, Mapping, MutableMapping, Optional, Union
|
7
|
+
from airbyte_cdk.models import ConfiguredAirbyteCatalog
|
8
|
+
from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource
|
9
|
+
from airbyte_cdk.sources.source import TState
|
10
10
|
|
11
|
-
from apiclient import errors
|
12
|
-
from google.auth import exceptions as google_exceptions
|
13
|
-
from requests.status_codes import codes as status_codes
|
14
11
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
AirbyteStateMessage,
|
21
|
-
AirbyteStreamStatus,
|
22
|
-
ConfiguredAirbyteCatalog,
|
23
|
-
Status,
|
24
|
-
Type,
|
25
|
-
)
|
26
|
-
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
27
|
-
from airbyte_cdk.sources.source import Source
|
28
|
-
from airbyte_cdk.sources.streams.checkpoint import FullRefreshCheckpointReader
|
29
|
-
from airbyte_cdk.utils import AirbyteTracedException
|
30
|
-
from airbyte_cdk.utils.stream_status_utils import as_airbyte_message
|
12
|
+
"""
|
13
|
+
This file provides the necessary constructs to interpret a provided declarative YAML configuration file into
|
14
|
+
a source connector.
|
15
|
+
WARNING: Do not modify this file.
|
16
|
+
"""
|
31
17
|
|
32
|
-
from .client import GoogleSheetsClient
|
33
|
-
from .helpers import Helpers
|
34
|
-
from .models.spreadsheet import Spreadsheet
|
35
|
-
from .models.spreadsheet_values import SpreadsheetValues
|
36
|
-
from .utils import exception_description_by_status_code, safe_name_conversion
|
37
18
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
socket.setdefaulttimeout(DEFAULT_SOCKET_TIMEOUT)
|
43
|
-
|
44
|
-
|
45
|
-
class SourceGoogleSheets(Source):
|
46
|
-
"""
|
47
|
-
Spreadsheets API Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets
|
48
|
-
"""
|
49
|
-
|
50
|
-
def check(self, logger: logging.Logger, config: json) -> AirbyteConnectionStatus:
|
51
|
-
# Check involves verifying that the specified spreadsheet is reachable with our credentials.
|
52
|
-
try:
|
53
|
-
client = GoogleSheetsClient(self.get_credentials(config))
|
54
|
-
except Exception as e:
|
55
|
-
return AirbyteConnectionStatus(status=Status.FAILED, message=f"Please use valid credentials json file. Error: {e}")
|
56
|
-
|
57
|
-
spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
|
58
|
-
|
59
|
-
try:
|
60
|
-
spreadsheet = client.get(spreadsheetId=spreadsheet_id, includeGridData=False)
|
61
|
-
except errors.HttpError as err:
|
62
|
-
message = "Config error: "
|
63
|
-
# Give a clearer message if it's a common error like 404.
|
64
|
-
if err.resp.status == status_codes.NOT_FOUND:
|
65
|
-
message += "The spreadsheet link is not valid. Enter the URL of the Google spreadsheet you want to sync."
|
66
|
-
raise AirbyteTracedException(
|
67
|
-
message=message,
|
68
|
-
internal_message=message,
|
69
|
-
failure_type=FailureType.config_error,
|
70
|
-
) from err
|
71
|
-
except google_exceptions.GoogleAuthError as err:
|
72
|
-
message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
|
73
|
-
raise AirbyteTracedException(
|
74
|
-
message=message,
|
75
|
-
internal_message=message,
|
76
|
-
failure_type=FailureType.config_error,
|
77
|
-
) from err
|
78
|
-
|
79
|
-
# Check for duplicate headers
|
80
|
-
spreadsheet_metadata = Spreadsheet.parse_obj(spreadsheet)
|
81
|
-
grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
|
82
|
-
|
83
|
-
duplicate_headers_in_sheet = {}
|
84
|
-
for sheet_name in grid_sheets:
|
85
|
-
try:
|
86
|
-
header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
|
87
|
-
if config.get("names_conversion"):
|
88
|
-
header_row_data = [safe_name_conversion(h) for h in header_row_data]
|
89
|
-
_, duplicate_headers = Helpers.get_valid_headers_and_duplicates(header_row_data)
|
90
|
-
if duplicate_headers:
|
91
|
-
duplicate_headers_in_sheet[sheet_name] = duplicate_headers
|
92
|
-
except Exception as err:
|
93
|
-
if str(err).startswith("Expected data for exactly one row for sheet"):
|
94
|
-
logger.warn(f"Skip empty sheet: {sheet_name}")
|
95
|
-
else:
|
96
|
-
logger.error(str(err))
|
97
|
-
return AirbyteConnectionStatus(
|
98
|
-
status=Status.FAILED, message=f"Unable to read the schema of sheet {sheet_name}. Error: {str(err)}"
|
99
|
-
)
|
100
|
-
if duplicate_headers_in_sheet:
|
101
|
-
duplicate_headers_error_message = ", ".join(
|
102
|
-
[
|
103
|
-
f"[sheet:{sheet_name}, headers:{duplicate_sheet_headers}]"
|
104
|
-
for sheet_name, duplicate_sheet_headers in duplicate_headers_in_sheet.items()
|
105
|
-
]
|
106
|
-
)
|
107
|
-
return AirbyteConnectionStatus(
|
108
|
-
status=Status.FAILED,
|
109
|
-
message="The following duplicate headers were found in the following sheets. Please fix them to continue: "
|
110
|
-
+ duplicate_headers_error_message,
|
111
|
-
)
|
112
|
-
|
113
|
-
return AirbyteConnectionStatus(status=Status.SUCCEEDED)
|
114
|
-
|
115
|
-
def discover(self, logger: logging.Logger, config: json) -> AirbyteCatalog:
|
116
|
-
client = GoogleSheetsClient(self.get_credentials(config))
|
117
|
-
spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
|
118
|
-
try:
|
119
|
-
logger.info(f"Running discovery on sheet {spreadsheet_id}")
|
120
|
-
spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
|
121
|
-
grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
|
122
|
-
streams = []
|
123
|
-
for sheet_name in grid_sheets:
|
124
|
-
try:
|
125
|
-
header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
|
126
|
-
if config.get("names_conversion"):
|
127
|
-
header_row_data = [safe_name_conversion(h) for h in header_row_data]
|
128
|
-
stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_row_data)
|
129
|
-
streams.append(stream)
|
130
|
-
except Exception as err:
|
131
|
-
if str(err).startswith("Expected data for exactly one row for sheet"):
|
132
|
-
logger.warn(f"Skip empty sheet: {sheet_name}")
|
133
|
-
else:
|
134
|
-
logger.error(str(err))
|
135
|
-
return AirbyteCatalog(streams=streams)
|
136
|
-
|
137
|
-
except errors.HttpError as err:
|
138
|
-
error_description = exception_description_by_status_code(err.resp.status, spreadsheet_id)
|
139
|
-
config_error_status_codes = [status_codes.NOT_FOUND, status_codes.FORBIDDEN]
|
140
|
-
if err.resp.status in config_error_status_codes:
|
141
|
-
message = f"{error_description}. {err.reason}."
|
142
|
-
raise AirbyteTracedException(
|
143
|
-
message=message,
|
144
|
-
internal_message=message,
|
145
|
-
failure_type=FailureType.config_error,
|
146
|
-
) from err
|
147
|
-
raise Exception(f"Could not discover the schema of your spreadsheet. {error_description}. {err.reason}.")
|
148
|
-
except google_exceptions.GoogleAuthError as err:
|
149
|
-
message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
|
150
|
-
raise AirbyteTracedException(
|
151
|
-
message=message,
|
152
|
-
internal_message=message,
|
153
|
-
failure_type=FailureType.config_error,
|
154
|
-
) from err
|
155
|
-
|
156
|
-
def _read(
|
157
|
-
self,
|
158
|
-
logger: logging.Logger,
|
159
|
-
config: json,
|
160
|
-
catalog: ConfiguredAirbyteCatalog,
|
161
|
-
state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
|
162
|
-
) -> Generator[AirbyteMessage, None, None]:
|
163
|
-
client = GoogleSheetsClient(self.get_credentials(config))
|
164
|
-
client.Backoff.row_batch_size = config.get("batch_size", 200)
|
165
|
-
|
166
|
-
sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
|
167
|
-
stream_instances = {s.stream.name: s.stream for s in catalog.streams}
|
168
|
-
state_manager = ConnectorStateManager(stream_instance_map=stream_instances, state=state or {})
|
169
|
-
spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
|
170
|
-
|
171
|
-
logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
|
172
|
-
# For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
|
173
|
-
# a blank row, emit the row batch
|
174
|
-
sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
|
175
|
-
client, spreadsheet_id, sheet_to_column_name, config.get("names_conversion")
|
176
|
-
)
|
177
|
-
sheet_row_counts = Helpers.get_sheet_row_count(client, spreadsheet_id)
|
178
|
-
logger.info(f"Row counts: {sheet_row_counts}")
|
179
|
-
for sheet in sheet_to_column_index_to_name.keys():
|
180
|
-
logger.info(f"Syncing sheet {sheet}")
|
181
|
-
stream = stream_instances.get(sheet)
|
182
|
-
yield as_airbyte_message(stream, AirbyteStreamStatus.STARTED)
|
183
|
-
checkpoint_reader = FullRefreshCheckpointReader([])
|
184
|
-
_ = checkpoint_reader.next()
|
185
|
-
# We revalidate the sheet here to avoid errors in case the sheet was changed after the sync started
|
186
|
-
is_valid, reason = Helpers.check_sheet_is_valid(client, spreadsheet_id, sheet)
|
187
|
-
if not is_valid:
|
188
|
-
logger.info(f"Skipping syncing sheet {sheet}: {reason}")
|
189
|
-
yield self._checkpoint_state(checkpoint_reader.get_checkpoint(), state_manager, sheet, None)
|
190
|
-
yield as_airbyte_message(stream, AirbyteStreamStatus.INCOMPLETE)
|
191
|
-
continue
|
192
|
-
|
193
|
-
column_index_to_name = sheet_to_column_index_to_name[sheet]
|
194
|
-
row_cursor = 2 # we start syncing past the header row
|
195
|
-
# For the loop, it is necessary that the initial row exists when we send a request to the API,
|
196
|
-
# if the last row of the interval goes outside the sheet - this is normal, we will return
|
197
|
-
# only the real data of the sheet and in the next iteration we will loop out.
|
198
|
-
while row_cursor <= sheet_row_counts[sheet]:
|
199
|
-
row_batch = SpreadsheetValues.parse_obj(
|
200
|
-
client.get_values(
|
201
|
-
sheet=sheet,
|
202
|
-
row_cursor=row_cursor,
|
203
|
-
spreadsheetId=spreadsheet_id,
|
204
|
-
majorDimension="ROWS",
|
205
|
-
)
|
206
|
-
)
|
207
|
-
|
208
|
-
row_cursor += client.Backoff.row_batch_size + 1
|
209
|
-
# there should always be one range since we requested only one
|
210
|
-
value_ranges = row_batch.valueRanges[0]
|
211
|
-
|
212
|
-
if not value_ranges.values:
|
213
|
-
break
|
214
|
-
|
215
|
-
row_values = value_ranges.values
|
216
|
-
if len(row_values) == 0:
|
217
|
-
break
|
218
|
-
|
219
|
-
yield as_airbyte_message(stream, AirbyteStreamStatus.RUNNING)
|
220
|
-
for row in row_values:
|
221
|
-
if not Helpers.is_row_empty(row) and Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
|
222
|
-
yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
|
223
|
-
|
224
|
-
yield self._checkpoint_state(checkpoint_reader.get_checkpoint(), state_manager, sheet, None)
|
225
|
-
yield as_airbyte_message(stream, AirbyteStreamStatus.COMPLETE)
|
226
|
-
|
227
|
-
def _checkpoint_state(
|
228
|
-
self,
|
229
|
-
stream_state: Mapping[str, Any],
|
230
|
-
state_manager,
|
231
|
-
stream_name: str,
|
232
|
-
stream_namespace: Optional[str],
|
233
|
-
) -> AirbyteMessage:
|
234
|
-
state_manager.update_state_for_stream(stream_name, stream_namespace, stream_state)
|
235
|
-
return state_manager.create_state_message(stream_name, stream_namespace)
|
236
|
-
|
237
|
-
def read(
|
238
|
-
self,
|
239
|
-
logger: logging.Logger,
|
240
|
-
config: json,
|
241
|
-
catalog: ConfiguredAirbyteCatalog,
|
242
|
-
state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
|
243
|
-
) -> Generator[AirbyteMessage, None, None]:
|
244
|
-
spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
|
245
|
-
try:
|
246
|
-
yield from self._read(logger, config, catalog, state)
|
247
|
-
except errors.HttpError as e:
|
248
|
-
error_description = exception_description_by_status_code(e.status_code, spreadsheet_id)
|
249
|
-
|
250
|
-
if e.status_code == status_codes.FORBIDDEN:
|
251
|
-
raise AirbyteTracedException(
|
252
|
-
message=f"Stopped syncing process. {error_description}",
|
253
|
-
internal_message=error_description,
|
254
|
-
failure_type=FailureType.config_error,
|
255
|
-
) from e
|
256
|
-
if e.status_code == status_codes.TOO_MANY_REQUESTS:
|
257
|
-
raise AirbyteTracedException(
|
258
|
-
message=f"Stopped syncing process due to rate limits. {error_description}",
|
259
|
-
internal_message=error_description,
|
260
|
-
failure_type=FailureType.transient_error,
|
261
|
-
) from e
|
262
|
-
else:
|
263
|
-
logger.info(f"{e.status_code}: {e.reason}. {error_description}")
|
264
|
-
raise AirbyteTracedException(
|
265
|
-
message=f"Stopped syncing process. {error_description}",
|
266
|
-
internal_message=error_description,
|
267
|
-
failure_type=FailureType.transient_error,
|
268
|
-
) from e
|
269
|
-
finally:
|
270
|
-
logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
|
271
|
-
|
272
|
-
@staticmethod
|
273
|
-
def get_credentials(config):
|
274
|
-
# backward compatible with old style config
|
275
|
-
if config.get("credentials_json"):
|
276
|
-
credentials = {"auth_type": "Service", "service_account_info": config.get("credentials_json")}
|
277
|
-
return credentials
|
278
|
-
|
279
|
-
return config.get("credentials")
|
19
|
+
# Declarative Source
|
20
|
+
class SourceGoogleSheets(YamlDeclarativeSource):
|
21
|
+
def __init__(self, catalog: Optional[ConfiguredAirbyteCatalog], config: Optional[Mapping[str, Any]], state: TState, **kwargs):
|
22
|
+
super().__init__(catalog=catalog, config=config, state=state, **{"path_to_yaml": "manifest.yaml"})
|
source_google_sheets/spec.yaml
CHANGED
@@ -12,17 +12,15 @@ connectionSpecification:
|
|
12
12
|
type: integer
|
13
13
|
title: Row Batch Size
|
14
14
|
description: >-
|
15
|
-
Default value is 200.
|
15
|
+
Default value is 1000000.
|
16
16
|
An integer representing row batch size for each sent request to Google Sheets API.
|
17
|
-
Row batch size means how many rows are processed from the google sheet, for example default value 200
|
18
|
-
would process rows
|
17
|
+
Row batch size means how many rows are processed from the google sheet, for example default value 1000000
|
18
|
+
would process rows 2-1000002, then 1000003-2000003 and so on.
|
19
19
|
Based on <a href='https://developers.google.com/sheets/api/limits'>Google Sheets API limits documentation</a>,
|
20
20
|
it is possible to send up to 300 requests per minute, but each individual request has to be processed under 180 seconds,
|
21
21
|
otherwise the request returns a timeout error. In regards to this information, consider network speed and
|
22
22
|
number of columns of the google sheet when deciding a batch_size value.
|
23
|
-
|
24
|
-
consider increasing batch_size value.
|
25
|
-
default: 200
|
23
|
+
default: 1000000
|
26
24
|
spreadsheet_id:
|
27
25
|
type: string
|
28
26
|
title: Spreadsheet Link
|
source_google_sheets/utils.py
CHANGED
airbyte_source_google_sheets-0.8.5.dist-info/RECORD
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
source_google_sheets/__init__.py,sha256=-aGVMRfrgWjYad3_cHofIptEEa5WMQzTvFD92HevQfw,73
|
2
|
-
source_google_sheets/client.py,sha256=A2BynDswjJ4naSyfjJr9G8fAfGEqLlNXP5vlbAsI3_s,1917
|
3
|
-
source_google_sheets/helpers.py,sha256=CbXNlEfC3sNMDTGNPb22DcalWoXfv7kAYs7LmNM76Ec,10644
|
4
|
-
source_google_sheets/models/__init__.py,sha256=Z-4MTpxG5t2jGhXzs4PPoIOa83zw3jRnUDx0N9Puv3s,61
|
5
|
-
source_google_sheets/models/spreadsheet.py,sha256=fsHREpPEN36wCzGdqgfJ2EVW40UDZ_lS863A4XT2pGo,1112
|
6
|
-
source_google_sheets/models/spreadsheet_values.py,sha256=y8ytuTqwpziJ2ICl0xhlRWgjMkxTfxOalRd414PMHZM,440
|
7
|
-
source_google_sheets/run.py,sha256=_f5-LNqMzBuHtCD1YoUBxnA0fszgqmdNGcN7y_AmXU0,237
|
8
|
-
source_google_sheets/source.py,sha256=kuHugJM9VOEPsArkP4lo88nBrS8ipk1yAoUi7cVSYZY,13506
|
9
|
-
source_google_sheets/spec.yaml,sha256=WrPdH2xLCdyM-kY-pRqbwICcNPhv8nqnb2gdbslTsaQ,5141
|
10
|
-
source_google_sheets/utils.py,sha256=DI53ARcKln77ekvuzsb3x35O9aMgZ_9OY9ets0FtI24,2290
|
11
|
-
airbyte_source_google_sheets-0.8.5.dist-info/METADATA,sha256=LkVsOKPv_UsgFioRkUDxOiDG8kO6jFF_F7MYl3zkXy8,5551
|
12
|
-
airbyte_source_google_sheets-0.8.5.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
|
13
|
-
airbyte_source_google_sheets-0.8.5.dist-info/entry_points.txt,sha256=Dtsfjohe5IPUFyqojk49SIoP7CifCTlNLG_pgivzppo,69
|
14
|
-
airbyte_source_google_sheets-0.8.5.dist-info/RECORD,,
|
source_google_sheets/client.py
DELETED
@@ -1,49 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
|
-
#
|
4
|
-
|
5
|
-
import logging
|
6
|
-
from typing import Dict, List
|
7
|
-
|
8
|
-
import backoff
|
9
|
-
from googleapiclient import errors
|
10
|
-
from requests import codes as status_codes
|
11
|
-
|
12
|
-
from .helpers import SCOPES, Helpers
|
13
|
-
|
14
|
-
|
15
|
-
logger = logging.getLogger("airbyte")
|
16
|
-
|
17
|
-
|
18
|
-
class GoogleSheetsClient:
|
19
|
-
class Backoff:
|
20
|
-
row_batch_size = 200
|
21
|
-
|
22
|
-
@classmethod
|
23
|
-
def increase_row_batch_size(cls, details):
|
24
|
-
if details["exception"].status_code == status_codes.TOO_MANY_REQUESTS and cls.row_batch_size < 1000:
|
25
|
-
cls.row_batch_size = cls.row_batch_size + 100
|
26
|
-
logger.info(f"Increasing number of records fetching due to rate limits. Current value: {cls.row_batch_size}")
|
27
|
-
|
28
|
-
@staticmethod
|
29
|
-
def give_up(error):
|
30
|
-
code = error.resp.status
|
31
|
-
# Stop retrying if it's not a problem with the rate limit or on the server end
|
32
|
-
return not (code == status_codes.TOO_MANY_REQUESTS or 500 <= code < 600)
|
33
|
-
|
34
|
-
def __init__(self, credentials: Dict[str, str], scopes: List[str] = SCOPES):
|
35
|
-
self.client = Helpers.get_authenticated_sheets_client(credentials, scopes)
|
36
|
-
|
37
|
-
@backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
|
38
|
-
def get(self, **kwargs):
|
39
|
-
return self.client.get(**kwargs).execute()
|
40
|
-
|
41
|
-
@backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
|
42
|
-
def get_values(self, **kwargs):
|
43
|
-
range = self._create_range(kwargs.pop("sheet"), kwargs.pop("row_cursor"))
|
44
|
-
logger.info(f"Fetching range {range}")
|
45
|
-
return self.client.values().batchGet(ranges=range, **kwargs).execute()
|
46
|
-
|
47
|
-
def _create_range(self, sheet, row_cursor):
|
48
|
-
range = f"{sheet}!{row_cursor}:{row_cursor + self.Backoff.row_batch_size}"
|
49
|
-
return range
|
source_google_sheets/helpers.py
DELETED
@@ -1,234 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
|
-
#
|
4
|
-
|
5
|
-
import json
|
6
|
-
import logging
|
7
|
-
import re
|
8
|
-
from collections import defaultdict
|
9
|
-
from datetime import datetime
|
10
|
-
from typing import Dict, FrozenSet, Iterable, List, Tuple
|
11
|
-
|
12
|
-
from google.oauth2 import credentials as client_account
|
13
|
-
from google.oauth2 import service_account
|
14
|
-
from googleapiclient import discovery
|
15
|
-
|
16
|
-
from airbyte_cdk.models.airbyte_protocol import AirbyteRecordMessage, AirbyteStream, ConfiguredAirbyteCatalog, SyncMode
|
17
|
-
|
18
|
-
from .models.spreadsheet import RowData, Spreadsheet
|
19
|
-
from .utils import safe_name_conversion
|
20
|
-
|
21
|
-
|
22
|
-
SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly", "https://www.googleapis.com/auth/drive.readonly"]
|
23
|
-
|
24
|
-
logger = logging.getLogger("airbyte")
|
25
|
-
|
26
|
-
|
27
|
-
class Helpers(object):
|
28
|
-
@staticmethod
|
29
|
-
def get_authenticated_sheets_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
|
30
|
-
creds = Helpers.get_authenticated_google_credentials(credentials, scopes)
|
31
|
-
return discovery.build("sheets", "v4", credentials=creds).spreadsheets()
|
32
|
-
|
33
|
-
@staticmethod
|
34
|
-
def get_authenticated_drive_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
|
35
|
-
creds = Helpers.get_authenticated_google_credentials(credentials, scopes)
|
36
|
-
return discovery.build("drive", "v3", credentials=creds)
|
37
|
-
|
38
|
-
@staticmethod
|
39
|
-
def get_authenticated_google_credentials(credentials: Dict[str, str], scopes: List[str] = SCOPES):
|
40
|
-
auth_type = credentials.pop("auth_type")
|
41
|
-
if auth_type == "Service":
|
42
|
-
return service_account.Credentials.from_service_account_info(json.loads(credentials["service_account_info"]), scopes=scopes)
|
43
|
-
elif auth_type == "Client":
|
44
|
-
return client_account.Credentials.from_authorized_user_info(info=credentials)
|
45
|
-
|
46
|
-
@staticmethod
|
47
|
-
def headers_to_airbyte_stream(logger: logging.Logger, sheet_name: str, header_row_values: List[str]) -> AirbyteStream:
|
48
|
-
"""
|
49
|
-
Parses sheet headers from the provided row. This method assumes that data is contiguous
|
50
|
-
i.e: every cell contains a value and the first cell which does not contain a value denotes the end
|
51
|
-
of the headers. For example, if the first row contains "One | Two | | Three" then this method
|
52
|
-
will parse the headers as ["One", "Two"]. This assumption is made for simplicity and can be modified later.
|
53
|
-
"""
|
54
|
-
fields, duplicate_fields = Helpers.get_valid_headers_and_duplicates(header_row_values)
|
55
|
-
if duplicate_fields:
|
56
|
-
logger.warn(f"Duplicate headers found in {sheet_name}. Ignoring them: {duplicate_fields}")
|
57
|
-
|
58
|
-
sheet_json_schema = {
|
59
|
-
"$schema": "http://json-schema.org/draft-07/schema#",
|
60
|
-
"type": "object",
|
61
|
-
# For simplicity, the type of every cell is a string
|
62
|
-
"properties": {field: {"type": "string"} for field in fields},
|
63
|
-
}
|
64
|
-
|
65
|
-
return AirbyteStream(name=sheet_name, json_schema=sheet_json_schema, supported_sync_modes=[SyncMode.full_refresh])
|
66
|
-
|
67
|
-
@staticmethod
|
68
|
-
def get_valid_headers_and_duplicates(header_row_values: List[str]) -> (List[str], List[str]):
|
69
|
-
fields = []
|
70
|
-
duplicate_fields = set()
|
71
|
-
for cell_value in header_row_values:
|
72
|
-
if cell_value:
|
73
|
-
if cell_value in fields:
|
74
|
-
duplicate_fields.add(cell_value)
|
75
|
-
else:
|
76
|
-
fields.append(cell_value)
|
77
|
-
else:
|
78
|
-
break
|
79
|
-
|
80
|
-
# Removing all duplicate fields
|
81
|
-
if duplicate_fields:
|
82
|
-
fields = [field for field in fields if field not in duplicate_fields]
|
83
|
-
|
84
|
-
return fields, list(duplicate_fields)
|
85
|
-
|
86
|
-
@staticmethod
|
87
|
-
def get_formatted_row_values(row_data: RowData) -> List[str]:
|
88
|
-
"""
|
89
|
-
Gets the formatted values of all cell data in this row. A formatted value is the final value a user sees in a spreadsheet.
|
90
|
-
It can be a raw string input by the user, or the result of a sheets function call.
|
91
|
-
"""
|
92
|
-
return [value.formattedValue for value in row_data.values]
|
93
|
-
|
94
|
-
@staticmethod
|
95
|
-
def get_first_row(client, spreadsheet_id: str, sheet_name: str) -> List[str]:
|
96
|
-
spreadsheet = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=True, ranges=f"{sheet_name}!1:1"))
|
97
|
-
|
98
|
-
# There is only one sheet since we are specifying the sheet in the requested ranges.
|
99
|
-
returned_sheets = spreadsheet.sheets
|
100
|
-
if len(returned_sheets) != 1:
|
101
|
-
raise Exception(f"Unexpected return result: Sheet {sheet_name} was expected to contain data on exactly 1 sheet. ")
|
102
|
-
|
103
|
-
range_data = returned_sheets[0].data
|
104
|
-
if len(range_data) != 1:
|
105
|
-
raise Exception(f"Expected data for exactly one range for sheet {sheet_name}")
|
106
|
-
|
107
|
-
all_row_data = range_data[0].rowData
|
108
|
-
if not all_row_data:
|
109
|
-
# the sheet is empty
|
110
|
-
logger.warning(f"The sheet {sheet_name} (ID {spreadsheet_id}) is empty!")
|
111
|
-
return []
|
112
|
-
|
113
|
-
if len(all_row_data) != 1:
|
114
|
-
raise Exception(f"Expected data for exactly one row for sheet {sheet_name}")
|
115
|
-
|
116
|
-
first_row_data = all_row_data[0]
|
117
|
-
|
118
|
-
return Helpers.get_formatted_row_values(first_row_data)
|
119
|
-
|
120
|
-
@staticmethod
|
121
|
-
def parse_sheet_and_column_names_from_catalog(catalog: ConfiguredAirbyteCatalog) -> Dict[str, FrozenSet[str]]:
|
122
|
-
sheet_to_column_name = {}
|
123
|
-
for configured_stream in catalog.streams:
|
124
|
-
stream = configured_stream.stream
|
125
|
-
sheet_name = stream.name
|
126
|
-
sheet_to_column_name[sheet_name] = frozenset(stream.json_schema["properties"].keys())
|
127
|
-
|
128
|
-
return sheet_to_column_name
|
129
|
-
|
130
|
-
@staticmethod
|
131
|
-
def row_data_to_record_message(sheet_name: str, cell_values: List[str], column_index_to_name: Dict[int, str]) -> AirbyteRecordMessage:
|
132
|
-
data = {}
|
133
|
-
for relevant_index in sorted(column_index_to_name.keys()):
|
134
|
-
if relevant_index >= len(cell_values):
|
135
|
-
break
|
136
|
-
|
137
|
-
cell_value = cell_values[relevant_index]
|
138
|
-
if cell_value.strip() != "":
|
139
|
-
data[column_index_to_name[relevant_index]] = cell_value
|
140
|
-
|
141
|
-
return AirbyteRecordMessage(stream=sheet_name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000)
|
142
|
-
|
143
|
-
@staticmethod
|
144
|
-
def get_available_sheets_to_column_index_to_name(
|
145
|
-
client, spreadsheet_id: str, requested_sheets_and_columns: Dict[str, FrozenSet[str]], names_conversion: bool = False
|
146
|
-
) -> Dict[str, Dict[int, str]]:
|
147
|
-
available_sheets = Helpers.get_sheets_in_spreadsheet(client, spreadsheet_id)
|
148
|
-
logger.info(f"Available sheets: {available_sheets}")
|
149
|
-
available_sheets_to_column_index_to_name = defaultdict(dict)
|
150
|
-
for sheet, columns in requested_sheets_and_columns.items():
|
151
|
-
if sheet in available_sheets:
|
152
|
-
first_row = Helpers.get_first_row(client, spreadsheet_id, sheet)
|
153
|
-
if names_conversion:
|
154
|
-
first_row = [safe_name_conversion(h) for h in first_row]
|
155
|
-
# When performing names conversion, they won't match what is listed in catalog for the majority of cases,
|
156
|
-
# so they should be cast here in order to have them in records
|
157
|
-
columns = {safe_name_conversion(c) for c in columns}
|
158
|
-
# Find the column index of each header value
|
159
|
-
idx = 0
|
160
|
-
for cell_value in first_row:
|
161
|
-
if cell_value in columns:
|
162
|
-
available_sheets_to_column_index_to_name[sheet][idx] = cell_value
|
163
|
-
idx += 1
|
164
|
-
return available_sheets_to_column_index_to_name
|
165
|
-
|
166
|
-
@staticmethod
|
167
|
-
def get_sheets_in_spreadsheet(client, spreadsheet_id: str) -> List[str]:
|
168
|
-
spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
|
169
|
-
return [sheet.properties.title for sheet in spreadsheet_metadata.sheets]
|
170
|
-
|
171
|
-
@staticmethod
|
172
|
-
def get_sheet_row_count(client, spreadsheet_id: str) -> Dict[str, int]:
|
173
|
-
spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
|
174
|
-
# filter out sheets without gridProperties (like in diagram sheets)
|
175
|
-
data_sheets = [sheet for sheet in spreadsheet_metadata.sheets if hasattr(sheet.properties, "gridProperties")]
|
176
|
-
return {sheet.properties.title: sheet.properties.gridProperties["rowCount"] for sheet in data_sheets}
|
177
|
-
|
178
|
-
@staticmethod
|
179
|
-
def get_grid_sheets(spreadsheet_metadata) -> List[str]:
|
180
|
-
"""Return grid only diagram, filter out sheets with image/diagram only
|
181
|
-
|
182
|
-
https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/sheets#sheetproperties
|
183
|
-
"""
|
184
|
-
grid_sheets = []
|
185
|
-
non_grid_sheets = []
|
186
|
-
for sheet in spreadsheet_metadata.sheets:
|
187
|
-
sheet_title = sheet.properties.title
|
188
|
-
if (
|
189
|
-
hasattr(sheet.properties, "gridProperties")
|
190
|
-
and hasattr(sheet.properties, "sheetType")
|
191
|
-
and sheet.properties.sheetType == "GRID"
|
192
|
-
):
|
193
|
-
grid_sheets.append(sheet_title)
|
194
|
-
else:
|
195
|
-
non_grid_sheets.append(sheet_title)
|
196
|
-
|
197
|
-
if non_grid_sheets:
|
198
|
-
# logging.getLogger(...).log() expects an integer level. The level for WARN is 30
|
199
|
-
# Reference: https://docs.python.org/3.10/library/logging.html#levels
|
200
|
-
logging.getLogger("airbyte").log(30, "Skip non-grid sheets: " + ", ".join(non_grid_sheets))
|
201
|
-
|
202
|
-
return grid_sheets
|
203
|
-
|
204
|
-
@staticmethod
|
205
|
-
def is_row_empty(cell_values: List[str]) -> bool:
|
206
|
-
for cell in cell_values:
|
207
|
-
if cell.strip() != "":
|
208
|
-
return False
|
209
|
-
return True
|
210
|
-
|
211
|
-
@staticmethod
|
212
|
-
def row_contains_relevant_data(cell_values: List[str], relevant_indices: Iterable[int]) -> bool:
|
213
|
-
for idx in relevant_indices:
|
214
|
-
if len(cell_values) > idx and cell_values[idx].strip() != "":
|
215
|
-
return True
|
216
|
-
return False
|
217
|
-
|
218
|
-
@staticmethod
|
219
|
-
def get_spreadsheet_id(id_or_url: str) -> str:
|
220
|
-
if re.match(r"(https://)", id_or_url):
|
221
|
-
# This is a URL
|
222
|
-
m = re.search(r"(/)([-\w]{20,})([/]?)", id_or_url)
|
223
|
-
if m is not None and m.group(2):
|
224
|
-
return m.group(2)
|
225
|
-
else:
|
226
|
-
return id_or_url
|
227
|
-
|
228
|
-
@staticmethod
|
229
|
-
def check_sheet_is_valid(client, spreadsheet_id: str, sheet_name: str) -> Tuple[bool, str]:
|
230
|
-
try:
|
231
|
-
Helpers.get_first_row(client, spreadsheet_id, sheet_name)
|
232
|
-
return True, ""
|
233
|
-
except Exception as e:
|
234
|
-
return False, str(e)
|
File without changes
|
File without changes
|