airbyte-source-google-sheets 0.8.5__py3-none-any.whl → 0.9.0rc2__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- {airbyte_source_google_sheets-0.8.5.dist-info → airbyte_source_google_sheets-0.9.0rc2.dist-info}/METADATA +3 -7
- airbyte_source_google_sheets-0.9.0rc2.dist-info/RECORD +16 -0
- source_google_sheets/__init__.py +4 -0
- source_google_sheets/components/__init__.py +8 -0
- source_google_sheets/components/extractors.py +207 -0
- source_google_sheets/components/partition_routers.py +36 -0
- source_google_sheets/manifest.yaml +407 -0
- source_google_sheets/models/spreadsheet.py +1 -1
- source_google_sheets/models/spreadsheet_values.py +1 -1
- source_google_sheets/run.py +43 -5
- source_google_sheets/source.py +14 -271
- source_google_sheets/spec.yaml +4 -6
- source_google_sheets/utils.py +1 -1
- airbyte_source_google_sheets-0.8.5.dist-info/RECORD +0 -14
- source_google_sheets/client.py +0 -49
- source_google_sheets/helpers.py +0 -234
- {airbyte_source_google_sheets-0.8.5.dist-info → airbyte_source_google_sheets-0.9.0rc2.dist-info}/WHEEL +0 -0
- {airbyte_source_google_sheets-0.8.5.dist-info → airbyte_source_google_sheets-0.9.0rc2.dist-info}/entry_points.txt +0 -0
source_google_sheets/source.py
CHANGED

```diff
@@ -1,279 +1,22 @@
 #
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
 #
 
+from typing import Any, Mapping, Optional
 
-import json
-import logging
-import socket
-from typing import Any, Generator, List, Mapping, MutableMapping, Optional, Union
+from airbyte_cdk.models import ConfiguredAirbyteCatalog
+from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource
+from airbyte_cdk.sources.source import TState
 
-from apiclient import errors
-from google.auth import exceptions as google_exceptions
-from requests.status_codes import codes as status_codes
 
-from airbyte_cdk.models import (
-    AirbyteCatalog,
-    AirbyteConnectionStatus,
-    AirbyteMessage,
-    AirbyteStateMessage,
-    AirbyteStreamStatus,
-    ConfiguredAirbyteCatalog,
-    FailureType,
-    Status,
-    Type,
-)
-from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
-from airbyte_cdk.sources.source import Source
-from airbyte_cdk.sources.streams.checkpoint import FullRefreshCheckpointReader
-from airbyte_cdk.utils import AirbyteTracedException
-from airbyte_cdk.utils.stream_status_utils import as_airbyte_message
+"""
+This file provides the necessary constructs to interpret a provided declarative YAML configuration file into
+source connector.
+WARNING: Do not modify this file.
+"""
 
-from .client import GoogleSheetsClient
-from .helpers import Helpers
-from .models.spreadsheet import Spreadsheet
-from .models.spreadsheet_values import SpreadsheetValues
-from .utils import exception_description_by_status_code, safe_name_conversion
 
-
-# override default socket timeout to be 10 mins instead of 60 sec.
-# on behalf of https://github.com/airbytehq/oncall/issues/242
-DEFAULT_SOCKET_TIMEOUT: int = 600
-socket.setdefaulttimeout(DEFAULT_SOCKET_TIMEOUT)
-
-
-class SourceGoogleSheets(Source):
-    """
-    Spreadsheets API Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets
-    """
-
-    def check(self, logger: logging.Logger, config: json) -> AirbyteConnectionStatus:
-        # Check involves verifying that the specified spreadsheet is reachable with our credentials.
-        try:
-            client = GoogleSheetsClient(self.get_credentials(config))
-        except Exception as e:
-            return AirbyteConnectionStatus(status=Status.FAILED, message=f"Please use valid credentials json file. Error: {e}")
-
-        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-
-        try:
-            spreadsheet = client.get(spreadsheetId=spreadsheet_id, includeGridData=False)
-        except errors.HttpError as err:
-            message = "Config error: "
-            # Give a clearer message if it's a common error like 404.
-            if err.resp.status == status_codes.NOT_FOUND:
-                message += "The spreadsheet link is not valid. Enter the URL of the Google spreadsheet you want to sync."
-            raise AirbyteTracedException(
-                message=message,
-                internal_message=message,
-                failure_type=FailureType.config_error,
-            ) from err
-        except google_exceptions.GoogleAuthError as err:
-            message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
-            raise AirbyteTracedException(
-                message=message,
-                internal_message=message,
-                failure_type=FailureType.config_error,
-            ) from err
-
-        # Check for duplicate headers
-        spreadsheet_metadata = Spreadsheet.parse_obj(spreadsheet)
-        grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
-
-        duplicate_headers_in_sheet = {}
-        for sheet_name in grid_sheets:
-            try:
-                header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
-                if config.get("names_conversion"):
-                    header_row_data = [safe_name_conversion(h) for h in header_row_data]
-                _, duplicate_headers = Helpers.get_valid_headers_and_duplicates(header_row_data)
-                if duplicate_headers:
-                    duplicate_headers_in_sheet[sheet_name] = duplicate_headers
-            except Exception as err:
-                if str(err).startswith("Expected data for exactly one row for sheet"):
-                    logger.warn(f"Skip empty sheet: {sheet_name}")
-                else:
-                    logger.error(str(err))
-                    return AirbyteConnectionStatus(
-                        status=Status.FAILED, message=f"Unable to read the schema of sheet {sheet_name}. Error: {str(err)}"
-                    )
-        if duplicate_headers_in_sheet:
-            duplicate_headers_error_message = ", ".join(
-                [
-                    f"[sheet:{sheet_name}, headers:{duplicate_sheet_headers}]"
-                    for sheet_name, duplicate_sheet_headers in duplicate_headers_in_sheet.items()
-                ]
-            )
-            return AirbyteConnectionStatus(
-                status=Status.FAILED,
-                message="The following duplicate headers were found in the following sheets. Please fix them to continue: "
-                + duplicate_headers_error_message,
-            )
-
-        return AirbyteConnectionStatus(status=Status.SUCCEEDED)
-
-    def discover(self, logger: logging.Logger, config: json) -> AirbyteCatalog:
-        client = GoogleSheetsClient(self.get_credentials(config))
-        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-        try:
-            logger.info(f"Running discovery on sheet {spreadsheet_id}")
-            spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
-            grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
-            streams = []
-            for sheet_name in grid_sheets:
-                try:
-                    header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
-                    if config.get("names_conversion"):
-                        header_row_data = [safe_name_conversion(h) for h in header_row_data]
-                    stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_row_data)
-                    streams.append(stream)
-                except Exception as err:
-                    if str(err).startswith("Expected data for exactly one row for sheet"):
-                        logger.warn(f"Skip empty sheet: {sheet_name}")
-                    else:
-                        logger.error(str(err))
-            return AirbyteCatalog(streams=streams)
-
-        except errors.HttpError as err:
-            error_description = exception_description_by_status_code(err.resp.status, spreadsheet_id)
-            config_error_status_codes = [status_codes.NOT_FOUND, status_codes.FORBIDDEN]
-            if err.resp.status in config_error_status_codes:
-                message = f"{error_description}. {err.reason}."
-                raise AirbyteTracedException(
-                    message=message,
-                    internal_message=message,
-                    failure_type=FailureType.config_error,
-                ) from err
-            raise Exception(f"Could not discover the schema of your spreadsheet. {error_description}. {err.reason}.")
-        except google_exceptions.GoogleAuthError as err:
-            message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
-            raise AirbyteTracedException(
-                message=message,
-                internal_message=message,
-                failure_type=FailureType.config_error,
-            ) from err
-
-    def _read(
-        self,
-        logger: logging.Logger,
-        config: json,
-        catalog: ConfiguredAirbyteCatalog,
-        state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
-    ) -> Generator[AirbyteMessage, None, None]:
-        client = GoogleSheetsClient(self.get_credentials(config))
-        client.Backoff.row_batch_size = config.get("batch_size", 200)
-
-        sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
-        stream_instances = {s.stream.name: s.stream for s in catalog.streams}
-        state_manager = ConnectorStateManager(stream_instance_map=stream_instances, state=state or {})
-        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-
-        logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
-        # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
-        # a blank row, emit the row batch
-        sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
-            client, spreadsheet_id, sheet_to_column_name, config.get("names_conversion")
-        )
-        sheet_row_counts = Helpers.get_sheet_row_count(client, spreadsheet_id)
-        logger.info(f"Row counts: {sheet_row_counts}")
-        for sheet in sheet_to_column_index_to_name.keys():
-            logger.info(f"Syncing sheet {sheet}")
-            stream = stream_instances.get(sheet)
-            yield as_airbyte_message(stream, AirbyteStreamStatus.STARTED)
-            checkpoint_reader = FullRefreshCheckpointReader([])
-            _ = checkpoint_reader.next()
-            # We revalidate the sheet here to avoid errors in case the sheet was changed after the sync started
-            is_valid, reason = Helpers.check_sheet_is_valid(client, spreadsheet_id, sheet)
-            if not is_valid:
-                logger.info(f"Skipping syncing sheet {sheet}: {reason}")
-                yield self._checkpoint_state(checkpoint_reader.get_checkpoint(), state_manager, sheet, None)
-                yield as_airbyte_message(stream, AirbyteStreamStatus.INCOMPLETE)
-                continue
-
-            column_index_to_name = sheet_to_column_index_to_name[sheet]
-            row_cursor = 2  # we start syncing past the header row
-            # For the loop, it is necessary that the initial row exists when we send a request to the API,
-            # if the last row of the interval goes outside the sheet - this is normal, we will return
-            # only the real data of the sheet and in the next iteration we will loop out.
-            while row_cursor <= sheet_row_counts[sheet]:
-                row_batch = SpreadsheetValues.parse_obj(
-                    client.get_values(
-                        sheet=sheet,
-                        row_cursor=row_cursor,
-                        spreadsheetId=spreadsheet_id,
-                        majorDimension="ROWS",
-                    )
-                )
-
-                row_cursor += client.Backoff.row_batch_size + 1
-                # there should always be one range since we requested only one
-                value_ranges = row_batch.valueRanges[0]
-
-                if not value_ranges.values:
-                    break
-
-                row_values = value_ranges.values
-                if len(row_values) == 0:
-                    break
-
-                yield as_airbyte_message(stream, AirbyteStreamStatus.RUNNING)
-                for row in row_values:
-                    if not Helpers.is_row_empty(row) and Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
-                        yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
-
-            yield self._checkpoint_state(checkpoint_reader.get_checkpoint(), state_manager, sheet, None)
-            yield as_airbyte_message(stream, AirbyteStreamStatus.COMPLETE)
-
-    def _checkpoint_state(
-        self,
-        stream_state: Mapping[str, Any],
-        state_manager,
-        stream_name: str,
-        stream_namespace: Optional[str],
-    ) -> AirbyteMessage:
-        state_manager.update_state_for_stream(stream_name, stream_namespace, stream_state)
-        return state_manager.create_state_message(stream_name, stream_namespace)
-
-    def read(
-        self,
-        logger: logging.Logger,
-        config: json,
-        catalog: ConfiguredAirbyteCatalog,
-        state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
-    ) -> Generator[AirbyteMessage, None, None]:
-        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-        try:
-            yield from self._read(logger, config, catalog, state)
-        except errors.HttpError as e:
-            error_description = exception_description_by_status_code(e.status_code, spreadsheet_id)
-
-            if e.status_code == status_codes.FORBIDDEN:
-                raise AirbyteTracedException(
-                    message=f"Stopped syncing process. {error_description}",
-                    internal_message=error_description,
-                    failure_type=FailureType.config_error,
-                ) from e
-            if e.status_code == status_codes.TOO_MANY_REQUESTS:
-                raise AirbyteTracedException(
-                    message=f"Stopped syncing process due to rate limits. {error_description}",
-                    internal_message=error_description,
-                    failure_type=FailureType.transient_error,
-                ) from e
-            else:
-                logger.info(f"{e.status_code}: {e.reason}. {error_description}")
-                raise AirbyteTracedException(
-                    message=f"Stopped syncing process. {error_description}",
-                    internal_message=error_description,
-                    failure_type=FailureType.transient_error,
-                ) from e
-        finally:
-            logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
-
-    @staticmethod
-    def get_credentials(config):
-        # backward compatible with old style config
-        if config.get("credentials_json"):
-            credentials = {"auth_type": "Service", "service_account_info": config.get("credentials_json")}
-            return credentials
-
-        return config.get("credentials")
+# Declarative Source
+class SourceGoogleSheets(YamlDeclarativeSource):
+    def __init__(self, catalog: Optional[ConfiguredAirbyteCatalog], config: Optional[Mapping[str, Any]], state: TState, **kwargs):
+        super().__init__(catalog=catalog, config=config, state=state, **{"path_to_yaml": "manifest.yaml"})
```
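The takeaway from this file: the hand-written `Source` implementation (explicit `check`/`discover`/`read` against the Sheets API) collapses into a four-line wrapper around `YamlDeclarativeSource`, with the connector's behavior moving into the new `manifest.yaml` (+407 lines above). A minimal sketch of how such a declarative source is typically launched; the real wiring lives in the updated `run.py` (+43 -5, not shown in full here), which builds catalog, config, and state from the CLI arguments instead of passing `None`:

```python
# Sketch only, not the package's actual run.py: the None arguments are a
# simplification for illustration, since all three are accepted here.
import sys

from airbyte_cdk.entrypoint import launch

from source_google_sheets import SourceGoogleSheets


def run() -> None:
    source = SourceGoogleSheets(catalog=None, config=None, state=None)
    launch(source, sys.argv[1:])
```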
source_google_sheets/spec.yaml
CHANGED

```diff
@@ -12,17 +12,15 @@ connectionSpecification:
       type: integer
       title: Row Batch Size
       description: >-
-        Default value is 200.
+        Default value is 1000000.
         An integer representing row batch size for each sent request to Google Sheets API.
-        Row batch size means how many rows are processed from the google sheet, for example default value 200
-        would process rows 2-202, then 203-403 and so on.
+        Row batch size means how many rows are processed from the google sheet, for example default value 1000000
+        would process rows 2-1000002, then 1000003-2000003 and so on.
         Based on <a href='https://developers.google.com/sheets/api/limits'>Google Sheets API limits documentation</a>,
         it is possible to send up to 300 requests per minute, but each individual request has to be processed under 180 seconds,
         otherwise the request returns a timeout error. In regards to this information, consider network speed and
         number of columns of the google sheet when deciding a batch_size value.
-
-        consider increasing batch_size value.
-      default: 200
+      default: 1000000
     spreadsheet_id:
       type: string
       title: Spreadsheet Link
```
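The example ranges in the updated description follow from simple arithmetic: data starts at row 2 (row 1 holds the headers), and each request advances the cursor by `batch_size + 1`. A quick standalone check of the numbers quoted above:

```python
# Reproduces the row ranges quoted in the new description for the
# 1000000 default; plain arithmetic, not connector code.
batch_size = 1000000
start = 2  # row 1 is the header row
for _ in range(2):
    end = start + batch_size
    print(f"rows {start}-{end}")
    start = end + 1
# rows 2-1000002
# rows 1000003-2000003
```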
airbyte_source_google_sheets-0.8.5.dist-info/RECORD
DELETED

```diff
@@ -1,14 +0,0 @@
-source_google_sheets/__init__.py,sha256=-aGVMRfrgWjYad3_cHofIptEEa5WMQzTvFD92HevQfw,73
-source_google_sheets/client.py,sha256=A2BynDswjJ4naSyfjJr9G8fAfGEqLlNXP5vlbAsI3_s,1917
-source_google_sheets/helpers.py,sha256=CbXNlEfC3sNMDTGNPb22DcalWoXfv7kAYs7LmNM76Ec,10644
-source_google_sheets/models/__init__.py,sha256=Z-4MTpxG5t2jGhXzs4PPoIOa83zw3jRnUDx0N9Puv3s,61
-source_google_sheets/models/spreadsheet.py,sha256=fsHREpPEN36wCzGdqgfJ2EVW40UDZ_lS863A4XT2pGo,1112
-source_google_sheets/models/spreadsheet_values.py,sha256=y8ytuTqwpziJ2ICl0xhlRWgjMkxTfxOalRd414PMHZM,440
-source_google_sheets/run.py,sha256=_f5-LNqMzBuHtCD1YoUBxnA0fszgqmdNGcN7y_AmXU0,237
-source_google_sheets/source.py,sha256=kuHugJM9VOEPsArkP4lo88nBrS8ipk1yAoUi7cVSYZY,13506
-source_google_sheets/spec.yaml,sha256=WrPdH2xLCdyM-kY-pRqbwICcNPhv8nqnb2gdbslTsaQ,5141
-source_google_sheets/utils.py,sha256=DI53ARcKln77ekvuzsb3x35O9aMgZ_9OY9ets0FtI24,2290
-airbyte_source_google_sheets-0.8.5.dist-info/METADATA,sha256=LkVsOKPv_UsgFioRkUDxOiDG8kO6jFF_F7MYl3zkXy8,5551
-airbyte_source_google_sheets-0.8.5.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
-airbyte_source_google_sheets-0.8.5.dist-info/entry_points.txt,sha256=Dtsfjohe5IPUFyqojk49SIoP7CifCTlNLG_pgivzppo,69
-airbyte_source_google_sheets-0.8.5.dist-info/RECORD,,
```
source_google_sheets/client.py
DELETED

```diff
@@ -1,49 +0,0 @@
-#
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
-#
-
-import logging
-from typing import Dict, List
-
-import backoff
-from googleapiclient import errors
-from requests import codes as status_codes
-
-from .helpers import SCOPES, Helpers
-
-
-logger = logging.getLogger("airbyte")
-
-
-class GoogleSheetsClient:
-    class Backoff:
-        row_batch_size = 200
-
-        @classmethod
-        def increase_row_batch_size(cls, details):
-            if details["exception"].status_code == status_codes.TOO_MANY_REQUESTS and cls.row_batch_size < 1000:
-                cls.row_batch_size = cls.row_batch_size + 100
-                logger.info(f"Increasing number of records fetching due to rate limits. Current value: {cls.row_batch_size}")
-
-        @staticmethod
-        def give_up(error):
-            code = error.resp.status
-            # Stop retrying if it's not a problem with the rate limit or on the server end
-            return not (code == status_codes.TOO_MANY_REQUESTS or 500 <= code < 600)
-
-    def __init__(self, credentials: Dict[str, str], scopes: List[str] = SCOPES):
-        self.client = Helpers.get_authenticated_sheets_client(credentials, scopes)
-
-    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
-    def get(self, **kwargs):
-        return self.client.get(**kwargs).execute()
-
-    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
-    def get_values(self, **kwargs):
-        range = self._create_range(kwargs.pop("sheet"), kwargs.pop("row_cursor"))
-        logger.info(f"Fetching range {range}")
-        return self.client.values().batchGet(ranges=range, **kwargs).execute()
-
-    def _create_range(self, sheet, row_cursor):
-        range = f"{sheet}!{row_cursor}:{row_cursor + self.Backoff.row_batch_size}"
-        return range
```
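The deleted client wrapped every API call in `backoff.on_exception`, retrying only 429s and 5xx responses (`give_up`) and widening `row_batch_size` by 100 (capped at 1000) before each retry. A self-contained sketch of that pattern, with a stub exception standing in for `googleapiclient.errors.HttpError` so it runs without Google credentials:

```python
import backoff


class StubRateLimitError(Exception):
    """Stands in for an HTTP 429 raised by the Sheets API client."""


row_batch_size = 200
calls = {"count": 0}


def widen_batch(details) -> None:
    # Same idea as Backoff.increase_row_batch_size: fetch more rows per
    # request once we know we are being rate limited, capped at 1000.
    global row_batch_size
    if row_batch_size < 1000:
        row_batch_size += 100


@backoff.on_exception(backoff.expo, StubRateLimitError, max_tries=3, on_backoff=widen_batch)
def get_values() -> str:
    calls["count"] += 1
    if calls["count"] < 3:
        raise StubRateLimitError()  # simulate two rate-limited attempts
    return f"fetched with row_batch_size={row_batch_size}"


print(get_values())  # fetched with row_batch_size=400
```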
source_google_sheets/helpers.py
DELETED

```diff
@@ -1,234 +0,0 @@
-#
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
-#
-
-import json
-import logging
-import re
-from collections import defaultdict
-from datetime import datetime
-from typing import Dict, FrozenSet, Iterable, List, Tuple
-
-from google.oauth2 import credentials as client_account
-from google.oauth2 import service_account
-from googleapiclient import discovery
-
-from airbyte_cdk.models.airbyte_protocol import AirbyteRecordMessage, AirbyteStream, ConfiguredAirbyteCatalog, SyncMode
-
-from .models.spreadsheet import RowData, Spreadsheet
-from .utils import safe_name_conversion
-
-
-SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly", "https://www.googleapis.com/auth/drive.readonly"]
-
-logger = logging.getLogger("airbyte")
-
-
-class Helpers(object):
-    @staticmethod
-    def get_authenticated_sheets_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
-        creds = Helpers.get_authenticated_google_credentials(credentials, scopes)
-        return discovery.build("sheets", "v4", credentials=creds).spreadsheets()
-
-    @staticmethod
-    def get_authenticated_drive_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
-        creds = Helpers.get_authenticated_google_credentials(credentials, scopes)
-        return discovery.build("drive", "v3", credentials=creds)
-
-    @staticmethod
-    def get_authenticated_google_credentials(credentials: Dict[str, str], scopes: List[str] = SCOPES):
-        auth_type = credentials.pop("auth_type")
-        if auth_type == "Service":
-            return service_account.Credentials.from_service_account_info(json.loads(credentials["service_account_info"]), scopes=scopes)
-        elif auth_type == "Client":
-            return client_account.Credentials.from_authorized_user_info(info=credentials)
-
-    @staticmethod
-    def headers_to_airbyte_stream(logger: logging.Logger, sheet_name: str, header_row_values: List[str]) -> AirbyteStream:
-        """
-        Parses sheet headers from the provided row. This method assumes that data is contiguous
-        i.e: every cell contains a value and the first cell which does not contain a value denotes the end
-        of the headers. For example, if the first row contains "One | Two | | Three" then this method
-        will parse the headers as ["One", "Two"]. This assumption is made for simplicity and can be modified later.
-        """
-        fields, duplicate_fields = Helpers.get_valid_headers_and_duplicates(header_row_values)
-        if duplicate_fields:
-            logger.warn(f"Duplicate headers found in {sheet_name}. Ignoring them: {duplicate_fields}")
-
-        sheet_json_schema = {
-            "$schema": "http://json-schema.org/draft-07/schema#",
-            "type": "object",
-            # For simplicity, the type of every cell is a string
-            "properties": {field: {"type": "string"} for field in fields},
-        }
-
-        return AirbyteStream(name=sheet_name, json_schema=sheet_json_schema, supported_sync_modes=[SyncMode.full_refresh])
-
-    @staticmethod
-    def get_valid_headers_and_duplicates(header_row_values: List[str]) -> (List[str], List[str]):
-        fields = []
-        duplicate_fields = set()
-        for cell_value in header_row_values:
-            if cell_value:
-                if cell_value in fields:
-                    duplicate_fields.add(cell_value)
-                else:
-                    fields.append(cell_value)
-            else:
-                break
-
-        # Removing all duplicate fields
-        if duplicate_fields:
-            fields = [field for field in fields if field not in duplicate_fields]
-
-        return fields, list(duplicate_fields)
-
-    @staticmethod
-    def get_formatted_row_values(row_data: RowData) -> List[str]:
-        """
-        Gets the formatted values of all cell data in this row. A formatted value is the final value a user sees in a spreadsheet.
-        It can be a raw string input by the user, or the result of a sheets function call.
-        """
-        return [value.formattedValue for value in row_data.values]
-
-    @staticmethod
-    def get_first_row(client, spreadsheet_id: str, sheet_name: str) -> List[str]:
-        spreadsheet = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=True, ranges=f"{sheet_name}!1:1"))
-
-        # There is only one sheet since we are specifying the sheet in the requested ranges.
-        returned_sheets = spreadsheet.sheets
-        if len(returned_sheets) != 1:
-            raise Exception(f"Unexpected return result: Sheet {sheet_name} was expected to contain data on exactly 1 sheet. ")
-
-        range_data = returned_sheets[0].data
-        if len(range_data) != 1:
-            raise Exception(f"Expected data for exactly one range for sheet {sheet_name}")
-
-        all_row_data = range_data[0].rowData
-        if not all_row_data:
-            # the sheet is empty
-            logger.warning(f"The sheet {sheet_name} (ID {spreadsheet_id}) is empty!")
-            return []
-
-        if len(all_row_data) != 1:
-            raise Exception(f"Expected data for exactly one row for sheet {sheet_name}")
-
-        first_row_data = all_row_data[0]
-
-        return Helpers.get_formatted_row_values(first_row_data)
-
-    @staticmethod
-    def parse_sheet_and_column_names_from_catalog(catalog: ConfiguredAirbyteCatalog) -> Dict[str, FrozenSet[str]]:
-        sheet_to_column_name = {}
-        for configured_stream in catalog.streams:
-            stream = configured_stream.stream
-            sheet_name = stream.name
-            sheet_to_column_name[sheet_name] = frozenset(stream.json_schema["properties"].keys())
-
-        return sheet_to_column_name
-
-    @staticmethod
-    def row_data_to_record_message(sheet_name: str, cell_values: List[str], column_index_to_name: Dict[int, str]) -> AirbyteRecordMessage:
-        data = {}
-        for relevant_index in sorted(column_index_to_name.keys()):
-            if relevant_index >= len(cell_values):
-                break
-
-            cell_value = cell_values[relevant_index]
-            if cell_value.strip() != "":
-                data[column_index_to_name[relevant_index]] = cell_value
-
-        return AirbyteRecordMessage(stream=sheet_name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000)
-
-    @staticmethod
-    def get_available_sheets_to_column_index_to_name(
-        client, spreadsheet_id: str, requested_sheets_and_columns: Dict[str, FrozenSet[str]], names_conversion: bool = False
-    ) -> Dict[str, Dict[int, str]]:
-        available_sheets = Helpers.get_sheets_in_spreadsheet(client, spreadsheet_id)
-        logger.info(f"Available sheets: {available_sheets}")
-        available_sheets_to_column_index_to_name = defaultdict(dict)
-        for sheet, columns in requested_sheets_and_columns.items():
-            if sheet in available_sheets:
-                first_row = Helpers.get_first_row(client, spreadsheet_id, sheet)
-                if names_conversion:
-                    first_row = [safe_name_conversion(h) for h in first_row]
-                    # When performing names conversion, they won't match what is listed in catalog for the majority of cases,
-                    # so they should be cast here in order to have them in records
-                    columns = {safe_name_conversion(c) for c in columns}
-                # Find the column index of each header value
-                idx = 0
-                for cell_value in first_row:
-                    if cell_value in columns:
-                        available_sheets_to_column_index_to_name[sheet][idx] = cell_value
-                    idx += 1
-        return available_sheets_to_column_index_to_name
-
-    @staticmethod
-    def get_sheets_in_spreadsheet(client, spreadsheet_id: str) -> List[str]:
-        spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
-        return [sheet.properties.title for sheet in spreadsheet_metadata.sheets]
-
-    @staticmethod
-    def get_sheet_row_count(client, spreadsheet_id: str) -> Dict[str, int]:
-        spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
-        # filter out sheets without gridProperties (like in diagram sheets)
-        data_sheets = [sheet for sheet in spreadsheet_metadata.sheets if hasattr(sheet.properties, "gridProperties")]
-        return {sheet.properties.title: sheet.properties.gridProperties["rowCount"] for sheet in data_sheets}
-
-    @staticmethod
-    def get_grid_sheets(spreadsheet_metadata) -> List[str]:
-        """Return grid only diagram, filter out sheets with image/diagram only
-
-        https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/sheets#sheetproperties
-        """
-        grid_sheets = []
-        non_grid_sheets = []
-        for sheet in spreadsheet_metadata.sheets:
-            sheet_title = sheet.properties.title
-            if (
-                hasattr(sheet.properties, "gridProperties")
-                and hasattr(sheet.properties, "sheetType")
-                and sheet.properties.sheetType == "GRID"
-            ):
-                grid_sheets.append(sheet_title)
-            else:
-                non_grid_sheets.append(sheet_title)
-
-        if non_grid_sheets:
-            # logging.getLogger(...).log() expects an integer level. The level for WARN is 30
-            # Reference: https://docs.python.org/3.10/library/logging.html#levels
-            logging.getLogger("airbyte").log(30, "Skip non-grid sheets: " + ", ".join(non_grid_sheets))
-
-        return grid_sheets
-
-    @staticmethod
-    def is_row_empty(cell_values: List[str]) -> bool:
-        for cell in cell_values:
-            if cell.strip() != "":
-                return False
-        return True
-
-    @staticmethod
-    def row_contains_relevant_data(cell_values: List[str], relevant_indices: Iterable[int]) -> bool:
-        for idx in relevant_indices:
-            if len(cell_values) > idx and cell_values[idx].strip() != "":
-                return True
-        return False
-
-    @staticmethod
-    def get_spreadsheet_id(id_or_url: str) -> str:
-        if re.match(r"(https://)", id_or_url):
-            # This is a URL
-            m = re.search(r"(/)([-\w]{20,})([/]?)", id_or_url)
-            if m is not None and m.group(2):
-                return m.group(2)
-        else:
-            return id_or_url
-
-    @staticmethod
-    def check_sheet_is_valid(client, spreadsheet_id: str, sheet_name: str) -> Tuple[bool, str]:
-        try:
-            Helpers.get_first_row(client, spreadsheet_id, sheet_name)
-            return True, ""
-        except Exception as e:
-            return False, str(e)
```
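One deleted helper worth a standalone illustration is `get_spreadsheet_id`, since the `spreadsheet_id` config field accepts either a bare ID or a full URL: for URLs it returns the first path segment of 20 or more word characters. A runnable copy of the deleted logic (the spreadsheet ID below is made up; note the original returns `None` for an `https://` URL that matches no segment):

```python
import re


def get_spreadsheet_id(id_or_url: str) -> str:
    # Logic copied from the deleted Helpers.get_spreadsheet_id.
    if re.match(r"(https://)", id_or_url):
        # This is a URL: take the first path segment of 20+ [-\w] characters.
        m = re.search(r"(/)([-\w]{20,})([/]?)", id_or_url)
        if m is not None and m.group(2):
            return m.group(2)
    else:
        return id_or_url


url = "https://docs.google.com/spreadsheets/d/1A2b3C4d5E6f7G8h9I0j1K2l3M4n5O/edit#gid=0"
print(get_spreadsheet_id(url))  # 1A2b3C4d5E6f7G8h9I0j1K2l3M4n5O
print(get_spreadsheet_id("1A2b3C4d5E6f7G8h9I0j1K2l3M4n5O"))  # returned unchanged
```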
{airbyte_source_google_sheets-0.8.5.dist-info → airbyte_source_google_sheets-0.9.0rc2.dist-info}/WHEEL
File without changes

{airbyte_source_google_sheets-0.8.5.dist-info → airbyte_source_google_sheets-0.9.0rc2.dist-info}/entry_points.txt
File without changes