airbyte-source-google-sheets 0.8.5__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,408 @@
1
+ version: 6.7.0
2
+
3
+ type: DeclarativeSource
4
+
5
+ check:
6
+ type: CheckDynamicStream
7
+ stream_count: 1
8
+ use_check_availability: false
9
+
10
+ dynamic_streams:
11
+ - type: DynamicDeclarativeStream
12
+ stream_template:
13
+ type: DeclarativeStream
14
+ name: ""
15
+ $parameters:
16
+ i: 123
17
+ primary_key: []
18
+ retriever:
19
+ type: SimpleRetriever
20
+ $parameters:
21
+ row_count: 0
22
+ sheet_id: ""
23
+ batch_size: 0
24
+ partition_router:
25
+ type: CustomPartitionRouter
26
+ class_name: "source_google_sheets.components.partition_routers.RangePartitionRouter"
27
+ paginator:
28
+ type: NoPagination
29
+ record_selector:
30
+ decoder:
31
+ type: JsonDecoder
32
+ extractor:
33
+ type: CustomRecordExtractor
34
+ class_name: source_google_sheets.components.DpathSchemaMatchingExtractor
35
+ description: Extracts the list of row values and matches each value to the corresponding schema property to generate individual records.
36
+ field_path:
37
+ - valueRanges
38
+ - "*"
39
+ $parameters:
40
+ schema_type_identifier:
41
+ $ref: "#/definitions/schema_type_identifier"
42
+ values_to_match_key: "values"
43
+ properties_to_match: ""
44
+ type: RecordSelector
45
+ $parameters:
46
+ name: ""
47
+ requester:
48
+ $ref: "#/definitions/base_requester"
49
+ $parameters:
50
+ sheet_id: ""
51
+ name: ""
52
+ http_method: GET
53
+ path: >-
54
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}/values:batchGet?ranges={{parameters["sheet_id"] | urlencode}}!{{stream_partition.start_range}}:{{stream_partition.end_range}}&majorDimension=ROWS&alt=json
55
+ error_handler:
56
+ type: DefaultErrorHandler
57
+ backoff_strategies:
58
+ - type: ExponentialBackoffStrategy
59
+ response_filters:
60
+ $ref: "#/definitions/response_filters/response_error_filters"
61
+ schema_loader:
62
+ type: DynamicSchemaLoader
63
+ retriever:
64
+ type: SimpleRetriever
65
+ paginator:
66
+ type: NoPagination
67
+ record_selector:
68
+ extractor:
69
+ type: CustomRecordExtractor
70
+ class_name: source_google_sheets.components.DpathSchemaExtractor
71
+ parameters:
72
+ schema_type_identifier:
73
+ $ref: "#/definitions/schema_type_identifier"
74
+ field_path:
75
+ - sheets
76
+ - "*"
77
+ - data
78
+ - "*"
79
+ - rowData
80
+ - "*"
81
+ type: RecordSelector
82
+ requester:
83
+ $ref: "#/definitions/base_requester"
84
+ $parameters:
85
+ sheet_id: ""
86
+ http_method: GET
87
+ path: >-
88
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}?includeGridData=true&ranges={{parameters["sheet_id"] | urlencode}}!1:1&alt=json
89
+ error_handler:
90
+ type: CompositeErrorHandler
91
+ error_handlers:
92
+ - type: DefaultErrorHandler
93
+ backoff_strategies:
94
+ - type: ExponentialBackoffStrategy
95
+ response_filters:
96
+ $ref: "#/definitions/response_filters/single_sheet_response_error_filters"
97
+ schema_type_identifier:
98
+ $ref: "#/definitions/schema_type_identifier"
99
+ components_resolver:
100
+ type: HttpComponentsResolver
101
+ description: We use first row of sheet to obtain data.
102
+ retriever:
103
+ $ref: "#/definitions/retrievers/components_resolver_retriever"
104
+ components_mapping:
105
+ - field_path:
106
+ - name
107
+ type: ComponentMappingDefinition
108
+ value: "{{components_values['properties']['title']}}"
109
+ value_type: string
110
+ description: name for dynamic stream.
111
+ - field_path:
112
+ - schema_loader
113
+ - retriever
114
+ - requester
115
+ - $parameters
116
+ - sheet_id
117
+ type: ComponentMappingDefinition
118
+ value: "{{components_values['properties']['title']}}"
119
+ description: sheet_id for dynamic schema loader requester.
120
+ - field_path:
121
+ - retriever
122
+ - requester
123
+ - $parameters
124
+ - sheet_id
125
+ type: ComponentMappingDefinition
126
+ value: "{{components_values['properties']['title']}}"
127
+ description: sheet_id for dynamic stream retriever requester.
128
+ - field_path:
129
+ - retriever
130
+ - record_selector
131
+ - extractor
132
+ - $parameters
133
+ - properties_to_match
134
+ type: ComponentMappingDefinition
135
+ value: "{{components_values['data'][0].get('rowData', [{}])[0]}}"
136
+ description: indexed_schema to match with row values.
137
+ - field_path:
138
+ - retriever
139
+ - partition_router
140
+ - $parameters
141
+ - row_count
142
+ type: ComponentMappingDefinition
143
+ value: "{{components_values['properties']['gridProperties']['rowCount']}}"
144
+ - field_path:
145
+ - retriever
146
+ - partition_router
147
+ - $parameters
148
+ - sheet_id
149
+ type: ComponentMappingDefinition
150
+ value: "{{components_values['properties']['title']}}"
151
+ description: sheet_id for retriever.
152
+ - field_path:
153
+ - retriever
154
+ - partition_router
155
+ - $parameters
156
+ - batch_size
157
+ type: ComponentMappingDefinition
158
+ value: "{{config.get('batch_size', 1000000)}}"
159
+ description: batch size count for dynamic stream partition router (slicer).
160
+
161
+ definitions:
162
+ streams:
163
+ get_spreadsheet_info_and_sheets:
164
+ type: DeclarativeStream
165
+ name: get_spreadsheet_info_and_sheets
166
+ retriever:
167
+ type: SimpleRetriever
168
+ requester:
169
+ $ref: "#/definitions/base_requester"
170
+ path: >-
171
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}?includeGridData=false&alt=json
172
+ http_method: GET
173
+ error_handler:
174
+ type: CompositeErrorHandler
175
+ error_handlers:
176
+ - type: DefaultErrorHandler
177
+ backoff_strategies:
178
+ - type: ExponentialBackoffStrategy
179
+ response_filters:
180
+ $ref: "#/definitions/response_filters/response_error_filters"
181
+ record_selector:
182
+ type: RecordSelector
183
+ extractor:
184
+ type: DpathExtractor
185
+ field_path:
186
+ - sheets
187
+ - "*"
188
+ - properties
189
+ record_filter:
190
+ type: RecordFilter
191
+ condition: '{{ record["sheetType"] == "GRID" and record["gridProperties"]["rowCount"] > 0}}'
192
+ schema_loader:
193
+ type: InlineSchemaLoader
194
+ schema:
195
+ $ref: "#/schemas/sheets"
196
+ base_requester:
197
+ type: HttpRequester
198
+ url_base: https://sheets.googleapis.com/v4/spreadsheets/
199
+ use_cache: true
200
+ authenticator: "#/definitions/authenticator"
201
+ retrievers:
202
+ components_resolver_retriever:
203
+ type: SimpleRetriever
204
+ paginator:
205
+ type: NoPagination
206
+ record_selector:
207
+ type: RecordSelector
208
+ extractor:
209
+ type: DpathExtractor
210
+ field_path:
211
+ - sheets
212
+ partition_router:
213
+ type: SubstreamPartitionRouter
214
+ parent_stream_configs:
215
+ - type: ParentStreamConfig
216
+ parent_key: title
217
+ partition_field: sheet_id
218
+ stream:
219
+ $ref: "#/definitions/streams/get_spreadsheet_info_and_sheets"
220
+ requester:
221
+ $ref: "#/definitions/base_requester"
222
+ description: spreadsheet_id can be either the full url to spreadsheet or the spreadsheet id.
223
+ http_method: GET
224
+ path: >-
225
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}?includeGridData=true&ranges={{stream_partition.sheet_id | urlencode}}!1:1&alt=json
226
+ error_handler:
227
+ type: CompositeErrorHandler
228
+ error_handlers:
229
+ - type: DefaultErrorHandler
230
+ response_filters:
231
+ $ref: "#/definitions/response_filters/single_sheet_response_error_filters"
232
+ response_filters:
233
+ expected_one_sheet:
234
+ type: HttpResponseFilter
235
+ action: FAIL
236
+ predicate: >-
237
+ {{ 'sheets' in response and response["sheets"] | length != 1 }}
238
+ # This error message lacks detail because error_message cannot interpolate stream_slice values (e.g. stream_slice["potato"]).
239
+ error_message: >-
240
+ Unable to read the schema of sheet. Error: Unexpected return
241
+ result: Sheet was
242
+ expected to contain data on exactly 1 sheet.
243
+ ignore_duplicate_headers:
244
+ type: HttpResponseFilter
245
+ action: IGNORE
246
+ predicate: >-
247
+ {{ response["sheets"][0]["data"][0]["rowData"][0]["values"] |
248
+ map(attribute="formattedValue") | list | length !=
249
+ response["sheets"][0]["data"][0]["rowData"][0]["values"] |
250
+ map(attribute="formattedValue") | list | unique | list | length }}
251
+ error_message: >-
252
+ {%- set headers_found = response["sheets"][0]["data"][0]["rowData"][0]["values"] | map(attribute="formattedValue") | list -%}
253
+ {%- set headers_count = {} -%}
254
+ {%- set duplicate_fields = [] -%}
255
+ {%- for headerFound in headers_found -%}
256
+ {%- if headerFound is not none -%}
257
+ {%- set headers_count = headers_count.update({headerFound: headers_count.get(headerFound, 0) + 1}) or headers_count -%}
258
+ {%- if headers_count.get(headerFound) > 1 and headerFound not in duplicate_fields -%}
259
+ {%- set duplicate_fields = duplicate_fields.append(headerFound) -%}
260
+ {%- endif -%}
261
+ {%- endif -%}
262
+ {%- endfor -%}
263
+ Duplicate headers found in sheet {{ response["sheets"][0]["properties"]["title"] }}.
264
+ Ignoring them: {{ duplicate_fields }}
265
+ fail_duplicate_headers:
266
+ $ref: "#/definitions/response_filters/ignore_duplicate_headers"
267
+ action: FAIL
268
+ error_message: >-
269
+ {%- set headers_found = response["sheets"][0]["data"][0]["rowData"][0]["values"] | map(attribute="formattedValue") | list -%}
270
+ {%- set headers_count = {} -%}
271
+ {%- set duplicate_fields = [] -%}
272
+ {%- for headerFound in headers_found -%}
273
+ {%- if headerFound is not none -%}
274
+ {%- set headers_count = headers_count.update({headerFound: headers_count.get(headerFound, 0) + 1}) or headers_count -%}
275
+ {%- if headers_count.get(headerFound) > 1 and headerFound not in duplicate_fields -%}
276
+ {%- set duplicate_fields = duplicate_fields.append(headerFound) -%}
277
+ {%- endif -%}
278
+ {%- endif -%}
279
+ {%- endfor -%}
280
+ The following duplicate headers were found in the sheet.
281
+ Please fix them to continue: {{ duplicate_fields }}
282
+ server_error:
283
+ type: HttpResponseFilter
284
+ action: RETRY
285
+ http_codes:
286
+ - 500
287
+ - 502
288
+ - 503
289
+ error_message: >-
290
+ There was an issue
291
+ with the Google Sheets API. This is usually a temporary issue from
292
+ Google's side. Please try again. If this issue persists, contact
293
+ support
294
+ forbidden:
295
+ type: HttpResponseFilter
296
+ action: FAIL
297
+ http_codes:
298
+ - 403
299
+ error_message: >-
300
+ The authenticated Google Sheets user does not have permissions to view the
301
+ spreadsheet with id {{config["spreadsheet_id"]}}. Please ensure the authenticated user has access
302
+ to the Spreadsheet and reauthenticate. If the issue persists, contact support.
303
+ The caller does not have right permissions.
304
+ not_found:
305
+ type: HttpResponseFilter
306
+ action: FAIL
307
+ http_codes:
308
+ - 404
309
+ error_message: >-
310
+ The requested Google Sheets spreadsheet with id {{config["spreadsheet_id"]}} does not exist.
311
+ Please ensure the Spreadsheet Link you have set is valid and the spreadsheet exists. If the issue persists, contact support
312
+ rate_limit:
313
+ type: HttpResponseFilter
314
+ action: RATE_LIMITED
315
+ http_codes:
316
+ - 429
317
+ error_message: >-
318
+ Rate limit has been
319
+ reached. Please try later or request a higher quota for your account.
320
+ single_sheet_response_error_filters:
321
+ - $ref: "#/definitions/response_filters/expected_one_sheet"
322
+ - $ref: "#/definitions/response_filters/ignore_duplicate_headers"
323
+ check_operation_single_sheet_response_error_filters:
324
+ - $ref: "#/definitions/response_filters/expected_one_sheet"
325
+ - $ref: "#/definitions/response_filters/fail_duplicate_headers"
326
+ response_error_filters:
327
+ - $ref: "#/definitions/response_filters/server_error"
328
+ - $ref: "#/definitions/response_filters/forbidden"
329
+ - $ref: "#/definitions/response_filters/not_found"
330
+ - $ref: "#/definitions/response_filters/rate_limit"
331
+ schema_type_identifier:
332
+ key_pointer:
333
+ - formattedValue
334
+ schema_pointer:
335
+ - values
336
+ jwt_authenticator:
337
+ type: JwtAuthenticator
338
+ secret_key: "{{ json_loads(config['credentials']['service_account_info'])['private_key'] }}"
339
+ algorithm: "RS256"
340
+ token_duration: 3600
341
+ jwt_payload:
342
+ aud: "{{ json_loads(config['credentials']['service_account_info'])['token_uri'] }}"
343
+ iss: "{{ json_loads(config['credentials']['service_account_info'])['client_email'] }}"
344
+ additional_jwt_payload:
345
+ scope: "https://www.googleapis.com/auth/spreadsheets.readonly https://www.googleapis.com/auth/drive.readonly"
346
+ oauth_authenticator:
347
+ type: OAuthAuthenticator
348
+ refresh_request_body: {}
349
+ token_refresh_endpoint: https://www.googleapis.com/oauth2/v4/token
350
+ grant_type: refresh_token
351
+ client_id: '{{ config["credentials"]["client_id"] }}'
352
+ client_secret: '{{ config["credentials"]["client_secret"] }}'
353
+ refresh_token: '{{ config["credentials"]["refresh_token"] }}'
354
+ jwt_profile_assertion_oauth_authenticator:
355
+ type: OAuthAuthenticator
356
+ token_refresh_endpoint: https://oauth2.googleapis.com/token
357
+ refresh_request_headers:
358
+ Content-Type: application/x-www-form-urlencoded
359
+ use_profile_assertion: true
360
+ profile_assertion:
361
+ $ref: "#/definitions/jwt_authenticator"
362
+ authenticator:
363
+ type: SelectiveAuthenticator
364
+ authenticator_selection_path: ["credentials", "auth_type"]
365
+ authenticators:
366
+ Client: "#/definitions/oauth_authenticator"
367
+ Service: "#/definitions/jwt_profile_assertion_oauth_authenticator"
368
+
369
+ schemas:
370
+ sheets:
371
+ type: object
372
+ $schema: http://json-schema.org/schema#
373
+ additionalProperties: true
374
+ properties:
375
+ gridProperties:
376
+ type:
377
+ - object
378
+ - "null"
379
+ properties:
380
+ columnCount:
381
+ type:
382
+ - number
383
+ - "null"
384
+ rowCount:
385
+ type:
386
+ - number
387
+ - "null"
388
+ index:
389
+ type:
390
+ - number
391
+ - "null"
392
+ sheetId:
393
+ type:
394
+ - number
395
+ - "null"
396
+ sheetType:
397
+ type:
398
+ - string
399
+ - "null"
400
+ title:
401
+ type:
402
+ - string
403
+ - "null"
404
+
405
+ concurrency_level:
406
+ type: ConcurrencyLevel
407
+ default_concurrency: 1
408
+ max_concurrency: 1
@@ -1,5 +1,5 @@
1
1
  #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
5
 
@@ -1,5 +1,5 @@
1
1
  #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
5
 
@@ -1,15 +1,53 @@
1
1
  #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
5
 
6
6
  import sys
7
+ import traceback
8
+ from datetime import datetime
9
+ from typing import List
7
10
 
8
- from airbyte_cdk.entrypoint import launch
11
+ from orjson import orjson
9
12
 
10
- from .source import SourceGoogleSheets
13
+ from airbyte_cdk.entrypoint import AirbyteEntrypoint, launch
14
+ from airbyte_cdk.models import AirbyteErrorTraceMessage, AirbyteMessage, AirbyteMessageSerializer, AirbyteTraceMessage, TraceType, Type
15
+ from source_google_sheets.source import SourceGoogleSheets
16
+
17
+
18
def _get_source(args: List[str]):
    """Build the Google Sheets source from the command-line arguments.

    The catalog, config and state paths are pulled out of ``args`` with the
    ``AirbyteEntrypoint.extract_*`` helpers. Each path that is present is
    loaded with the matching ``SourceGoogleSheets.read_*`` helper; a missing
    path is passed to the constructor as ``None``.

    Args:
        args: Command-line arguments (without the program name).

    Returns:
        The constructed ``SourceGoogleSheets``, or ``None`` when loading the
        inputs or constructing the source raised. In that case a TRACE/ERROR
        Airbyte message carrying the error text and stack trace is printed
        as JSON before returning.
    """
    path_to_catalog = AirbyteEntrypoint.extract_catalog(args)
    path_to_config = AirbyteEntrypoint.extract_config(args)
    path_to_state = AirbyteEntrypoint.extract_state(args)

    try:
        catalog = SourceGoogleSheets.read_catalog(path_to_catalog) if path_to_catalog else None
        config = SourceGoogleSheets.read_config(path_to_config) if path_to_config else None
        state = SourceGoogleSheets.read_state(path_to_state) if path_to_state else None
        return SourceGoogleSheets(catalog, config, state)
    except Exception as error:
        # Top-level boundary: report the failure as an Airbyte trace message
        # instead of letting the exception propagate.
        emitted_at_ms = int(datetime.now().timestamp() * 1000)
        error_details = AirbyteErrorTraceMessage(
            message=f"Error starting the sync. This could be due to an invalid configuration or catalog. Please contact Support for assistance. Error: {error}",
            stack_trace=traceback.format_exc(),
        )
        trace_message = AirbyteTraceMessage(
            type=TraceType.ERROR,
            emitted_at=emitted_at_ms,
            error=error_details,
        )
        envelope = AirbyteMessage(type=Type.TRACE, trace=trace_message)
        serialized = orjson.dumps(AirbyteMessageSerializer.dump(envelope)).decode()
        print(serialized)
        return None
11
47
 
12
48
 
13
49
  def run():
14
- source = SourceGoogleSheets()
15
- launch(source, sys.argv[1:])
50
+ _args = sys.argv[1:]
51
+ source = _get_source(_args)
52
+ if source:
53
+ launch(source, _args)