airbyte-source-google-sheets 0.8.5__py3-none-any.whl → 0.9.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,407 @@
1
+ version: 6.7.0
2
+
3
+ type: DeclarativeSource
4
+
5
+ check:
6
+ type: CheckDynamicStream
7
+ stream_count: 1
8
+ use_check_availability: false
9
+
10
+ dynamic_streams:
11
+ - type: DynamicDeclarativeStream
12
+ stream_template:
13
+ type: DeclarativeStream
14
+ name: ""
15
+ $parameters:
16
+ i: 123
17
+ primary_key: []
18
+ retriever:
19
+ type: SimpleRetriever
20
+ $parameters:
21
+ row_count: 0
22
+ sheet_id: ""
23
+ batch_size: 0
24
+ partition_router:
25
+ type: CustomPartitionRouter
26
+ class_name: "source_google_sheets.components.partition_routers.RangePartitionRouter"
27
+ paginator:
28
+ type: NoPagination
29
+ record_selector:
30
+ decoder:
31
+ type: JsonDecoder
32
+ extractor:
33
+ type: CustomRecordExtractor
34
+ class_name: source_google_sheets.components.DpathSchemaMatchingExtractor
35
+ description: Extract record list of values (rows) and matches such values to correct schema property to generate individual records.
36
+ field_path:
37
+ - valueRanges
38
+ - "*"
39
+ $parameters:
40
+ schema_type_identifier:
41
+ $ref: "#/definitions/schema_type_identifier"
42
+ values_to_match_key: "values"
43
+ properties_to_match: ""
44
+ type: RecordSelector
45
+ $parameters:
46
+ name: ""
47
+ requester:
48
+ $ref: "#/definitions/base_requester"
49
+ $parameters:
50
+ sheet_id: ""
51
+ name: ""
52
+ http_method: GET
53
+ path: >-
54
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}/values:batchGet?ranges={{parameters["sheet_id"] | urlencode}}!{{stream_partition.start_range}}:{{stream_partition.end_range}}&majorDimension=ROWS&alt=json
55
+ error_handler:
56
+ type: DefaultErrorHandler
57
+ backoff_strategies:
58
+ - type: ExponentialBackoffStrategy
59
+ response_filters:
60
+ $ref: "#/definitions/response_filters/response_error_filters"
61
+ schema_loader:
62
+ type: DynamicSchemaLoader
63
+ retriever:
64
+ type: SimpleRetriever
65
+ paginator:
66
+ type: NoPagination
67
+ record_selector:
68
+ extractor:
69
+ type: CustomRecordExtractor
70
+ class_name: source_google_sheets.components.DpathSchemaExtractor
71
+ parameters:
72
+ schema_type_identifier:
73
+ $ref: "#/definitions/schema_type_identifier"
74
+ field_path:
75
+ - sheets
76
+ - "*"
77
+ - data
78
+ - "*"
79
+ - rowData
80
+ - "*"
81
+ type: RecordSelector
82
+ requester:
83
+ $ref: "#/definitions/base_requester"
84
+ $parameters:
85
+ sheet_id: ""
86
+ http_method: GET
87
+ path: >-
88
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}?includeGridData=true&ranges={{parameters["sheet_id"] | urlencode}}!1:1&alt=json
89
+ error_handler:
90
+ type: CompositeErrorHandler
91
+ error_handlers:
92
+ - type: DefaultErrorHandler
93
+ backoff_strategies:
94
+ - type: ExponentialBackoffStrategy
95
+ response_filters:
96
+ $ref: "#/definitions/response_filters/single_sheet_response_error_filters"
97
+ schema_type_identifier:
98
+ $ref: "#/definitions/schema_type_identifier"
99
+ components_resolver:
100
+ type: HttpComponentsResolver
101
+ description: We use first row of sheet to obtain data.
102
+ retriever:
103
+ $ref: "#/definitions/retrievers/components_resolver_retriever"
104
+ components_mapping:
105
+ - field_path:
106
+ - name
107
+ type: ComponentMappingDefinition
108
+ value: "{{components_values['properties']['title']}}"
109
+ description: name for dynamic stream.
110
+ - field_path:
111
+ - schema_loader
112
+ - retriever
113
+ - requester
114
+ - $parameters
115
+ - sheet_id
116
+ type: ComponentMappingDefinition
117
+ value: "{{components_values['properties']['title']}}"
118
+ description: sheet_id for dynamic schema loader requester.
119
+ - field_path:
120
+ - retriever
121
+ - requester
122
+ - $parameters
123
+ - sheet_id
124
+ type: ComponentMappingDefinition
125
+ value: "{{components_values['properties']['title']}}"
126
+ description: sheet_id for dynamic stream retriever requester.
127
+ - field_path:
128
+ - retriever
129
+ - record_selector
130
+ - extractor
131
+ - $parameters
132
+ - properties_to_match
133
+ type: ComponentMappingDefinition
134
+ value: "{{components_values['data'][0].get('rowData', [{}])[0]}}"
135
+ description: indexed_schema to match with row values.
136
+ - field_path:
137
+ - retriever
138
+ - partition_router
139
+ - $parameters
140
+ - row_count
141
+ type: ComponentMappingDefinition
142
+ value: "{{components_values['properties']['gridProperties']['rowCount']}}"
143
+ - field_path:
144
+ - retriever
145
+ - partition_router
146
+ - $parameters
147
+ - sheet_id
148
+ type: ComponentMappingDefinition
149
+ value: "{{components_values['properties']['title']}}"
150
+ description: sheet_id for retriever.
151
+ - field_path:
152
+ - retriever
153
+ - partition_router
154
+ - $parameters
155
+ - batch_size
156
+ type: ComponentMappingDefinition
157
+ value: "{{config.get('batch_size', 1000000)}}"
158
+ description: batch size count for dynamic stream partition router (slicer).
159
+
160
+ definitions:
161
+ streams:
162
+ get_spreadsheet_info_and_sheets:
163
+ type: DeclarativeStream
164
+ name: get_spreadsheet_info_and_sheets
165
+ retriever:
166
+ type: SimpleRetriever
167
+ requester:
168
+ $ref: "#/definitions/base_requester"
169
+ path: >-
170
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}?includeGridData=false&alt=json
171
+ http_method: GET
172
+ error_handler:
173
+ type: CompositeErrorHandler
174
+ error_handlers:
175
+ - type: DefaultErrorHandler
176
+ backoff_strategies:
177
+ - type: ExponentialBackoffStrategy
178
+ response_filters:
179
+ $ref: "#/definitions/response_filters/response_error_filters"
180
+ record_selector:
181
+ type: RecordSelector
182
+ extractor:
183
+ type: DpathExtractor
184
+ field_path:
185
+ - sheets
186
+ - "*"
187
+ - properties
188
+ record_filter:
189
+ type: RecordFilter
190
+ condition: '{{ record["sheetType"] == "GRID" and record["gridProperties"]["rowCount"] > 0}}'
191
+ schema_loader:
192
+ type: InlineSchemaLoader
193
+ schema:
194
+ $ref: "#/schemas/sheets"
195
+ base_requester:
196
+ type: HttpRequester
197
+ url_base: https://sheets.googleapis.com/v4/spreadsheets/
198
+ use_cache: true
199
+ authenticator: "#/definitions/authenticator"
200
+ retrievers:
201
+ components_resolver_retriever:
202
+ type: SimpleRetriever
203
+ paginator:
204
+ type: NoPagination
205
+ record_selector:
206
+ type: RecordSelector
207
+ extractor:
208
+ type: DpathExtractor
209
+ field_path:
210
+ - sheets
211
+ partition_router:
212
+ type: SubstreamPartitionRouter
213
+ parent_stream_configs:
214
+ - type: ParentStreamConfig
215
+ parent_key: title
216
+ partition_field: sheet_id
217
+ stream:
218
+ $ref: "#/definitions/streams/get_spreadsheet_info_and_sheets"
219
+ requester:
220
+ $ref: "#/definitions/base_requester"
221
+ description: spreadsheet_id can be either the full url to spreadsheet or the spreadsheet id.
222
+ http_method: GET
223
+ path: >-
224
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}?includeGridData=true&ranges={{stream_partition.sheet_id | urlencode}}!1:1&alt=json
225
+ error_handler:
226
+ type: CompositeErrorHandler
227
+ error_handlers:
228
+ - type: DefaultErrorHandler
229
+ response_filters:
230
+ $ref: "#/definitions/response_filters/single_sheet_response_error_filters"
231
+ response_filters:
232
+ expected_one_sheet:
233
+ type: HttpResponseFilter
234
+ action: FAIL
235
+ predicate: >-
236
+ {{ 'sheets' in response and response["sheets"] | length != 1 }}
237
+ # The error message lacks detail because error_message can't interpolate stream_slice values (e.g. stream_slice["potato"]).
238
+ error_message: >-
239
+ Unable to read the schema of sheet. Error: Unexpected return
240
+ result: Sheet was
241
+ expected to contain data on exactly 1 sheet.
242
+ ignore_duplicate_headers:
243
+ type: HttpResponseFilter
244
+ action: IGNORE
245
+ predicate: >-
246
+ {{ response["sheets"][0]["data"][0]["rowData"][0]["values"] |
247
+ map(attribute="formattedValue") | list | length !=
248
+ response["sheets"][0]["data"][0]["rowData"][0]["values"] |
249
+ map(attribute="formattedValue") | list | unique | list | length }}
250
+ error_message: >-
251
+ {%- set headers_found = response["sheets"][0]["data"][0]["rowData"][0]["values"] | map(attribute="formattedValue") | list -%}
252
+ {%- set headers_count = {} -%}
253
+ {%- set duplicate_fields = [] -%}
254
+ {%- for headerFound in headers_found -%}
255
+ {%- if headerFound is not none -%}
256
+ {%- set headers_count = headers_count.update({headerFound: headers_count.get(headerFound, 0) + 1}) or headers_count -%}
257
+ {%- if headers_count.get(headerFound) > 1 and headerFound not in duplicate_fields -%}
258
+ {%- set duplicate_fields = duplicate_fields.append(headerFound) -%}
259
+ {%- endif -%}
260
+ {%- endif -%}
261
+ {%- endfor -%}
262
+ Duplicate headers found in sheet {{ response["sheets"][0]["properties"]["title"] }}.
263
+ Ignoring them: {{ duplicate_fields }}
264
+ fail_duplicate_headers:
265
+ $ref: "#/definitions/response_filters/ignore_duplicate_headers"
266
+ action: FAIL
267
+ error_message: >-
268
+ {%- set headers_found = response["sheets"][0]["data"][0]["rowData"][0]["values"] | map(attribute="formattedValue") | list -%}
269
+ {%- set headers_count = {} -%}
270
+ {%- set duplicate_fields = [] -%}
271
+ {%- for headerFound in headers_found -%}
272
+ {%- if headerFound is not none -%}
273
+ {%- set headers_count = headers_count.update({headerFound: headers_count.get(headerFound, 0) + 1}) or headers_count -%}
274
+ {%- if headers_count.get(headerFound) > 1 and headerFound not in duplicate_fields -%}
275
+ {%- set duplicate_fields = duplicate_fields.append(headerFound) -%}
276
+ {%- endif -%}
277
+ {%- endif -%}
278
+ {%- endfor -%}
279
+ The following duplicate headers were found in the sheet.
280
+ Please fix them to continue: {{ duplicate_fields }}
281
+ server_error:
282
+ type: HttpResponseFilter
283
+ action: RETRY
284
+ http_codes:
285
+ - 500
286
+ - 502
287
+ - 503
288
+ error_message: >-
289
+ There was an issue
290
+ with the Google Sheets API. This is usually a temporary issue from
291
+ Google's side. Please try again. If this issue persists, contact
292
+ support
293
+ forbidden:
294
+ type: HttpResponseFilter
295
+ action: FAIL
296
+ http_codes:
297
+ - 403
298
+ error_message: >-
299
+ The authenticated Google Sheets user does not have permissions to view the
300
+ spreadsheet with id {{config["spreadsheet_id"]}}. Please ensure the authenticated user has access
301
+ to the Spreadsheet and reauthenticate. If the issue persists, contact support.
302
+ The caller does not have right permissions.
303
+ not_found:
304
+ type: HttpResponseFilter
305
+ action: FAIL
306
+ http_codes:
307
+ - 404
308
+ error_message: >-
309
+ The requested Google Sheets spreadsheet with id {{config["spreadsheet_id"]}} does not exist.
310
+ Please ensure the Spreadsheet Link you have set is valid and the spreadsheet exists. If the issue persists, contact support
311
+ rate_limit:
312
+ type: HttpResponseFilter
313
+ action: RATE_LIMITED
314
+ http_codes:
315
+ - 429
316
+ error_message: >-
317
+ Rate limit has been
318
+ reached. Please try later or request a higher quota for your account.
319
+ single_sheet_response_error_filters:
320
+ - $ref: "#/definitions/response_filters/expected_one_sheet"
321
+ - $ref: "#/definitions/response_filters/ignore_duplicate_headers"
322
+ check_operation_single_sheet_response_error_filters:
323
+ - $ref: "#/definitions/response_filters/expected_one_sheet"
324
+ - $ref: "#/definitions/response_filters/fail_duplicate_headers"
325
+ response_error_filters:
326
+ - $ref: "#/definitions/response_filters/server_error"
327
+ - $ref: "#/definitions/response_filters/forbidden"
328
+ - $ref: "#/definitions/response_filters/not_found"
329
+ - $ref: "#/definitions/response_filters/rate_limit"
330
+ schema_type_identifier:
331
+ key_pointer:
332
+ - formattedValue
333
+ schema_pointer:
334
+ - values
335
+ jwt_authenticator:
336
+ type: JwtAuthenticator
337
+ secret_key: "{{ json_loads(config['credentials']['service_account_info'])['private_key'] }}"
338
+ algorithm: "RS256"
339
+ token_duration: 3600
340
+ jwt_payload:
341
+ aud: "{{ json_loads(config['credentials']['service_account_info'])['token_uri'] }}"
342
+ iss: "{{ json_loads(config['credentials']['service_account_info'])['client_email'] }}"
343
+ additional_jwt_payload:
344
+ scope: "https://www.googleapis.com/auth/spreadsheets.readonly https://www.googleapis.com/auth/drive.readonly"
345
+ oauth_authenticator:
346
+ type: OAuthAuthenticator
347
+ refresh_request_body: {}
348
+ token_refresh_endpoint: https://www.googleapis.com/oauth2/v4/token
349
+ grant_type: refresh_token
350
+ client_id: '{{ config["credentials"]["client_id"] }}'
351
+ client_secret: '{{ config["credentials"]["client_secret"] }}'
352
+ refresh_token: '{{ config["credentials"]["refresh_token"] }}'
353
+ jwt_profile_assertion_oauth_authenticator:
354
+ type: OAuthAuthenticator
355
+ token_refresh_endpoint: https://oauth2.googleapis.com/token
356
+ refresh_request_headers:
357
+ Content-Type: application/x-www-form-urlencoded
358
+ use_profile_assertion: true
359
+ profile_assertion:
360
+ $ref: "#/definitions/jwt_authenticator"
361
+ authenticator:
362
+ type: SelectiveAuthenticator
363
+ authenticator_selection_path: ["credentials", "auth_type"]
364
+ authenticators:
365
+ Client: "#/definitions/oauth_authenticator"
366
+ Service: "#/definitions/jwt_profile_assertion_oauth_authenticator"
367
+
368
+ schemas:
369
+ sheets:
370
+ type: object
371
+ $schema: http://json-schema.org/schema#
372
+ additionalProperties: true
373
+ properties:
374
+ gridProperties:
375
+ type:
376
+ - object
377
+ - "null"
378
+ properties:
379
+ columnCount:
380
+ type:
381
+ - number
382
+ - "null"
383
+ rowCount:
384
+ type:
385
+ - number
386
+ - "null"
387
+ index:
388
+ type:
389
+ - number
390
+ - "null"
391
+ sheetId:
392
+ type:
393
+ - number
394
+ - "null"
395
+ sheetType:
396
+ type:
397
+ - string
398
+ - "null"
399
+ title:
400
+ type:
401
+ - string
402
+ - "null"
403
+
404
+ concurrency_level:
405
+ type: ConcurrencyLevel
406
+ default_concurrency: 1
407
+ max_concurrency: 1
@@ -1,5 +1,5 @@
1
1
  #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
5
 
@@ -1,5 +1,5 @@
1
1
  #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
5
 
@@ -1,15 +1,53 @@
1
1
  #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
5
 
6
6
  import sys
7
+ import traceback
8
+ from datetime import datetime
9
+ from typing import List
7
10
 
8
- from airbyte_cdk.entrypoint import launch
11
+ from orjson import orjson
9
12
 
10
- from .source import SourceGoogleSheets
13
+ from airbyte_cdk.entrypoint import AirbyteEntrypoint, launch
14
+ from airbyte_cdk.models import AirbyteErrorTraceMessage, AirbyteMessage, AirbyteMessageSerializer, AirbyteTraceMessage, TraceType, Type
15
+ from source_google_sheets.source import SourceGoogleSheets
16
+
17
+
18
def _get_source(args: List[str]):
    """Build the ``SourceGoogleSheets`` instance for the given CLI arguments.

    The catalog, config and state paths are extracted from ``args`` with the
    ``AirbyteEntrypoint.extract_*`` helpers. Each path that is truthy is read
    through the matching ``SourceGoogleSheets.read_*`` call; otherwise ``None``
    is passed to the constructor in its place.

    Any exception raised while reading those inputs or constructing the source
    is caught here rather than propagated: an ERROR trace message carrying the
    exception text and the formatted traceback is serialized and printed, and
    ``None`` is returned so the caller can skip launching.
    """
    catalog_path = AirbyteEntrypoint.extract_catalog(args)
    config_path = AirbyteEntrypoint.extract_config(args)
    state_path = AirbyteEntrypoint.extract_state(args)

    try:
        # Read in catalog -> config -> state order, then construct.
        catalog = SourceGoogleSheets.read_catalog(catalog_path) if catalog_path else None
        config = SourceGoogleSheets.read_config(config_path) if config_path else None
        state = SourceGoogleSheets.read_state(state_path) if state_path else None
        return SourceGoogleSheets(catalog, config, state)
    except Exception as error:
        # Timestamp is taken first, in milliseconds since the epoch.
        emitted_at_ms = int(datetime.now().timestamp() * 1000)
        error_details = AirbyteErrorTraceMessage(
            message=f"Error starting the sync. This could be due to an invalid configuration or catalog. Please contact Support for assistance. Error: {error}",
            stack_trace=traceback.format_exc(),
        )
        trace_message = AirbyteTraceMessage(
            type=TraceType.ERROR,
            emitted_at=emitted_at_ms,
            error=error_details,
        )
        airbyte_message = AirbyteMessage(type=Type.TRACE, trace=trace_message)
        serialized = orjson.dumps(AirbyteMessageSerializer.dump(airbyte_message))
        print(serialized.decode())
        return None
11
47
 
12
48
 
13
49
def run():
    """Entry point: build the source from ``sys.argv`` and launch it.

    The arguments after the program name are handed to ``_get_source``. When
    that returns a falsy value, nothing is launched and the function simply
    returns; otherwise the source is passed to ``launch`` together with the
    same arguments.
    """
    cli_args = sys.argv[1:]
    source = _get_source(cli_args)
    if not source:
        # Nothing to launch without a source.
        return
    launch(source, cli_args)