airbyte-source-google-sheets 0.8.4__py3-none-any.whl → 0.9.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,408 @@
1
+ version: 6.7.0
2
+
3
+ type: DeclarativeSource
4
+
5
+ check:
6
+ type: CheckDynamicStream
7
+ stream_count: 1
8
+ use_check_availability: false
9
+
10
+ dynamic_streams:
11
+ - type: DynamicDeclarativeStream
12
+ stream_template:
13
+ type: DeclarativeStream
14
+ name: ""
15
+ $parameters:
16
+ i: 123
17
+ primary_key: []
18
+ retriever:
19
+ type: SimpleRetriever
20
+ $parameters:
21
+ row_count: 0
22
+ sheet_id: ""
23
+ batch_size: 0
24
+ partition_router:
25
+ type: CustomPartitionRouter
26
+ class_name: "source_google_sheets.components.partition_routers.RangePartitionRouter"
27
+ paginator:
28
+ type: NoPagination
29
+ record_selector:
30
+ decoder:
31
+ type: JsonDecoder
32
+ extractor:
33
+ type: CustomRecordExtractor
34
+ class_name: source_google_sheets.components.DpathSchemaMatchingExtractor
35
+ description: Extract record list of values (rows) and matches such values to correct schema property to generate individual records.
36
+ field_path:
37
+ - valueRanges
38
+ - "*"
39
+ $parameters:
40
+ schema_type_identifier:
41
+ $ref: "#/definitions/schema_type_identifier"
42
+ values_to_match_key: "values"
43
+ properties_to_match: ""
44
+ type: RecordSelector
45
+ $parameters:
46
+ name: ""
47
+ requester:
48
+ $ref: "#/definitions/base_requester"
49
+ $parameters:
50
+ sheet_id: ""
51
+ name: ""
52
+ http_method: GET
53
+ path: >-
54
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}/values:batchGet?ranges={{parameters["sheet_id"] | urlencode}}!{{stream_partition.start_range}}:{{stream_partition.end_range}}&majorDimension=ROWS&alt=json
55
+ error_handler:
56
+ type: DefaultErrorHandler
57
+ backoff_strategies:
58
+ - type: ExponentialBackoffStrategy
59
+ response_filters:
60
+ $ref: "#/definitions/response_filters/response_error_filters"
61
+ schema_loader:
62
+ type: DynamicSchemaLoader
63
+ retriever:
64
+ type: SimpleRetriever
65
+ paginator:
66
+ type: NoPagination
67
+ record_selector:
68
+ extractor:
69
+ type: CustomRecordExtractor
70
+ class_name: source_google_sheets.components.DpathSchemaExtractor
71
+ parameters:
72
+ schema_type_identifier:
73
+ $ref: "#/definitions/schema_type_identifier"
74
+ field_path:
75
+ - sheets
76
+ - "*"
77
+ - data
78
+ - "*"
79
+ - rowData
80
+ - "*"
81
+ type: RecordSelector
82
+ requester:
83
+ $ref: "#/definitions/base_requester"
84
+ $parameters:
85
+ sheet_id: ""
86
+ http_method: GET
87
+ path: >-
88
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}?includeGridData=true&ranges={{parameters["sheet_id"] | urlencode}}!1:1&alt=json
89
+ error_handler:
90
+ type: CompositeErrorHandler
91
+ error_handlers:
92
+ - type: DefaultErrorHandler
93
+ backoff_strategies:
94
+ - type: ExponentialBackoffStrategy
95
+ response_filters:
96
+ $ref: "#/definitions/response_filters/single_sheet_response_error_filters"
97
+ schema_type_identifier:
98
+ $ref: "#/definitions/schema_type_identifier"
99
+ components_resolver:
100
+ type: HttpComponentsResolver
101
+ description: We use first row of sheet to obtain data.
102
+ retriever:
103
+ $ref: "#/definitions/retrievers/components_resolver_retriever"
104
+ components_mapping:
105
+ - field_path:
106
+ - name
107
+ type: ComponentMappingDefinition
108
+ value: "{{components_values['properties']['title']}}"
109
+ value_type: string
110
+ description: name for dynamic stream.
111
+ - field_path:
112
+ - schema_loader
113
+ - retriever
114
+ - requester
115
+ - $parameters
116
+ - sheet_id
117
+ type: ComponentMappingDefinition
118
+ value: "{{components_values['properties']['title']}}"
119
+ description: sheet_id for dynamic schema loader requester.
120
+ - field_path:
121
+ - retriever
122
+ - requester
123
+ - $parameters
124
+ - sheet_id
125
+ type: ComponentMappingDefinition
126
+ value: "{{components_values['properties']['title']}}"
127
+ description: sheet_id for dynamic stream retriever requester.
128
+ - field_path:
129
+ - retriever
130
+ - record_selector
131
+ - extractor
132
+ - $parameters
133
+ - properties_to_match
134
+ type: ComponentMappingDefinition
135
+ value: "{{components_values['data'][0].get('rowData', [{}])[0]}}"
136
+ description: indexed_schema to match with row values.
137
+ - field_path:
138
+ - retriever
139
+ - partition_router
140
+ - $parameters
141
+ - row_count
142
+ type: ComponentMappingDefinition
143
+ value: "{{components_values['properties']['gridProperties']['rowCount']}}"
144
+ - field_path:
145
+ - retriever
146
+ - partition_router
147
+ - $parameters
148
+ - sheet_id
149
+ type: ComponentMappingDefinition
150
+ value: "{{components_values['properties']['title']}}"
151
+ description: sheet_id for retriever.
152
+ - field_path:
153
+ - retriever
154
+ - partition_router
155
+ - $parameters
156
+ - batch_size
157
+ type: ComponentMappingDefinition
158
+ value: "{{config.get('batch_size', 1000000)}}"
159
+ description: batch size count for dynamic stream partition router (slicer).
160
+
161
+ definitions:
162
+ streams:
163
+ get_spreadsheet_info_and_sheets:
164
+ type: DeclarativeStream
165
+ name: get_spreadsheet_info_and_sheets
166
+ retriever:
167
+ type: SimpleRetriever
168
+ requester:
169
+ $ref: "#/definitions/base_requester"
170
+ path: >-
171
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}?includeGridData=false&alt=json
172
+ http_method: GET
173
+ error_handler:
174
+ type: CompositeErrorHandler
175
+ error_handlers:
176
+ - type: DefaultErrorHandler
177
+ backoff_strategies:
178
+ - type: ExponentialBackoffStrategy
179
+ response_filters:
180
+ $ref: "#/definitions/response_filters/response_error_filters"
181
+ record_selector:
182
+ type: RecordSelector
183
+ extractor:
184
+ type: DpathExtractor
185
+ field_path:
186
+ - sheets
187
+ - "*"
188
+ - properties
189
+ record_filter:
190
+ type: RecordFilter
191
+ condition: '{{ record["sheetType"] == "GRID" and record["gridProperties"]["rowCount"] > 0}}'
192
+ schema_loader:
193
+ type: InlineSchemaLoader
194
+ schema:
195
+ $ref: "#/schemas/sheets"
196
+ base_requester:
197
+ type: HttpRequester
198
+ url_base: https://sheets.googleapis.com/v4/spreadsheets/
199
+ use_cache: true
200
+ authenticator: "#/definitions/authenticator"
201
+ retrievers:
202
+ components_resolver_retriever:
203
+ type: SimpleRetriever
204
+ paginator:
205
+ type: NoPagination
206
+ record_selector:
207
+ type: RecordSelector
208
+ extractor:
209
+ type: DpathExtractor
210
+ field_path:
211
+ - sheets
212
+ partition_router:
213
+ type: SubstreamPartitionRouter
214
+ parent_stream_configs:
215
+ - type: ParentStreamConfig
216
+ parent_key: title
217
+ partition_field: sheet_id
218
+ stream:
219
+ $ref: "#/definitions/streams/get_spreadsheet_info_and_sheets"
220
+ requester:
221
+ $ref: "#/definitions/base_requester"
222
+ description: spreadsheet_id can be either the full url to spreadsheet or the spreadsheet id.
223
+ http_method: GET
224
+ path: >-
225
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}?includeGridData=true&ranges={{stream_partition.sheet_id | urlencode}}!1:1&alt=json
226
+ error_handler:
227
+ type: CompositeErrorHandler
228
+ error_handlers:
229
+ - type: DefaultErrorHandler
230
+ response_filters:
231
+ $ref: "#/definitions/response_filters/single_sheet_response_error_filters"
232
+ response_filters:
233
+ expected_one_sheet:
234
+ type: HttpResponseFilter
235
+ action: FAIL
236
+ predicate: >-
237
+ {{ 'sheets' in response and response["sheets"] | length != 1 }}
238
+ # The error message lacks detail because error_message cannot interpolate stream_slice values (e.g. stream_slice["potato"]).
239
+ error_message: >-
240
+ Unable to read the schema of sheet. Error: Unexpected return
241
+ result: Sheet was
242
+ expected to contain data on exactly 1 sheet.
243
+ ignore_duplicate_headers:
244
+ type: HttpResponseFilter
245
+ action: IGNORE
246
+ predicate: >-
247
+ {{ response["sheets"][0]["data"][0]["rowData"][0]["values"] |
248
+ map(attribute="formattedValue") | list | length !=
249
+ response["sheets"][0]["data"][0]["rowData"][0]["values"] |
250
+ map(attribute="formattedValue") | list | unique | list | length }}
251
+ error_message: >-
252
+ {%- set headers_found = response["sheets"][0]["data"][0]["rowData"][0]["values"] | map(attribute="formattedValue") | list -%}
253
+ {%- set headers_count = {} -%}
254
+ {%- set duplicate_fields = [] -%}
255
+ {%- for headerFound in headers_found -%}
256
+ {%- if headerFound is not none -%}
257
+ {%- set headers_count = headers_count.update({headerFound: headers_count.get(headerFound, 0) + 1}) or headers_count -%}
258
+ {%- if headers_count.get(headerFound) > 1 and headerFound not in duplicate_fields -%}
259
+ {%- set duplicate_fields = duplicate_fields.append(headerFound) -%}
260
+ {%- endif -%}
261
+ {%- endif -%}
262
+ {%- endfor -%}
263
+ Duplicate headers found in sheet {{ response["sheets"][0]["properties"]["title"] }}.
264
+ Ignoring them: {{ duplicate_fields }}
265
+ fail_duplicate_headers:
266
+ $ref: "#/definitions/response_filters/ignore_duplicate_headers"
267
+ action: FAIL
268
+ error_message: >-
269
+ {%- set headers_found = response["sheets"][0]["data"][0]["rowData"][0]["values"] | map(attribute="formattedValue") | list -%}
270
+ {%- set headers_count = {} -%}
271
+ {%- set duplicate_fields = [] -%}
272
+ {%- for headerFound in headers_found -%}
273
+ {%- if headerFound is not none -%}
274
+ {%- set headers_count = headers_count.update({headerFound: headers_count.get(headerFound, 0) + 1}) or headers_count -%}
275
+ {%- if headers_count.get(headerFound) > 1 and headerFound not in duplicate_fields -%}
276
+ {%- set duplicate_fields = duplicate_fields.append(headerFound) -%}
277
+ {%- endif -%}
278
+ {%- endif -%}
279
+ {%- endfor -%}
280
+ The following duplicate headers were found in the sheet.
281
+ Please fix them to continue: {{ duplicate_fields }}
282
+ server_error:
283
+ type: HttpResponseFilter
284
+ action: RETRY
285
+ http_codes:
286
+ - 500
287
+ - 502
288
+ - 503
289
+ error_message: >-
290
+ There was an issue
291
+ with the Google Sheets API. This is usually a temporary issue from
292
+ Google's side. Please try again. If this issue persists, contact
293
+ support
294
+ forbidden:
295
+ type: HttpResponseFilter
296
+ action: FAIL
297
+ http_codes:
298
+ - 403
299
+ error_message: >-
300
+ The authenticated Google Sheets user does not have permissions to view the
301
+ spreadsheet with id {{config["spreadsheet_id"]}}. Please ensure the authenticated user has access
302
+ to the Spreadsheet and reauthenticate. If the issue persists, contact support.
303
+ The caller does not have right permissions.
304
+ not_found:
305
+ type: HttpResponseFilter
306
+ action: FAIL
307
+ http_codes:
308
+ - 404
309
+ error_message: >-
310
+ The requested Google Sheets spreadsheet with id {{config["spreadsheet_id"]}} does not exist.
311
+ Please ensure the Spreadsheet Link you have set is valid and the spreadsheet exists. If the issue persists, contact support
312
+ rate_limit:
313
+ type: HttpResponseFilter
314
+ action: RATE_LIMITED
315
+ http_codes:
316
+ - 429
317
+ error_message: >-
318
+ Rate limit has been
319
+ reached. Please try later or request a higher quota for your account.
320
+ single_sheet_response_error_filters:
321
+ - $ref: "#/definitions/response_filters/expected_one_sheet"
322
+ - $ref: "#/definitions/response_filters/ignore_duplicate_headers"
323
+ check_operation_single_sheet_response_error_filters:
324
+ - $ref: "#/definitions/response_filters/expected_one_sheet"
325
+ - $ref: "#/definitions/response_filters/fail_duplicate_headers"
326
+ response_error_filters:
327
+ - $ref: "#/definitions/response_filters/server_error"
328
+ - $ref: "#/definitions/response_filters/forbidden"
329
+ - $ref: "#/definitions/response_filters/not_found"
330
+ - $ref: "#/definitions/response_filters/rate_limit"
331
+ schema_type_identifier:
332
+ key_pointer:
333
+ - formattedValue
334
+ schema_pointer:
335
+ - values
336
+ jwt_authenticator:
337
+ type: JwtAuthenticator
338
+ secret_key: "{{ json_loads(config['credentials']['service_account_info'])['private_key'] }}"
339
+ algorithm: "RS256"
340
+ token_duration: 3600
341
+ jwt_payload:
342
+ aud: "{{ json_loads(config['credentials']['service_account_info'])['token_uri'] }}"
343
+ iss: "{{ json_loads(config['credentials']['service_account_info'])['client_email'] }}"
344
+ additional_jwt_payload:
345
+ scope: "https://www.googleapis.com/auth/spreadsheets.readonly https://www.googleapis.com/auth/drive.readonly"
346
+ oauth_authenticator:
347
+ type: OAuthAuthenticator
348
+ refresh_request_body: {}
349
+ token_refresh_endpoint: https://www.googleapis.com/oauth2/v4/token
350
+ grant_type: refresh_token
351
+ client_id: '{{ config["credentials"]["client_id"] }}'
352
+ client_secret: '{{ config["credentials"]["client_secret"] }}'
353
+ refresh_token: '{{ config["credentials"]["refresh_token"] }}'
354
+ jwt_profile_assertion_oauth_authenticator:
355
+ type: OAuthAuthenticator
356
+ token_refresh_endpoint: https://oauth2.googleapis.com/token
357
+ refresh_request_headers:
358
+ Content-Type: application/x-www-form-urlencoded
359
+ use_profile_assertion: true
360
+ profile_assertion:
361
+ $ref: "#/definitions/jwt_authenticator"
362
+ authenticator:
363
+ type: SelectiveAuthenticator
364
+ authenticator_selection_path: ["credentials", "auth_type"]
365
+ authenticators:
366
+ Client: "#/definitions/oauth_authenticator"
367
+ Service: "#/definitions/jwt_profile_assertion_oauth_authenticator"
368
+
369
+ schemas:
370
+ sheets:
371
+ type: object
372
+ $schema: http://json-schema.org/schema#
373
+ additionalProperties: true
374
+ properties:
375
+ gridProperties:
376
+ type:
377
+ - object
378
+ - "null"
379
+ properties:
380
+ columnCount:
381
+ type:
382
+ - number
383
+ - "null"
384
+ rowCount:
385
+ type:
386
+ - number
387
+ - "null"
388
+ index:
389
+ type:
390
+ - number
391
+ - "null"
392
+ sheetId:
393
+ type:
394
+ - number
395
+ - "null"
396
+ sheetType:
397
+ type:
398
+ - string
399
+ - "null"
400
+ title:
401
+ type:
402
+ - string
403
+ - "null"
404
+
405
+ concurrency_level:
406
+ type: ConcurrencyLevel
407
+ default_concurrency: 1
408
+ max_concurrency: 1
@@ -1,5 +1,5 @@
1
1
  #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
5
 
@@ -1,5 +1,5 @@
1
1
  #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
5
 
@@ -1,15 +1,53 @@
1
1
  #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
5
 
6
6
  import sys
7
+ import traceback
8
+ from datetime import datetime
9
+ from typing import List
7
10
 
8
- from airbyte_cdk.entrypoint import launch
11
+ from orjson import orjson
9
12
 
10
- from .source import SourceGoogleSheets
13
+ from airbyte_cdk.entrypoint import AirbyteEntrypoint, launch
14
+ from airbyte_cdk.models import AirbyteErrorTraceMessage, AirbyteMessage, AirbyteMessageSerializer, AirbyteTraceMessage, TraceType, Type
15
+ from source_google_sheets.source import SourceGoogleSheets
16
+
17
+
18
def _get_source(args: List[str]):
    """Build the ``SourceGoogleSheets`` connector from the CLI arguments.

    The catalog, config and state paths are pulled out of ``args`` with the
    ``AirbyteEntrypoint`` extractors. Each file is only read when its path was
    supplied; otherwise ``None`` is passed in its place.

    If reading the inputs or constructing the source raises, an Airbyte TRACE
    message of type ERROR (carrying the error text and the formatted
    traceback) is printed to stdout and ``None`` is returned, so the caller
    can skip launching the connector.
    """
    catalog_path = AirbyteEntrypoint.extract_catalog(args)
    config_path = AirbyteEntrypoint.extract_config(args)
    state_path = AirbyteEntrypoint.extract_state(args)

    try:
        catalog = SourceGoogleSheets.read_catalog(catalog_path) if catalog_path else None
        config = SourceGoogleSheets.read_config(config_path) if config_path else None
        state = SourceGoogleSheets.read_state(state_path) if state_path else None
        return SourceGoogleSheets(catalog, config, state)
    except Exception as error:
        # Top-level boundary: report the failure as a protocol message on
        # stdout instead of letting the exception escape.
        emitted_at_ms = int(datetime.now().timestamp() * 1000)
        error_details = AirbyteErrorTraceMessage(
            message=f"Error starting the sync. This could be due to an invalid configuration or catalog. Please contact Support for assistance. Error: {error}",
            stack_trace=traceback.format_exc(),
        )
        trace_message = AirbyteMessage(
            type=Type.TRACE,
            trace=AirbyteTraceMessage(
                type=TraceType.ERROR,
                emitted_at=emitted_at_ms,
                error=error_details,
            ),
        )
        serialized = orjson.dumps(AirbyteMessageSerializer.dump(trace_message))
        print(serialized.decode())
        return None
11
47
 
12
48
 
13
49
  def run():
14
- source = SourceGoogleSheets()
15
- launch(source, sys.argv[1:])
50
+ _args = sys.argv[1:]
51
+ source = _get_source(_args)
52
+ if source:
53
+ launch(source, _args)