airbyte-source-google-sheets 0.8.5__py3-none-any.whl → 0.9.0rc1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,407 @@
1
+ version: 6.7.0
2
+
3
+ type: DeclarativeSource
4
+
5
+ check:
6
+ type: CheckDynamicStream
7
+ stream_count: 1
8
+ use_check_availability: false
9
+
10
+ dynamic_streams:
11
+ - type: DynamicDeclarativeStream
12
+ stream_template:
13
+ type: DeclarativeStream
14
+ name: ""
15
+ $parameters:
16
+ i: 123
17
+ primary_key: []
18
+ retriever:
19
+ type: SimpleRetriever
20
+ $parameters:
21
+ row_count: 0
22
+ sheet_id: ""
23
+ batch_size: 0
24
+ partition_router:
25
+ type: CustomPartitionRouter
26
+ class_name: "source_google_sheets.components.partition_routers.RangePartitionRouter"
27
+ paginator:
28
+ type: NoPagination
29
+ record_selector:
30
+ decoder:
31
+ type: JsonDecoder
32
+ extractor:
33
+ type: CustomRecordExtractor
34
+ class_name: source_google_sheets.components.DpathSchemaMatchingExtractor
35
+ description: Extract record list of values (rows) and matches such values to correct schema property to generate individual records.
36
+ field_path:
37
+ - valueRanges
38
+ - "*"
39
+ $parameters:
40
+ schema_type_identifier:
41
+ $ref: "#/definitions/schema_type_identifier"
42
+ values_to_match_key: "values"
43
+ properties_to_match: ""
44
+ type: RecordSelector
45
+ $parameters:
46
+ name: ""
47
+ requester:
48
+ $ref: "#/definitions/base_requester"
49
+ $parameters:
50
+ sheet_id: ""
51
+ name: ""
52
+ http_method: GET
53
+ path: >-
54
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}/values:batchGet?ranges={{parameters["sheet_id"]}}!{{stream_partition.start_range}}:{{stream_partition.end_range}}&majorDimension=ROWS&alt=json
55
+ error_handler:
56
+ type: DefaultErrorHandler
57
+ backoff_strategies:
58
+ - type: ExponentialBackoffStrategy
59
+ response_filters:
60
+ $ref: "#/definitions/response_filters/response_error_filters"
61
+ schema_loader:
62
+ type: DynamicSchemaLoader
63
+ retriever:
64
+ type: SimpleRetriever
65
+ paginator:
66
+ type: NoPagination
67
+ record_selector:
68
+ extractor:
69
+ type: CustomRecordExtractor
70
+ class_name: source_google_sheets.components.DpathSchemaExtractor
71
+ parameters:
72
+ schema_type_identifier:
73
+ $ref: "#/definitions/schema_type_identifier"
74
+ field_path:
75
+ - sheets
76
+ - "*"
77
+ - data
78
+ - "*"
79
+ - rowData
80
+ - "*"
81
+ type: RecordSelector
82
+ requester:
83
+ $ref: "#/definitions/base_requester"
84
+ $parameters:
85
+ sheet_id: ""
86
+ http_method: GET
87
+ path: >-
88
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}?includeGridData=true&ranges={{parameters["sheet_id"]}}!1:1&alt=json
89
+ error_handler:
90
+ type: CompositeErrorHandler
91
+ error_handlers:
92
+ - type: DefaultErrorHandler
93
+ backoff_strategies:
94
+ - type: ExponentialBackoffStrategy
95
+ response_filters:
96
+ $ref: "#/definitions/response_filters/single_sheet_response_error_filters"
97
+ schema_type_identifier:
98
+ $ref: "#/definitions/schema_type_identifier"
99
+ components_resolver:
100
+ type: HttpComponentsResolver
101
+ description: We use first row of sheet to obtain data.
102
+ retriever:
103
+ $ref: "#/definitions/retrievers/components_resolver_retriever"
104
+ components_mapping:
105
+ - field_path:
106
+ - name
107
+ type: ComponentMappingDefinition
108
+ value: "{{components_values['properties']['title']}}"
109
+ description: name for dynamic stream.
110
+ - field_path:
111
+ - schema_loader
112
+ - retriever
113
+ - requester
114
+ - $parameters
115
+ - sheet_id
116
+ type: ComponentMappingDefinition
117
+ value: "{{components_values['properties']['title']}}"
118
+ description: sheet_id for dynamic schema loader requester.
119
+ - field_path:
120
+ - retriever
121
+ - requester
122
+ - $parameters
123
+ - sheet_id
124
+ type: ComponentMappingDefinition
125
+ value: "{{components_values['properties']['title']}}"
126
+ description: sheet_id for dynamic stream retriever requester.
127
+ - field_path:
128
+ - retriever
129
+ - record_selector
130
+ - extractor
131
+ - $parameters
132
+ - properties_to_match
133
+ type: ComponentMappingDefinition
134
+ value: "{{components_values['data'][0].get('rowData', [{}])[0]}}"
135
+ description: indexed_schema to match with row values.
136
+ - field_path:
137
+ - retriever
138
+ - partition_router
139
+ - $parameters
140
+ - row_count
141
+ type: ComponentMappingDefinition
142
+ value: "{{components_values['properties']['gridProperties']['rowCount']}}"
143
+ - field_path:
144
+ - retriever
145
+ - partition_router
146
+ - $parameters
147
+ - sheet_id
148
+ type: ComponentMappingDefinition
149
+ value: "{{components_values['properties']['title']}}"
150
+ description: sheet_id for retriever.
151
+ - field_path:
152
+ - retriever
153
+ - partition_router
154
+ - $parameters
155
+ - batch_size
156
+ type: ComponentMappingDefinition
157
+ value: "{{config.get('batch_size', 1000000)}}"
158
+ description: batch size count for dynamic stream partition router (slicer).
159
+
160
+ definitions:
161
+ streams:
162
+ get_spreadsheet_info_and_sheets:
163
+ type: DeclarativeStream
164
+ name: get_spreadsheet_info_and_sheets
165
+ retriever:
166
+ type: SimpleRetriever
167
+ requester:
168
+ $ref: "#/definitions/base_requester"
169
+ path: >-
170
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}?includeGridData=false&alt=json
171
+ http_method: GET
172
+ error_handler:
173
+ type: CompositeErrorHandler
174
+ error_handlers:
175
+ - type: DefaultErrorHandler
176
+ backoff_strategies:
177
+ - type: ExponentialBackoffStrategy
178
+ response_filters:
179
+ $ref: "#/definitions/response_filters/response_error_filters"
180
+ record_selector:
181
+ type: RecordSelector
182
+ extractor:
183
+ type: DpathExtractor
184
+ field_path:
185
+ - sheets
186
+ - "*"
187
+ - properties
188
+ record_filter:
189
+ type: RecordFilter
190
+ condition: '{{ record["sheetType"] == "GRID" and record["gridProperties"]["rowCount"] > 0}}'
191
+ schema_loader:
192
+ type: InlineSchemaLoader
193
+ schema:
194
+ $ref: "#/schemas/sheets"
195
+ base_requester:
196
+ type: HttpRequester
197
+ url_base: https://sheets.googleapis.com/v4/spreadsheets/
198
+ use_cache: true
199
+ authenticator: "#/definitions/authenticator"
200
+ retrievers:
201
+ components_resolver_retriever:
202
+ type: SimpleRetriever
203
+ paginator:
204
+ type: NoPagination
205
+ record_selector:
206
+ type: RecordSelector
207
+ extractor:
208
+ type: DpathExtractor
209
+ field_path:
210
+ - sheets
211
+ partition_router:
212
+ type: SubstreamPartitionRouter
213
+ parent_stream_configs:
214
+ - type: ParentStreamConfig
215
+ parent_key: title
216
+ partition_field: sheet_id
217
+ stream:
218
+ $ref: "#/definitions/streams/get_spreadsheet_info_and_sheets"
219
+ requester:
220
+ $ref: "#/definitions/base_requester"
221
+ description: spreadsheet_id can be either the full url to spreadsheet or the spreadsheet id.
222
+ http_method: GET
223
+ path: >-
224
+ {% if config["spreadsheet_id"] | regex_search("^(https://.*)") %}{{ config["spreadsheet_id"] | regex_search("/([-\\w]{20,})([/]?)") }}{% else %}{{ config["spreadsheet_id"] }}{% endif %}?includeGridData=true&ranges={{stream_partition.sheet_id}}!1:1&alt=json
225
+ error_handler:
226
+ type: CompositeErrorHandler
227
+ error_handlers:
228
+ - type: DefaultErrorHandler
229
+ response_filters:
230
+ $ref: "#/definitions/response_filters/single_sheet_response_error_filters"
231
+ response_filters:
232
+ expected_one_sheet:
233
+ type: HttpResponseFilter
234
+ action: FAIL
235
+ predicate: >-
236
+ {{ 'sheets' in response and response["sheets"] | length != 1 }}
237
+ # the error lacks detail because error_message can't interpolate stream_slice["potato"]
238
+ error_message: >-
239
+ Unable to read the schema of sheet. Error: Unexpected return
240
+ result: Sheet was
241
+ expected to contain data on exactly 1 sheet.
242
+ ignore_duplicate_headers:
243
+ type: HttpResponseFilter
244
+ action: IGNORE
245
+ predicate: >-
246
+ {{ response["sheets"][0]["data"][0]["rowData"][0]["values"] |
247
+ map(attribute="formattedValue") | list | length !=
248
+ response["sheets"][0]["data"][0]["rowData"][0]["values"] |
249
+ map(attribute="formattedValue") | list | unique | list | length }}
250
+ error_message: >-
251
+ {%- set headers_found = response["sheets"][0]["data"][0]["rowData"][0]["values"] | map(attribute="formattedValue") | list -%}
252
+ {%- set headers_count = {} -%}
253
+ {%- set duplicate_fields = [] -%}
254
+ {%- for headerFound in headers_found -%}
255
+ {%- if headerFound is not none -%}
256
+ {%- set headers_count = headers_count.update({headerFound: headers_count.get(headerFound, 0) + 1}) or headers_count -%}
257
+ {%- if headers_count.get(headerFound) > 1 and headerFound not in duplicate_fields -%}
258
+ {%- set duplicate_fields = duplicate_fields.append(headerFound) -%}
259
+ {%- endif -%}
260
+ {%- endif -%}
261
+ {%- endfor -%}
262
+ Duplicate headers found in sheet {{ response["sheets"][0]["properties"]["title"] }}.
263
+ Ignoring them: {{ duplicate_fields }}
264
+ fail_duplicate_headers:
265
+ $ref: "#/definitions/response_filters/ignore_duplicate_headers"
266
+ action: FAIL
267
+ error_message: >-
268
+ {%- set headers_found = response["sheets"][0]["data"][0]["rowData"][0]["values"] | map(attribute="formattedValue") | list -%}
269
+ {%- set headers_count = {} -%}
270
+ {%- set duplicate_fields = [] -%}
271
+ {%- for headerFound in headers_found -%}
272
+ {%- if headerFound is not none -%}
273
+ {%- set headers_count = headers_count.update({headerFound: headers_count.get(headerFound, 0) + 1}) or headers_count -%}
274
+ {%- if headers_count.get(headerFound) > 1 and headerFound not in duplicate_fields -%}
275
+ {%- set duplicate_fields = duplicate_fields.append(headerFound) -%}
276
+ {%- endif -%}
277
+ {%- endif -%}
278
+ {%- endfor -%}
279
+ The following duplicate headers were found in the sheet.
280
+ Please fix them to continue: {{ duplicate_fields }}
281
+ server_error:
282
+ type: HttpResponseFilter
283
+ action: RETRY
284
+ http_codes:
285
+ - 500
286
+ - 502
287
+ - 503
288
+ error_message: >-
289
+ There was an issue
290
+ with the Google Sheets API. This is usually a temporary issue from
291
+ Google's side. Please try again. If this issue persists, contact
292
+ support
293
+ forbidden:
294
+ type: HttpResponseFilter
295
+ action: FAIL
296
+ http_codes:
297
+ - 403
298
+ error_message: >-
299
+ The authenticated Google Sheets user does not have permissions to view the
300
+ spreadsheet with id {{config["spreadsheet_id"]}}. Please ensure the authenticated user has access
301
+ to the Spreadsheet and reauthenticate. If the issue persists, contact support.
302
+ The caller does not have right permissions.
303
+ not_found:
304
+ type: HttpResponseFilter
305
+ action: FAIL
306
+ http_codes:
307
+ - 404
308
+ error_message: >-
309
+ The requested Google Sheets spreadsheet with id {{config["spreadsheet_id"]}} does not exist.
310
+ Please ensure the Spreadsheet Link you have set is valid and the spreadsheet exists. If the issue persists, contact support
311
+ rate_limit:
312
+ type: HttpResponseFilter
313
+ action: RATE_LIMITED
314
+ http_codes:
315
+ - 429
316
+ error_message: >-
317
+ Rate limit has been
318
+ reached. Please try later or request a higher quota for your account.
319
+ single_sheet_response_error_filters:
320
+ - $ref: "#/definitions/response_filters/expected_one_sheet"
321
+ - $ref: "#/definitions/response_filters/ignore_duplicate_headers"
322
+ check_operation_single_sheet_response_error_filters:
323
+ - $ref: "#/definitions/response_filters/expected_one_sheet"
324
+ - $ref: "#/definitions/response_filters/fail_duplicate_headers"
325
+ response_error_filters:
326
+ - $ref: "#/definitions/response_filters/server_error"
327
+ - $ref: "#/definitions/response_filters/forbidden"
328
+ - $ref: "#/definitions/response_filters/not_found"
329
+ - $ref: "#/definitions/response_filters/rate_limit"
330
+ schema_type_identifier:
331
+ key_pointer:
332
+ - formattedValue
333
+ schema_pointer:
334
+ - values
335
+ jwt_authenticator:
336
+ type: JwtAuthenticator
337
+ secret_key: "{{ json_loads(config['credentials']['service_account_info'])['private_key'] }}"
338
+ algorithm: "RS256"
339
+ token_duration: 3600
340
+ jwt_payload:
341
+ aud: "{{ json_loads(config['credentials']['service_account_info'])['token_uri'] }}"
342
+ iss: "{{ json_loads(config['credentials']['service_account_info'])['client_email'] }}"
343
+ additional_jwt_payload:
344
+ scope: "https://www.googleapis.com/auth/spreadsheets.readonly https://www.googleapis.com/auth/drive.readonly"
345
+ oauth_authenticator:
346
+ type: OAuthAuthenticator
347
+ refresh_request_body: {}
348
+ token_refresh_endpoint: https://www.googleapis.com/oauth2/v4/token
349
+ grant_type: refresh_token
350
+ client_id: '{{ config["credentials"]["client_id"] }}'
351
+ client_secret: '{{ config["credentials"]["client_secret"] }}'
352
+ refresh_token: '{{ config["credentials"]["refresh_token"] }}'
353
+ jwt_profile_assertion_oauth_authenticator:
354
+ type: OAuthAuthenticator
355
+ token_refresh_endpoint: https://oauth2.googleapis.com/token
356
+ refresh_request_headers:
357
+ Content-Type: application/x-www-form-urlencoded
358
+ use_profile_assertion: true
359
+ profile_assertion:
360
+ $ref: "#/definitions/jwt_authenticator"
361
+ authenticator:
362
+ type: SelectiveAuthenticator
363
+ authenticator_selection_path: ["credentials", "auth_type"]
364
+ authenticators:
365
+ Client: "#/definitions/oauth_authenticator"
366
+ Service: "#/definitions/jwt_profile_assertion_oauth_authenticator"
367
+
368
+ schemas:
369
+ sheets:
370
+ type: object
371
+ $schema: http://json-schema.org/schema#
372
+ additionalProperties: true
373
+ properties:
374
+ gridProperties:
375
+ type:
376
+ - object
377
+ - "null"
378
+ properties:
379
+ columnCount:
380
+ type:
381
+ - number
382
+ - "null"
383
+ rowCount:
384
+ type:
385
+ - number
386
+ - "null"
387
+ index:
388
+ type:
389
+ - number
390
+ - "null"
391
+ sheetId:
392
+ type:
393
+ - number
394
+ - "null"
395
+ sheetType:
396
+ type:
397
+ - string
398
+ - "null"
399
+ title:
400
+ type:
401
+ - string
402
+ - "null"
403
+
404
+ concurrency_level:
405
+ type: ConcurrencyLevel
406
+ default_concurrency: 1
407
+ max_concurrency: 1
@@ -1,5 +1,5 @@
1
1
  #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
5
 
@@ -1,5 +1,5 @@
1
1
  #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
5
 
@@ -1,15 +1,53 @@
1
1
  #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
5
 
6
6
  import sys
7
+ import traceback
8
+ from datetime import datetime
9
+ from typing import List
7
10
 
8
- from airbyte_cdk.entrypoint import launch
11
+ from orjson import orjson
9
12
 
10
- from .source import SourceGoogleSheets
13
+ from airbyte_cdk.entrypoint import AirbyteEntrypoint, launch
14
+ from airbyte_cdk.models import AirbyteErrorTraceMessage, AirbyteMessage, AirbyteMessageSerializer, AirbyteTraceMessage, TraceType, Type
15
+ from source_google_sheets.source import SourceGoogleSheets
16
+
17
+
18
def _get_source(args: List[str]):
    """Build the Google Sheets source from the command-line arguments.

    The catalog, config and state paths are extracted from ``args``. Each
    file is read only when its path is truthy; otherwise ``None`` is passed
    to the source in its place.

    If reading the inputs or constructing the source raises, the failure is
    not propagated: an ERROR trace message (including the exception text and
    the formatted traceback) is printed as JSON and ``None`` is returned.
    """
    catalog_path = AirbyteEntrypoint.extract_catalog(args)
    config_path = AirbyteEntrypoint.extract_config(args)
    state_path = AirbyteEntrypoint.extract_state(args)

    try:
        catalog = SourceGoogleSheets.read_catalog(catalog_path) if catalog_path else None
        config = SourceGoogleSheets.read_config(config_path) if config_path else None
        state = SourceGoogleSheets.read_state(state_path) if state_path else None
        return SourceGoogleSheets(catalog, config, state)
    except Exception as error:
        # Top-level boundary: report the startup failure as a trace message
        # instead of letting the exception escape.
        emitted_at_ms = int(datetime.now().timestamp() * 1000)
        error_trace = AirbyteErrorTraceMessage(
            message=f"Error starting the sync. This could be due to an invalid configuration or catalog. Please contact Support for assistance. Error: {error}",
            stack_trace=traceback.format_exc(),
        )
        trace_message = AirbyteMessage(
            type=Type.TRACE,
            trace=AirbyteTraceMessage(
                type=TraceType.ERROR,
                emitted_at=emitted_at_ms,
                error=error_trace,
            ),
        )
        serialized = orjson.dumps(AirbyteMessageSerializer.dump(trace_message))
        print(serialized.decode())
        return None
11
47
 
12
48
 
13
49
def run():
    """Entry point: build the source from the CLI arguments and launch it.

    The arguments are everything after the program name in ``sys.argv``.
    Nothing is launched when ``_get_source`` returns a falsy value.
    """
    cli_args = sys.argv[1:]
    source = _get_source(cli_args)
    if not source:
        return
    launch(source, cli_args)