airbyte-source-google-search-console 1.5.18__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {airbyte_source_google_search_console-1.5.18.dist-info → airbyte_source_google_search_console-1.6.0.dist-info}/METADATA +1 -1
- {airbyte_source_google_search_console-1.5.18.dist-info → airbyte_source_google_search_console-1.6.0.dist-info}/RECORD +10 -11
- source_google_search_console/components.py +81 -0
- source_google_search_console/manifest.yaml +341 -0
- source_google_search_console/run.py +47 -9
- source_google_search_console/service_account_authenticator.py +3 -1
- source_google_search_console/source.py +25 -46
- source_google_search_console/streams.py +0 -41
- source_google_search_console/schemas/search_analytics_by_country.json +0 -41
- source_google_search_console/schemas/sitemaps.json +0 -61
- source_google_search_console/schemas/sites.json +0 -14
- {airbyte_source_google_search_console-1.5.18.dist-info → airbyte_source_google_search_console-1.6.0.dist-info}/WHEEL +0 -0
- {airbyte_source_google_search_console-1.5.18.dist-info → airbyte_source_google_search_console-1.6.0.dist-info}/entry_points.txt +0 -0
@@ -1,9 +1,10 @@
|
|
1
1
|
source_google_search_console/__init__.py,sha256=HQCPu-CK7XmVDtP9rmTdB2XyraVCc6pv9pw38-O8y48,1191
|
2
|
+
source_google_search_console/components.py,sha256=5o8kH2xwYUvk3yjnSd6okJVF2KBUqUIF4V97xRpPAyI,2803
|
2
3
|
source_google_search_console/config_migrations.py,sha256=Cl4SUdJpAf6wMM_vVhqjjU89NfUq9LIGJ9zNrWiBk-A,4235
|
3
4
|
source_google_search_console/exceptions.py,sha256=iD3jYC4WxVCEKGsqQ7Vaj1tbjhJZ4S5mnSDnwFJdsIQ,1097
|
4
|
-
source_google_search_console/
|
5
|
+
source_google_search_console/manifest.yaml,sha256=zoPWyJ7gwOOK6yKAk4H92cD-N7Xg3iHtp5hTT_e80QA,10974
|
6
|
+
source_google_search_console/run.py,sha256=TBkPlseTERarkj6wL8AMEKgm5Xsb2drnltPVH6257-M,2195
|
5
7
|
source_google_search_console/schemas/search_analytics_all_fields.json,sha256=iQxRh_c_yz3uGofqpo1KX571TMmzYjKScb0PtI6SN_Q,1729
|
6
|
-
source_google_search_console/schemas/search_analytics_by_country.json,sha256=xvUVjGRy63dsc7c0O-Kg7DUzybRpD-r_-VYhPmDBw_o,1491
|
7
8
|
source_google_search_console/schemas/search_analytics_by_date.json,sha256=meCbWDayc1y0q-Lu-CAdjQVnsM8xZBX3BdF129UC1P8,1388
|
8
9
|
source_google_search_console/schemas/search_analytics_by_device.json,sha256=VtoFjmmv9rx-uhSFaRn0wm4LeSxRIaexrxg2Spvbneo,1525
|
9
10
|
source_google_search_console/schemas/search_analytics_by_page.json,sha256=KyUojZc4Lv3hPswxIJzUL5QDNsbvSugGjl_uHGF7Am4,1473
|
@@ -14,13 +15,11 @@ source_google_search_console/schemas/search_analytics_keyword_site_report_by_sit
|
|
14
15
|
source_google_search_console/schemas/search_analytics_page_report.json,sha256=-b0Y0LenTchS0q9A2aQ4hIjUjXkYF8erOtyrTMhf6MM,1776
|
15
16
|
source_google_search_console/schemas/search_analytics_site_report_by_page.json,sha256=hWKHkm1reqGGu1dNcWBe6_XkZ5tK-UaiymrYRVgxRxI,1515
|
16
17
|
source_google_search_console/schemas/search_analytics_site_report_by_site.json,sha256=rAh6LuNy7nCrrNM9MTd0qxAVc886ecQaqWRgV63OfyA,1408
|
17
|
-
source_google_search_console/
|
18
|
-
source_google_search_console/
|
19
|
-
source_google_search_console/service_account_authenticator.py,sha256=gjUxt0xFxj82uviCQNTsA1Jlee__UDhYNjE7bRO1G0U,1227
|
20
|
-
source_google_search_console/source.py,sha256=2OpJDWhMNoRj9Q4p6Br3DMOrphqDb_6Ca0RWGAvtnrQ,10010
|
18
|
+
source_google_search_console/service_account_authenticator.py,sha256=pAWKAXfwfTY3xkXvQJH0EyFphFULdCIcC47YXYTO9X8,1307
|
19
|
+
source_google_search_console/source.py,sha256=7FD4ciRrsptU7ZIxAU2xLC37bgjKWzNkflE9ybmgpXM,9113
|
21
20
|
source_google_search_console/spec.json,sha256=WYtFvaSqWYGm1Dt2yV9G92U78Q94rh9oarbxJe3H7xo,8470
|
22
|
-
source_google_search_console/streams.py,sha256=
|
23
|
-
airbyte_source_google_search_console-1.
|
24
|
-
airbyte_source_google_search_console-1.
|
25
|
-
airbyte_source_google_search_console-1.
|
26
|
-
airbyte_source_google_search_console-1.
|
21
|
+
source_google_search_console/streams.py,sha256=T0eqhmxGPDAfFNMXFfG_vM4aYHFaHAHHAoj6s5XnjnI,18760
|
22
|
+
airbyte_source_google_search_console-1.6.0.dist-info/METADATA,sha256=QenlO6qMNtEw1Oe63N-tXiUZ-ky5qydJ-99oavTF3Ew,5621
|
23
|
+
airbyte_source_google_search_console-1.6.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
24
|
+
airbyte_source_google_search_console-1.6.0.dist-info/entry_points.txt,sha256=DMcgc9bCX-Vt6hm_68pa77qS3eGdeMhg-UdlFc-XKUM,85
|
25
|
+
airbyte_source_google_search_console-1.6.0.dist-info/RECORD,,
|
@@ -0,0 +1,81 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
from dataclasses import dataclass
|
6
|
+
from typing import Any, Mapping, Optional
|
7
|
+
|
8
|
+
from airbyte_cdk.sources.declarative.migrations.state_migration import StateMigration
|
9
|
+
|
10
|
+
|
11
|
+
@dataclass
class NestedSubstreamStateMigration(StateMigration):
    """
    Convert legacy SearchAnalytics stream state into per-partition state.

    We require a custom state migration because SearchAnalytics streams contain two nested levels of
    substreams. The existing LegacyToPerPartitionStateMigration only handles one level.

    Legacy state format is as follows:
    {
        "date": "2025-05-28",
        "https://www.example.com/": {
            "web": {
                "date": "2025-05-25"
            },
            "news": {
                "date": "2023-05-22"
            }
        }
    }

    The resulting migrated per-partition state is:
    {
        "use_global_cursor": false,
        "states": [
            {
                "partition": {
                    "site_url": "https://www.example.com/",
                    "search_type": "web"
                },
                "cursor": {
                    "date": "2025-05-25"
                }
            },
            {
                "partition": {
                    "site_url": "https://www.example.com/",
                    "search_type": "news"
                },
                "cursor": {
                    "date": "2023-05-22"
                }
            }
        ]
    }

    No top-level "state" entry is emitted: the legacy global `date` value is deliberately
    dropped (see the comment in `migrate`).
    """

    def should_migrate(self, stream_state: Mapping[str, Any]) -> bool:
        """
        Return True when `stream_state` is non-empty and not yet in per-partition form.

        A state that already has a `states` key is treated as migrated. Empty or missing
        (`None`) state needs no migration.
        """
        return bool(stream_state) and "states" not in stream_state

    def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]:
        """
        Flatten the nested `site_url -> search_type -> cursor` legacy state.

        :param stream_state: legacy state keyed by site URL, plus an optional global `date` key
        :return: mapping with `use_global_cursor` set to False and one `states` entry per
            (site_url, search_type) pair found in the legacy state
        """
        per_partition_state = []
        for site_url, search_type_state in stream_state.items():
            if site_url == "date":
                # The legacy state also contains a global cursor value under the `date` key which equates
                # to global state.
                #
                # However, the Python implementation does not appear to be implemented
                # correctly and simply saves the state of the last seen partition. Since that legacy
                # value cannot be trusted, and in the current implementation global state is applied to
                # partitions without an existing value, the global value is intentionally not migrated.
                continue
            if not isinstance(search_type_state, Mapping):
                # Anything other than a `search_type -> cursor` mapping does not match the legacy
                # format and cannot be turned into a partition, so skip it instead of crashing.
                continue
            for search_type, cursor in search_type_state.items():
                per_partition_state.append(
                    {
                        "partition": {"site_url": site_url, "search_type": search_type},
                        "cursor": cursor,
                    }
                )
        return {
            "use_global_cursor": False,
            "states": per_partition_state,
        }
|
@@ -0,0 +1,341 @@
|
|
1
|
+
version: 6.44.0
|
2
|
+
|
3
|
+
type: DeclarativeSource
|
4
|
+
|
5
|
+
check:
|
6
|
+
type: CheckStream
|
7
|
+
stream_names:
|
8
|
+
- sites
|
9
|
+
|
10
|
+
definitions:
|
11
|
+
oauth_authenticator:
|
12
|
+
type: OAuthAuthenticator
|
13
|
+
client_id: "{{ config.get('authorization', {}).get('client_id') }}"
|
14
|
+
client_secret: "{{ config.get('authorization', {}).get('client_secret') }}"
|
15
|
+
refresh_token: "{{ config.get('authorization', {}).get('refresh_token') }}"
|
16
|
+
token_refresh_endpoint: "https://oauth2.googleapis.com/token"
|
17
|
+
|
18
|
+
jwt_profile_assertion_oauth_authenticator:
|
19
|
+
type: OAuthAuthenticator
|
20
|
+
token_refresh_endpoint: https://oauth2.googleapis.com/token
|
21
|
+
refresh_request_headers:
|
22
|
+
Content-Type: application/x-www-form-urlencoded
|
23
|
+
use_profile_assertion: true
|
24
|
+
profile_assertion:
|
25
|
+
type: JwtAuthenticator
|
26
|
+
secret_key: "{{ json_loads(config.get('authorization', {}).get('service_account_info', {})).get('private_key') }}"
|
27
|
+
algorithm: "RS256"
|
28
|
+
token_duration: 3600
|
29
|
+
jwt_payload:
|
30
|
+
aud: "{{ json_loads(config.get('authorization', {}).get('service_account_info', {})).get('token_uri') }}"
|
31
|
+
iss: "{{ json_loads(config.get('authorization', {}).get('service_account_info', {})).get('client_email') }}"
|
32
|
+
additional_jwt_payload:
|
33
|
+
scope: "https://www.googleapis.com/auth/webmasters.readonly"
|
34
|
+
|
35
|
+
selective_authenticator:
|
36
|
+
type: SelectiveAuthenticator
|
37
|
+
authenticator_selection_path: ["authorization", "auth_type"]
|
38
|
+
authenticators:
|
39
|
+
Client: "#/definitions/oauth_authenticator"
|
40
|
+
Service: "#/definitions/jwt_profile_assertion_oauth_authenticator"
|
41
|
+
|
42
|
+
search_analytics_by_country_stream:
|
43
|
+
type: DeclarativeStream
|
44
|
+
name: search_analytics_by_country
|
45
|
+
primary_key:
|
46
|
+
- site_url
|
47
|
+
- date
|
48
|
+
- country
|
49
|
+
- search_type
|
50
|
+
retriever:
|
51
|
+
type: SimpleRetriever
|
52
|
+
requester:
|
53
|
+
type: HttpRequester
|
54
|
+
url_base: https://www.googleapis.com/webmasters/v3
|
55
|
+
path: "/sites/{{ sanitize_url(stream_partition.get('site_url')) }}/searchAnalytics/query"
|
56
|
+
http_method: POST
|
57
|
+
authenticator: "#/definitions/selective_authenticator"
|
58
|
+
request_headers:
|
59
|
+
Content-Type: "application/json"
|
60
|
+
request_body_json:
|
61
|
+
startDate: "{{ stream_interval.get('start_time') }}"
|
62
|
+
endDate: "{{ stream_interval.get('end_time') }}"
|
63
|
+
dimensions: ["date", "country"]
|
64
|
+
type: "{{ stream_partition.get('search_type') }}"
|
65
|
+
aggregationType: auto
|
66
|
+
dataState: "{{ config.get('data_state', 'final') }}"
|
67
|
+
# Currently relying on the default error handler behavior. Two pieces of functionality not covered are
|
68
|
+
# - Silently skipping over 403 permissions errors and relying on partial success reporting
|
69
|
+
# - Retrying 400 errors with aggregation_type=auto instead of failing outright
|
70
|
+
paginator:
|
71
|
+
type: DefaultPaginator
|
72
|
+
page_token_option:
|
73
|
+
type: RequestOption
|
74
|
+
field_name: startRow
|
75
|
+
inject_into: body_json
|
76
|
+
page_size_option:
|
77
|
+
type: RequestOption
|
78
|
+
field_name: rowLimit
|
79
|
+
inject_into: body_json
|
80
|
+
pagination_strategy:
|
81
|
+
type: OffsetIncrement
|
82
|
+
page_size: 25000
|
83
|
+
inject_on_first_request: true
|
84
|
+
record_selector:
|
85
|
+
type: RecordSelector
|
86
|
+
extractor:
|
87
|
+
type: DpathExtractor
|
88
|
+
field_path:
|
89
|
+
- rows
|
90
|
+
partition_router:
|
91
|
+
- type: ListPartitionRouter
|
92
|
+
values: "{{ config['site_urls'] }}"
|
93
|
+
cursor_field: site_url
|
94
|
+
- type: ListPartitionRouter
|
95
|
+
values:
|
96
|
+
- web
|
97
|
+
- news
|
98
|
+
- image
|
99
|
+
- video
|
100
|
+
- discover
|
101
|
+
- googleNews
|
102
|
+
cursor_field: search_type
|
103
|
+
incremental_sync:
|
104
|
+
type: DatetimeBasedCursor
|
105
|
+
cursor_field: date
|
106
|
+
cursor_datetime_formats:
|
107
|
+
- "%Y-%m-%d"
|
108
|
+
datetime_format: "%Y-%m-%d"
|
109
|
+
start_datetime:
|
110
|
+
type: MinMaxDatetime
|
111
|
+
datetime: "{{ config.get('start_date', '2021-01-01') }}"
|
112
|
+
datetime_format: "%Y-%m-%d"
|
113
|
+
end_datetime:
|
114
|
+
type: MinMaxDatetime
|
115
|
+
datetime: "{{ config.get('end_date', today_utc()) }}"
|
116
|
+
datetime_format: "%Y-%m-%d"
|
117
|
+
step: P3D
|
118
|
+
cursor_granularity: P1D
|
119
|
+
transformations:
|
120
|
+
- type: AddFields
|
121
|
+
fields:
|
122
|
+
- path:
|
123
|
+
- site_url
|
124
|
+
value: "{{ stream_partition['site_url'] }}"
|
125
|
+
- path:
|
126
|
+
- search_type
|
127
|
+
value: "{{ stream_partition['search_type'] }}"
|
128
|
+
# The values in the 'keys' array in the record correspond to the same order that the dimensions
|
129
|
+
# are requested in the API request. For example, if the request body was `dimensions: ["date", "country"]`,
|
130
|
+
# then the first value of `keys` is placed under the `date` field. These arrays are always the same length.
|
131
|
+
# After extracting the keys, the `keys` array is removed from the record.
|
132
|
+
- type: AddFields
|
133
|
+
fields:
|
134
|
+
- path:
|
135
|
+
- date
|
136
|
+
value: "{{ record['keys'][0] }}"
|
137
|
+
- path:
|
138
|
+
- country
|
139
|
+
value: "{{ record['keys'][1] }}"
|
140
|
+
- type: RemoveFields
|
141
|
+
field_pointers:
|
142
|
+
- - keys
|
143
|
+
schema_loader:
|
144
|
+
type: InlineSchemaLoader
|
145
|
+
schema:
|
146
|
+
$ref: "#/schemas/search_analytics_by_country"
|
147
|
+
state_migrations:
|
148
|
+
- type: CustomStateMigration
|
149
|
+
class_name: source_google_search_console.components.NestedSubstreamStateMigration
|
150
|
+
|
151
|
+
sites_stream:
|
152
|
+
type: DeclarativeStream
|
153
|
+
name: sites
|
154
|
+
retriever:
|
155
|
+
type: SimpleRetriever
|
156
|
+
requester:
|
157
|
+
type: HttpRequester
|
158
|
+
url_base: https://www.googleapis.com/webmasters/v3
|
159
|
+
path: "/sites/{{ sanitize_url(stream_partition.get('site_url')) }}"
|
160
|
+
http_method: GET
|
161
|
+
authenticator: "#/definitions/selective_authenticator"
|
162
|
+
record_selector:
|
163
|
+
type: RecordSelector
|
164
|
+
extractor:
|
165
|
+
type: DpathExtractor
|
166
|
+
field_path: []
|
167
|
+
partition_router:
|
168
|
+
- type: ListPartitionRouter
|
169
|
+
values: "{{ config['site_urls'] }}"
|
170
|
+
cursor_field: site_url
|
171
|
+
schema_loader:
|
172
|
+
type: InlineSchemaLoader
|
173
|
+
schema:
|
174
|
+
$ref: "#/schemas/sites"
|
175
|
+
|
176
|
+
sitemaps_stream:
|
177
|
+
type: DeclarativeStream
|
178
|
+
name: sitemaps
|
179
|
+
retriever:
|
180
|
+
type: SimpleRetriever
|
181
|
+
requester:
|
182
|
+
type: HttpRequester
|
183
|
+
url_base: https://www.googleapis.com/webmasters/v3
|
184
|
+
path: "/sites/{{ sanitize_url(stream_partition.get('site_url')) }}/sitemaps"
|
185
|
+
http_method: GET
|
186
|
+
authenticator: "#/definitions/selective_authenticator"
|
187
|
+
record_selector:
|
188
|
+
type: RecordSelector
|
189
|
+
extractor:
|
190
|
+
type: DpathExtractor
|
191
|
+
field_path:
|
192
|
+
- "sitemap"
|
193
|
+
partition_router:
|
194
|
+
- type: ListPartitionRouter
|
195
|
+
values: "{{ config['site_urls'] }}"
|
196
|
+
cursor_field: site_url
|
197
|
+
schema_loader:
|
198
|
+
type: InlineSchemaLoader
|
199
|
+
schema:
|
200
|
+
$ref: "#/schemas/sitemaps"
|
201
|
+
|
202
|
+
streams:
|
203
|
+
- "#/definitions/search_analytics_by_country_stream"
|
204
|
+
- "#/definitions/sites_stream"
|
205
|
+
- "#/definitions/sitemaps_stream"
|
206
|
+
|
207
|
+
schemas:
|
208
|
+
search_analytics_by_country:
|
209
|
+
$schema: "http://json-schema.org/draft-07/schema#"
|
210
|
+
type: object
|
211
|
+
properties:
|
212
|
+
site_url:
|
213
|
+
description: The URL of the site for which the search analytics data is being reported.
|
214
|
+
type:
|
215
|
+
- "null"
|
216
|
+
- string
|
217
|
+
search_type:
|
218
|
+
description: >-
|
219
|
+
The type of search (web search, image search, video search, etc.) for
|
220
|
+
which the data is being reported.
|
221
|
+
type:
|
222
|
+
- "null"
|
223
|
+
- string
|
224
|
+
date:
|
225
|
+
description: The date for which the search analytics data is being reported.
|
226
|
+
type:
|
227
|
+
- "null"
|
228
|
+
- string
|
229
|
+
format: date
|
230
|
+
country:
|
231
|
+
description: The country for which the search analytics data is being reported.
|
232
|
+
type:
|
233
|
+
- "null"
|
234
|
+
- string
|
235
|
+
clicks:
|
236
|
+
description: >-
|
237
|
+
The number of times users clicked on the search result for a specific
|
238
|
+
country.
|
239
|
+
type:
|
240
|
+
- "null"
|
241
|
+
- integer
|
242
|
+
impressions:
|
243
|
+
description: >-
|
244
|
+
The total number of times a search result was shown in search results for
|
245
|
+
a specific country.
|
246
|
+
type:
|
247
|
+
- "null"
|
248
|
+
- integer
|
249
|
+
ctr:
|
250
|
+
description: >-
|
251
|
+
The click-through rate, i.e., the ratio of clicks to impressions for a
|
252
|
+
specific country.
|
253
|
+
type:
|
254
|
+
- "null"
|
255
|
+
- number
|
256
|
+
multipleOf: 1.e-25
|
257
|
+
position:
|
258
|
+
description: >-
|
259
|
+
The average position at which the site's search result appeared for a
|
260
|
+
specific country.
|
261
|
+
type:
|
262
|
+
- "null"
|
263
|
+
- number
|
264
|
+
multipleOf: 1.e-25
|
265
|
+
sites:
|
266
|
+
$schema: "http://json-schema.org/draft-07/schema#"
|
267
|
+
type: object
|
268
|
+
properties:
|
269
|
+
siteUrl:
|
270
|
+
description: "The URL of the site data being fetched"
|
271
|
+
type: ["null", "string"]
|
272
|
+
permissionLevel:
|
273
|
+
description: "The user's permission level for the site (owner, full, restricted, etc.)"
|
274
|
+
type: ["null", "string"]
|
275
|
+
sitemaps:
|
276
|
+
$schema: "http://json-schema.org/draft-07/schema#"
|
277
|
+
type: object
|
278
|
+
properties:
|
279
|
+
path:
|
280
|
+
description: "Path to the sitemap file"
|
281
|
+
type:
|
282
|
+
- "null"
|
283
|
+
- string
|
284
|
+
lastSubmitted:
|
285
|
+
description: "Timestamp when the sitemap was last submitted"
|
286
|
+
type:
|
287
|
+
- "null"
|
288
|
+
- string
|
289
|
+
format: date-time
|
290
|
+
isPending:
|
291
|
+
description: "Flag indicating if the sitemap is pending for processing"
|
292
|
+
type:
|
293
|
+
- "null"
|
294
|
+
- boolean
|
295
|
+
isSitemapsIndex:
|
296
|
+
description: "Flag indicating if the data represents a sitemap index"
|
297
|
+
type:
|
298
|
+
- "null"
|
299
|
+
- boolean
|
300
|
+
type:
|
301
|
+
description: "Type of the sitemap"
|
302
|
+
type:
|
303
|
+
- "null"
|
304
|
+
- string
|
305
|
+
lastDownloaded:
|
306
|
+
description: "Timestamp when the sitemap was last downloaded"
|
307
|
+
type:
|
308
|
+
- "null"
|
309
|
+
- string
|
310
|
+
format: date-time
|
311
|
+
warnings:
|
312
|
+
description: "Warnings encountered while processing the sitemaps"
|
313
|
+
type:
|
314
|
+
- "null"
|
315
|
+
- string
|
316
|
+
errors:
|
317
|
+
description: "Errors encountered while processing the sitemaps"
|
318
|
+
type:
|
319
|
+
- "null"
|
320
|
+
- string
|
321
|
+
contents:
|
322
|
+
description: "Data related to the sitemap contents"
|
323
|
+
type: array
|
324
|
+
items:
|
325
|
+
type: object
|
326
|
+
properties:
|
327
|
+
type:
|
328
|
+
description: "Type of the sitemap content"
|
329
|
+
type:
|
330
|
+
- "null"
|
331
|
+
- string
|
332
|
+
submitted:
|
333
|
+
description: "Number of submitted sitemap URLs"
|
334
|
+
type:
|
335
|
+
- "null"
|
336
|
+
- string
|
337
|
+
indexed:
|
338
|
+
description: "Number of indexed sitemap URLs"
|
339
|
+
type:
|
340
|
+
- "null"
|
341
|
+
- string
|
@@ -1,18 +1,56 @@
|
|
1
1
|
#
|
2
|
-
# Copyright (c)
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
-
|
6
5
|
import sys
|
6
|
+
import traceback
|
7
|
+
from datetime import datetime
|
8
|
+
from typing import List
|
9
|
+
|
10
|
+
from orjson import orjson
|
7
11
|
|
8
|
-
from airbyte_cdk.entrypoint import launch
|
12
|
+
from airbyte_cdk.entrypoint import AirbyteEntrypoint, launch, logger
|
13
|
+
from airbyte_cdk.exception_handler import init_uncaught_exception_handler
|
14
|
+
from airbyte_cdk.models import AirbyteErrorTraceMessage, AirbyteMessage, AirbyteMessageSerializer, AirbyteTraceMessage, TraceType, Type
|
9
15
|
from source_google_search_console import SourceGoogleSearchConsole
|
10
16
|
from source_google_search_console.config_migrations import MigrateCustomReports
|
11
17
|
|
12
18
|
|
13
|
-
def
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
+
def _get_source(args: List[str]):
    """
    Build the connector source from the command-line arguments.

    The catalog, config and state paths are extracted from `args`; each file is read only
    when its path is present, otherwise `None` is passed to the source constructor.

    :param args: command-line arguments, without the program name
    :return: the constructed `SourceGoogleSearchConsole`, or `None` when reading the inputs
        or constructing the source raised. In that case an Airbyte TRACE/ERROR message
        is printed as JSON before returning.
    """
    catalog_path = AirbyteEntrypoint.extract_catalog(args)
    config_path = AirbyteEntrypoint.extract_config(args)
    state_path = AirbyteEntrypoint.extract_state(args)
    try:
        catalog = SourceGoogleSearchConsole.read_catalog(catalog_path) if catalog_path else None
        config = SourceGoogleSearchConsole.read_config(config_path) if config_path else None
        state = SourceGoogleSearchConsole.read_state(state_path) if state_path else None
        return SourceGoogleSearchConsole(catalog, config, state)
    except Exception as error:
        # Top-level boundary: report the failure as a trace message rather than a raw traceback.
        emitted_at_ms = int(datetime.now().timestamp() * 1000)
        error_details = AirbyteErrorTraceMessage(
            message=f"Error starting the sync. This could be due to an invalid configuration or catalog. Please contact Support for assistance. Error: {error}",
            stack_trace=traceback.format_exc(),
        )
        trace = AirbyteTraceMessage(type=TraceType.ERROR, emitted_at=emitted_at_ms, error=error_details)
        message = AirbyteMessage(type=Type.TRACE, trace=trace)
        serialized = orjson.dumps(AirbyteMessageSerializer.dump(message)).decode()
        print(serialized)
        return None
|
48
|
+
|
49
|
+
|
50
|
+
def run() -> None:
    """
    Entry point of the connector.

    Installs the uncaught-exception handler, builds the source from the command-line
    arguments, runs the custom-reports config migration and launches the source.
    Nothing is launched when the source could not be built.
    """
    init_uncaught_exception_handler(logger)
    _args = sys.argv[1:]
    source = _get_source(_args)
    if not source:
        # _get_source has already printed the error trace message.
        return
    MigrateCustomReports.migrate(sys.argv[1:], source)
    launch(source, _args)
|
@@ -1,6 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
|
+
import json
|
4
5
|
|
5
6
|
import requests
|
6
7
|
from google.auth.transport.requests import Request
|
@@ -21,7 +22,8 @@ class ServiceAccountAuthenticator(AuthBase):
|
|
21
22
|
|
22
23
|
def __call__(self, request: requests.PreparedRequest) -> requests.PreparedRequest:
|
23
24
|
try:
|
24
|
-
|
25
|
+
service_account_info = json.loads(self.service_account_info)
|
26
|
+
credentials: Credentials = Credentials.from_service_account_info(service_account_info, scopes=self.scopes).with_subject(
|
25
27
|
self.email
|
26
28
|
)
|
27
29
|
if not credentials.valid:
|
@@ -11,8 +11,9 @@ import jsonschema
|
|
11
11
|
import pendulum
|
12
12
|
import requests
|
13
13
|
|
14
|
-
from airbyte_cdk.models import FailureType, SyncMode
|
15
|
-
from airbyte_cdk.sources import
|
14
|
+
from airbyte_cdk.models import ConfiguredAirbyteCatalog, FailureType, SyncMode
|
15
|
+
from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource
|
16
|
+
from airbyte_cdk.sources.source import TState
|
16
17
|
from airbyte_cdk.sources.streams import Stream
|
17
18
|
from airbyte_cdk.sources.streams.http.requests_native_auth import Oauth2Authenticator
|
18
19
|
from airbyte_cdk.utils import AirbyteTracedException
|
@@ -25,7 +26,6 @@ from source_google_search_console.exceptions import (
|
|
25
26
|
from source_google_search_console.service_account_authenticator import ServiceAccountAuthenticator
|
26
27
|
from source_google_search_console.streams import (
|
27
28
|
SearchAnalyticsAllFields,
|
28
|
-
SearchAnalyticsByCountry,
|
29
29
|
SearchAnalyticsByCustomDimensions,
|
30
30
|
SearchAnalyticsByDate,
|
31
31
|
SearchAnalyticsByDevice,
|
@@ -37,8 +37,6 @@ from source_google_search_console.streams import (
|
|
37
37
|
SearchAnalyticsPageReport,
|
38
38
|
SearchAnalyticsSiteReportByPage,
|
39
39
|
SearchAnalyticsSiteReportBySite,
|
40
|
-
Sitemaps,
|
41
|
-
Sites,
|
42
40
|
)
|
43
41
|
|
44
42
|
|
@@ -55,7 +53,10 @@ custom_reports_schema = {
|
|
55
53
|
}
|
56
54
|
|
57
55
|
|
58
|
-
class SourceGoogleSearchConsole(
|
56
|
+
class SourceGoogleSearchConsole(YamlDeclarativeSource):
|
57
|
+
def __init__(self, catalog: Optional[ConfiguredAirbyteCatalog], config: Optional[Mapping[str, Any]], state: TState, **kwargs):
    """
    Initialise the declarative source from the packaged `manifest.yaml`.

    `catalog`, `config` and `state` are forwarded to the parent constructor.
    Extra `**kwargs` are accepted but not forwarded.
    """
    super().__init__(catalog=catalog, config=config, state=state, path_to_yaml="manifest.yaml")
|
59
|
+
|
59
60
|
@staticmethod
|
60
61
|
def normalize_url(url):
|
61
62
|
parse_result = urlparse(url)
|
@@ -68,7 +69,7 @@ class SourceGoogleSearchConsole(AbstractSource):
|
|
68
69
|
authorization = config["authorization"]
|
69
70
|
if authorization["auth_type"] == "Service":
|
70
71
|
try:
|
71
|
-
|
72
|
+
json.loads(authorization["service_account_info"])
|
72
73
|
except ValueError:
|
73
74
|
message = "authorization.service_account_info is not valid JSON"
|
74
75
|
raise AirbyteTracedException(message=message, internal_message=message, failure_type=FailureType.config_error)
|
@@ -110,29 +111,6 @@ class SourceGoogleSearchConsole(AbstractSource):
|
|
110
111
|
raise AirbyteTracedException(message=message, internal_message=message, failure_type=FailureType.config_error)
|
111
112
|
return config
|
112
113
|
|
113
|
-
def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Any]:
|
114
|
-
try:
|
115
|
-
config = self._validate_and_transform(config)
|
116
|
-
stream_kwargs = self.get_stream_kwargs(config)
|
117
|
-
self.validate_site_urls(config["site_urls"], stream_kwargs["authenticator"])
|
118
|
-
sites = Sites(**stream_kwargs)
|
119
|
-
stream_slice = sites.stream_slices(SyncMode.full_refresh)
|
120
|
-
|
121
|
-
# stream_slice returns all site_urls and we need to make sure that
|
122
|
-
# the connection is successful for all of them
|
123
|
-
for _slice in stream_slice:
|
124
|
-
sites_gen = sites.read_records(sync_mode=SyncMode.full_refresh, stream_slice=_slice)
|
125
|
-
next(sites_gen)
|
126
|
-
return True, None
|
127
|
-
|
128
|
-
except (InvalidSiteURLValidationError, UnauthorizedOauthError, UnauthorizedServiceAccountError, jsonschema.ValidationError) as e:
|
129
|
-
return False, repr(e)
|
130
|
-
except (Exception, UnidentifiedError) as error:
|
131
|
-
return (
|
132
|
-
False,
|
133
|
-
f"Unable to check connectivity to Google Search Console API - {repr(error)}",
|
134
|
-
)
|
135
|
-
|
136
114
|
def validate_site_urls(self, site_urls: List[str], auth: Union[ServiceAccountAuthenticator, Oauth2Authenticator]):
|
137
115
|
if isinstance(auth, ServiceAccountAuthenticator):
|
138
116
|
request = auth(requests.Request(method="GET", url="https://www.googleapis.com/webmasters/v3/sites"))
|
@@ -172,22 +150,23 @@ class SourceGoogleSearchConsole(AbstractSource):
|
|
172
150
|
config = self._validate_and_transform(config)
|
173
151
|
stream_config = self.get_stream_kwargs(config)
|
174
152
|
|
175
|
-
streams =
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
153
|
+
streams = super().streams(config=config)
|
154
|
+
|
155
|
+
streams.extend(
|
156
|
+
[
|
157
|
+
SearchAnalyticsByDevice(**stream_config),
|
158
|
+
SearchAnalyticsByDate(**stream_config),
|
159
|
+
SearchAnalyticsByQuery(**stream_config),
|
160
|
+
SearchAnalyticsByPage(**stream_config),
|
161
|
+
SearchAnalyticsAllFields(**stream_config),
|
162
|
+
SearchAnalyticsKeywordPageReport(**stream_config),
|
163
|
+
SearchAnalyticsPageReport(**stream_config),
|
164
|
+
SearchAnalyticsSiteReportBySite(**stream_config),
|
165
|
+
SearchAnalyticsSiteReportByPage(**stream_config),
|
166
|
+
SearchAnalyticsKeywordSiteReportByPage(**stream_config),
|
167
|
+
SearchAnalyticsKeywordSiteReportBySite(**stream_config),
|
168
|
+
]
|
169
|
+
)
|
191
170
|
|
192
171
|
streams = streams + self.get_custom_reports(config=config, stream_config=stream_config)
|
193
172
|
|
@@ -84,41 +84,6 @@ class GoogleSearchConsole(HttpStream, ABC):
|
|
84
84
|
return response.status_code == 429 or 500 <= response.status_code < 600
|
85
85
|
|
86
86
|
|
87
|
-
class Sites(GoogleSearchConsole):
|
88
|
-
"""
|
89
|
-
API docs: https://developers.google.com/webmaster-tools/search-console-api-original/v3/sites
|
90
|
-
"""
|
91
|
-
|
92
|
-
primary_key = None
|
93
|
-
is_resumable = False
|
94
|
-
|
95
|
-
def path(
|
96
|
-
self,
|
97
|
-
stream_state: Mapping[str, Any] = None,
|
98
|
-
stream_slice: Mapping[str, Any] = None,
|
99
|
-
next_page_token: Mapping[str, Any] = None,
|
100
|
-
) -> str:
|
101
|
-
return f"sites/{stream_slice.get('site_url')}"
|
102
|
-
|
103
|
-
|
104
|
-
class Sitemaps(GoogleSearchConsole):
|
105
|
-
"""
|
106
|
-
API docs: https://developers.google.com/webmaster-tools/search-console-api-original/v3/sitemaps
|
107
|
-
"""
|
108
|
-
|
109
|
-
primary_key = None
|
110
|
-
data_field = "sitemap"
|
111
|
-
is_resumable = False
|
112
|
-
|
113
|
-
def path(
|
114
|
-
self,
|
115
|
-
stream_state: Mapping[str, Any] = None,
|
116
|
-
stream_slice: Mapping[str, Any] = None,
|
117
|
-
next_page_token: Mapping[str, Any] = None,
|
118
|
-
) -> str:
|
119
|
-
return f"sites/{stream_slice.get('site_url')}/sitemaps"
|
120
|
-
|
121
|
-
|
122
87
|
class SearchAnalytics(GoogleSearchConsole, CheckpointMixin, ABC):
|
123
88
|
"""
|
124
89
|
API docs: https://developers.google.com/webmaster-tools/search-console-api-original/v3/searchanalytics
|
@@ -335,12 +300,6 @@ class SearchAnalyticsByDate(SearchAnalytics):
|
|
335
300
|
dimensions = ["date"]
|
336
301
|
|
337
302
|
|
338
|
-
class SearchAnalyticsByCountry(SearchAnalytics):
|
339
|
-
primary_key = ["site_url", "date", "country", "search_type"]
|
340
|
-
search_types = ["web", "news", "image", "video", "discover", "googleNews"]
|
341
|
-
dimensions = ["date", "country"]
|
342
|
-
|
343
|
-
|
344
303
|
class SearchAnalyticsByDevice(SearchAnalytics):
|
345
304
|
primary_key = ["site_url", "date", "device", "search_type"]
|
346
305
|
search_types = ["web", "news", "image", "video", "googleNews"]
|
@@ -1,41 +0,0 @@
|
|
1
|
-
{
|
2
|
-
"$schema": "http://json-schema.org/draft-07/schema#",
|
3
|
-
"type": "object",
|
4
|
-
"properties": {
|
5
|
-
"site_url": {
|
6
|
-
"description": "The URL of the site for which the search analytics data is being reported.",
|
7
|
-
"type": ["null", "string"]
|
8
|
-
},
|
9
|
-
"search_type": {
|
10
|
-
"description": "The type of search (web search, image search, video search, etc.) for which the data is being reported.",
|
11
|
-
"type": ["null", "string"]
|
12
|
-
},
|
13
|
-
"date": {
|
14
|
-
"description": "The date for which the search analytics data is being reported.",
|
15
|
-
"type": ["null", "string"],
|
16
|
-
"format": "date"
|
17
|
-
},
|
18
|
-
"country": {
|
19
|
-
"description": "The country for which the search analytics data is being reported.",
|
20
|
-
"type": ["null", "string"]
|
21
|
-
},
|
22
|
-
"clicks": {
|
23
|
-
"description": "The number of times users clicked on the search result for a specific country.",
|
24
|
-
"type": ["null", "integer"]
|
25
|
-
},
|
26
|
-
"impressions": {
|
27
|
-
"description": "The total number of times a search result was shown in search results for a specific country.",
|
28
|
-
"type": ["null", "integer"]
|
29
|
-
},
|
30
|
-
"ctr": {
|
31
|
-
"description": "The click-through rate, i.e., the ratio of clicks to impressions for a specific country.",
|
32
|
-
"type": ["null", "number"],
|
33
|
-
"multipleOf": 1e-25
|
34
|
-
},
|
35
|
-
"position": {
|
36
|
-
"description": "The average position at which the site's search result appeared for a specific country.",
|
37
|
-
"type": ["null", "number"],
|
38
|
-
"multipleOf": 1e-25
|
39
|
-
}
|
40
|
-
}
|
41
|
-
}
|
@@ -1,61 +0,0 @@
|
|
1
|
-
{
|
2
|
-
"$schema": "http://json-schema.org/draft-07/schema#",
|
3
|
-
"type": "object",
|
4
|
-
"properties": {
|
5
|
-
"path": {
|
6
|
-
"description": "Path to the sitemap file",
|
7
|
-
"type": ["null", "string"]
|
8
|
-
},
|
9
|
-
"lastSubmitted": {
|
10
|
-
"description": "Timestamp when the sitemap was last submitted",
|
11
|
-
"type": ["null", "string"],
|
12
|
-
"format": "date-time"
|
13
|
-
},
|
14
|
-
"isPending": {
|
15
|
-
"description": "Flag indicating if the sitemap is pending for processing",
|
16
|
-
"type": ["null", "boolean"]
|
17
|
-
},
|
18
|
-
"isSitemapsIndex": {
|
19
|
-
"description": "Flag indicating if the data represents a sitemap index",
|
20
|
-
"type": ["null", "boolean"]
|
21
|
-
},
|
22
|
-
"type": {
|
23
|
-
"description": "Type of the sitemap",
|
24
|
-
"type": ["null", "string"]
|
25
|
-
},
|
26
|
-
"lastDownloaded": {
|
27
|
-
"description": "Timestamp when the sitemap was last downloaded",
|
28
|
-
"type": ["null", "string"],
|
29
|
-
"format": "date-time"
|
30
|
-
},
|
31
|
-
"warnings": {
|
32
|
-
"description": "Warnings encountered while processing the sitemaps",
|
33
|
-
"type": ["null", "string"]
|
34
|
-
},
|
35
|
-
"errors": {
|
36
|
-
"description": "Errors encountered while processing the sitemaps",
|
37
|
-
"type": ["null", "string"]
|
38
|
-
},
|
39
|
-
"contents": {
|
40
|
-
"description": "Data related to the sitemap contents",
|
41
|
-
"type": "array",
|
42
|
-
"items": {
|
43
|
-
"type": "object",
|
44
|
-
"properties": {
|
45
|
-
"type": {
|
46
|
-
"description": "Type of the sitemap content",
|
47
|
-
"type": ["null", "string"]
|
48
|
-
},
|
49
|
-
"submitted": {
|
50
|
-
"description": "Number of submitted sitemap URLs",
|
51
|
-
"type": ["null", "string"]
|
52
|
-
},
|
53
|
-
"indexed": {
|
54
|
-
"description": "Number of indexed sitemap URLs",
|
55
|
-
"type": ["null", "string"]
|
56
|
-
}
|
57
|
-
}
|
58
|
-
}
|
59
|
-
}
|
60
|
-
}
|
61
|
-
}
|
@@ -1,14 +0,0 @@
|
|
1
|
-
{
|
2
|
-
"$schema": "http://json-schema.org/draft-07/schema#",
|
3
|
-
"type": "object",
|
4
|
-
"properties": {
|
5
|
-
"siteUrl": {
|
6
|
-
"description": "The URL of the site data being fetched",
|
7
|
-
"type": ["null", "string"]
|
8
|
-
},
|
9
|
-
"permissionLevel": {
|
10
|
-
"description": "The user's permission level for the site (owner, full, restricted, etc.)",
|
11
|
-
"type": ["null", "string"]
|
12
|
-
}
|
13
|
-
}
|
14
|
-
}
|
File without changes
|