airbyte-source-google-search-console 1.8.0__py3-none-any.whl → 1.9.1__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- {airbyte_source_google_search_console-1.8.0.dist-info → airbyte_source_google_search_console-1.9.1.dist-info}/METADATA +1 -1
- {airbyte_source_google_search_console-1.8.0.dist-info → airbyte_source_google_search_console-1.9.1.dist-info}/RECORD +7 -8
- source_google_search_console/components.py +105 -3
- source_google_search_console/manifest.yaml +159 -7
- source_google_search_console/source.py +10 -15
- source_google_search_console/streams.py +0 -347
- {airbyte_source_google_search_console-1.8.0.dist-info → airbyte_source_google_search_console-1.9.1.dist-info}/WHEEL +0 -0
- {airbyte_source_google_search_console-1.8.0.dist-info → airbyte_source_google_search_console-1.9.1.dist-info}/entry_points.txt +0 -0
{airbyte_source_google_search_console-1.8.0.dist-info → airbyte_source_google_search_console-1.9.1.dist-info}/RECORD

@@ -1,14 +1,13 @@
 source_google_search_console/__init__.py,sha256=HQCPu-CK7XmVDtP9rmTdB2XyraVCc6pv9pw38-O8y48,1191
-source_google_search_console/components.py,sha256=
+source_google_search_console/components.py,sha256=_6gl-xgwmvRgChEGNZhMsuYAcTw7gIf3yfe7IKvWkPk,6767
 source_google_search_console/config_migrations.py,sha256=Cl4SUdJpAf6wMM_vVhqjjU89NfUq9LIGJ9zNrWiBk-A,4235
 source_google_search_console/exceptions.py,sha256=iD3jYC4WxVCEKGsqQ7Vaj1tbjhJZ4S5mnSDnwFJdsIQ,1097
-source_google_search_console/manifest.yaml,sha256=
+source_google_search_console/manifest.yaml,sha256=luNdDPiw_wOoZPFEHFiO7sSUH-AG_cpBiRhPses7vGQ,55028
 source_google_search_console/run.py,sha256=TBkPlseTERarkj6wL8AMEKgm5Xsb2drnltPVH6257-M,2195
 source_google_search_console/service_account_authenticator.py,sha256=pAWKAXfwfTY3xkXvQJH0EyFphFULdCIcC47YXYTO9X8,1307
-source_google_search_console/source.py,sha256=
+source_google_search_console/source.py,sha256=TFuafbBg8Nlb-LsYwNSXH1Xcz9U0ELZRX0e3hjCKPzs,7669
 source_google_search_console/spec.json,sha256=HebTMHop7twcJH6jjgAccYEgg93bTLGsp6jVdMj1x0c,9409
-source_google_search_console/streams.py,sha256=
-airbyte_source_google_search_console-1.
-airbyte_source_google_search_console-1.
-airbyte_source_google_search_console-1.
-airbyte_source_google_search_console-1.8.0.dist-info/RECORD,,
+airbyte_source_google_search_console-1.9.1.dist-info/METADATA,sha256=6dftcAmz795H8LN_VxuYk9xXCEN9uhVfoK5sRBHl-No,5621
+airbyte_source_google_search_console-1.9.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+airbyte_source_google_search_console-1.9.1.dist-info/entry_points.txt,sha256=DMcgc9bCX-Vt6hm_68pa77qS3eGdeMhg-UdlFc-XKUM,85
+airbyte_source_google_search_console-1.9.1.dist-info/RECORD,,
source_google_search_console/components.py

@@ -2,10 +2,13 @@
 # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
 #
 
-from dataclasses import dataclass
-from typing import Any, Mapping, Optional
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Mapping, Optional
 
 from airbyte_cdk.sources.declarative.migrations.state_migration import StateMigration
+from airbyte_cdk.sources.declarative.schema import SchemaLoader
+from airbyte_cdk.sources.declarative.transformations import RecordTransformation
+from airbyte_cdk.sources.types import Config, StreamSlice, StreamState
 
 
 @dataclass
@@ -59,7 +62,6 @@ class NestedSubstreamStateMigration(StateMigration):
         return len(stream_state) > 0 and "states" not in stream_state
 
     def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]:
-        global_state: Optional[Mapping[str, Any]] = None
         per_partition_state = []
         for site_url_key, search_type_state in stream_state.items():
             if site_url_key == "date":
@@ -79,3 +81,103 @@ class NestedSubstreamStateMigration(StateMigration):
             "use_global_cursor": False,
             "states": per_partition_state,
         }
+
+
+@dataclass
+class CustomReportExtractDimensionsFromKeys(RecordTransformation):
+    """
+    A record transformation that remaps each value in the keys array back to its associated
+    dimension. The reason this is a custom component is that we're unable to use list
+    comprehension, and enumerate() is not a valid function in our Jinja context, so we can't
+    iterate over the dimensions defined in the config to create each field transformation on the
+    stream_template for each custom report.
+
+    If we were able to, the actual ComponentMappingDefinition would look like this:
+
+    type: ComponentMappingDefinition
+    field_path:
+      - transformations
+      - "1"
+      - fields
+    value: "{{ [{'path': [dimension], 'value': '{{ record['keys'][index]} for index, dimension in enumerate(components_values['dimensions'])] }}"
+
+    or
+
+    type: ComponentMappingDefinition
+    field_path:
+      - transformations
+      - "1"
+      - fields
+    value: >
+      {% for index, dimension in enumerate(components_values["dimensions"]) %}
+      - type: AddFields
+        fields:
+          - path: [ {{ dimension }} ]
+            value: "{{ record['keys'][index] }}"
+      {% endfor %}
+    """
+
+    dimensions: List[str] = field(default_factory=lambda: [])
+
+    def transform(
+        self,
+        record: Dict[str, Any],
+        config: Optional[Config] = None,
+        stream_state: Optional[StreamState] = None,
+        stream_slice: Optional[StreamSlice] = None,
+    ) -> None:
+        for dimension in self.dimensions:
+            record[dimension] = record["keys"].pop(0)
+
+        record.pop("keys")
+
+
+@dataclass
+class CustomReportSchemaLoader(SchemaLoader):
+    """
+    A custom schema loader is needed for Google Search Console's custom report streams
+    because the schema depends on which dimensions are selected in the config. Right now,
+    only DynamicSchemaLoader, which is based on the response from an API endpoint, supports
+    remapping of types to Airbyte schema types. This CustomReportSchemaLoader functions
+    more like a static schema loader, so we must perform the remapping in this custom component.
+    """
+
+    DIMENSION_TO_PROPERTY_SCHEMA_MAP = {
+        "country": [{"country": {"type": ["null", "string"]}}],
+        "date": [{"date": {"type": ["null", "string"], "format": "date"}}],
+        "device": [{"device": {"type": ["null", "string"]}}],
+        "page": [{"page": {"type": ["null", "string"]}}],
+        "query": [{"query": {"type": ["null", "string"]}}],
+    }
+
+    dimensions: List[str]
+
+    def get_json_schema(self) -> Mapping[str, Any]:
+        schema: Mapping[str, Any] = {
+            "$schema": "https://json-schema.org/draft-07/schema#",
+            "type": ["null", "object"],
+            "additionalProperties": True,
+            "properties": {
+                # metrics
+                "clicks": {"type": ["null", "integer"]},
+                "ctr": {"type": ["null", "number"], "multipleOf": 1e-25},
+                "impressions": {"type": ["null", "integer"]},
+                "position": {"type": ["null", "number"], "multipleOf": 1e-25},
+                # default fields
+                "search_type": {"type": ["null", "string"]},
+                "site_url": {"type": ["null", "string"]},
+            },
+        }
+
+        # dimensions
+        dimension_properties = self._dimension_to_property_schema()
+        schema["properties"].update(dimension_properties)
+        return schema
+
+    def _dimension_to_property_schema(self) -> dict:
+        properties = {}
+        for dimension in sorted(self.dimensions):
+            fields = self.DIMENSION_TO_PROPERTY_SCHEMA_MAP[dimension]
+            for field in fields:
+                properties = {**properties, **field}
+        return properties
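
To make the behaviour of the two new components concrete, here is a small standalone sketch that mirrors their logic on made-up inputs. It does not import the connector or the Airbyte CDK; the sample record and dimension list are illustrative only.

from typing import Any, Dict, List

# Mirrors CustomReportExtractDimensionsFromKeys.transform: each value in the
# "keys" array is written back to its matching dimension, in config order.
def extract_dimensions(record: Dict[str, Any], dimensions: List[str]) -> Dict[str, Any]:
    for dimension in dimensions:
        record[dimension] = record["keys"].pop(0)
    record.pop("keys")  # the raw keys array is dropped once remapped
    return record

# Mirrors CustomReportSchemaLoader._dimension_to_property_schema: one schema
# property per selected dimension, using the same type mapping as above.
DIMENSION_TO_PROPERTY_SCHEMA_MAP = {
    "country": [{"country": {"type": ["null", "string"]}}],
    "date": [{"date": {"type": ["null", "string"], "format": "date"}}],
    "device": [{"device": {"type": ["null", "string"]}}],
    "page": [{"page": {"type": ["null", "string"]}}],
    "query": [{"query": {"type": ["null", "string"]}}],
}

def dimension_properties(dimensions: List[str]) -> Dict[str, Any]:
    properties: Dict[str, Any] = {}
    for dimension in sorted(dimensions):
        for entry in DIMENSION_TO_PROPERTY_SCHEMA_MAP[dimension]:
            properties.update(entry)
    return properties

row = {"keys": ["2024-01-01", "usa"], "clicks": 3, "impressions": 120}
print(extract_dimensions(row, ["date", "country"]))
# {'date': '2024-01-01', 'country': 'usa', 'clicks': 3, 'impressions': 120}
print(dimension_properties(["date", "country"]))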
source_google_search_console/manifest.yaml

@@ -58,6 +58,19 @@ definitions:
           type: "{{ stream_partition.get('search_type') }}"
           aggregationType: "{{ 'auto' if config.get('always_use_aggregation_type_auto') else parameters.get('aggregationType') }}"
           dataState: "{{ config.get('data_state', 'final') }}"
+        error_handler:
+          type: DefaultErrorHandler
+          response_filters:
+            - type: HttpResponseFilter
+              action: RATE_LIMITED
+              error_message_contains: "Search Analytics QPS quota exceeded"
+            - type: HttpResponseFilter
+              action: FAIL
+              http_codes:
+                - 400
+              error_message: >-
+                Invalid aggregationType '{{ parameters.get('aggregationType') }}' used in the body of the API request. If you see this error, enable the
+                'always_use_aggregation_type_auto' config setting which will automatically use aggregationType=auto
       paginator:
         type: DefaultPaginator
         page_token_option:
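
The filters above tell the declarative framework how to classify responses: a body containing the QPS-quota message is treated as rate limiting, and an HTTP 400 fails with a hint about `always_use_aggregation_type_auto`. A rough standalone sketch of that classification logic follows; this is not the CDK's actual implementation, and the response values are invented.

from dataclasses import dataclass

@dataclass
class FakeResponse:  # hypothetical stand-in for an HTTP response
    status_code: int
    text: str

def classify(response: FakeResponse) -> str:
    # Mirrors the first HttpResponseFilter: message match -> RATE_LIMITED
    if "Search Analytics QPS quota exceeded" in response.text:
        return "RATE_LIMITED"
    # Mirrors the second HttpResponseFilter: HTTP 400 -> FAIL with a config hint
    if response.status_code == 400:
        return "FAIL: invalid aggregationType; enable 'always_use_aggregation_type_auto'"
    return "OK"

print(classify(FakeResponse(429, "Search Analytics QPS quota exceeded ...")))  # RATE_LIMITED
print(classify(FakeResponse(400, "Bad request")))                              # FAIL: ...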
@@ -605,8 +618,8 @@ definitions:
         request_headers:
           Content-Type: "application/json"
         request_body_json:
-          startDate: "{{ config.get('start_date') }}"
-          endDate: "{{ config.get('end_date') }}"
+          startDate: "{{ config.get('start_date', '2021-01-01') }}"
+          endDate: "{{ config.get('end_date', today_utc()) }}"
           dimensions: ["searchAppearance"]
           type: "{{ stream_partition.get('search_type') }}"
           aggregationType: auto
@@ -675,6 +688,9 @@ definitions:
         error_handler:
           type: DefaultErrorHandler
           response_filters:
+            - type: HttpResponseFilter
+              action: RATE_LIMITED
+              error_message_contains: "Search Analytics QPS quota exceeded"
            - type: HttpResponseFilter
              action: FAIL
              http_codes:
@@ -897,6 +913,135 @@ streams:
   - "#/definitions/search_analytics_keyword_site_report_by_page_stream"
   - "#/definitions/search_analytics_keyword_site_report_by_site_stream"
 
+dynamic_streams:
+  - type: DynamicDeclarativeStream
+    stream_template:
+      type: DeclarativeStream
+      name: search_analytics_by_custom_dimensions # This will be replaced by the name of the custom report
+      primary_key: # This will be replaced by the dimensions of the custom report
+        - site_url
+        - search_type
+      retriever:
+        type: SimpleRetriever
+        requester:
+          type: HttpRequester
+          url_base: https://www.googleapis.com/webmasters/v3
+          path: "/sites/{{ sanitize_url(stream_partition.get('site_url')) }}/searchAnalytics/query"
+          http_method: POST
+          authenticator: "#/definitions/selective_authenticator"
+          request_headers:
+            Content-Type: "application/json"
+          request_body_json:
+            startDate: "{{ stream_interval.get('start_time') }}"
+            endDate: "{{ stream_interval.get('end_time') }}"
+            dimensions: ["date", "country"] # This will be replaced by the dimensions of the custom report
+            type: "{{ stream_partition.get('search_type') }}"
+            aggregationType: auto
+            dataState: "{{ config.get('data_state', 'final') }}"
+        paginator:
+          type: DefaultPaginator
+          page_token_option:
+            type: RequestOption
+            field_name: startRow
+            inject_into: body_json
+          page_size_option:
+            type: RequestOption
+            field_name: rowLimit
+            inject_into: body_json
+          pagination_strategy:
+            type: OffsetIncrement
+            page_size: 25000
+            inject_on_first_request: true
+        record_selector:
+          type: RecordSelector
+          extractor:
+            type: DpathExtractor
+            field_path:
+              - rows
+        partition_router:
+          - type: ListPartitionRouter
+            values: "{{ config['site_urls'] }}"
+            cursor_field: site_url
+          - type: ListPartitionRouter
+            values:
+              - web
+              - news
+              - image
+              - video
+            cursor_field: search_type
+      incremental_sync:
+        type: DatetimeBasedCursor
+        cursor_field: date
+        cursor_datetime_formats:
+          - "%Y-%m-%d"
+        datetime_format: "%Y-%m-%d"
+        start_datetime:
+          type: MinMaxDatetime
+          datetime: "{{ config.get('start_date', '2021-01-01') }}"
+          datetime_format: "%Y-%m-%d"
+        end_datetime:
+          type: MinMaxDatetime
+          datetime: "{{ config.get('end_date', today_utc()) }}"
+          datetime_format: "%Y-%m-%d"
+        step: P3D
+        cursor_granularity: P1D
+      transformations:
+        - type: AddFields
+          fields:
+            - path:
+                - site_url
+              value: "{{ stream_partition['site_url'] }}"
+            - path:
+                - search_type
+              value: "{{ stream_partition['search_type'] }}"
+        - type: CustomTransformation
+          class_name: source_google_search_console.components.CustomReportExtractDimensionsFromKeys
+          dimensions: # This will be replaced by the dimensions of the custom report
+            - date
+            - country
+      schema_loader:
+        type: CustomSchemaLoader
+        class_name: source_google_search_console.components.CustomReportSchemaLoader
+        dimensions: [] # This will be replaced by the dimensions of the custom report
+      state_migrations:
+        - type: CustomStateMigration
+          class_name: source_google_search_console.components.NestedSubstreamStateMigration
+    components_resolver:
+      type: ConfigComponentsResolver
+      stream_config:
+        type: StreamConfig
+        configs_pointer:
+          - custom_reports_array
+      components_mapping:
+        - type: ComponentMappingDefinition
+          field_path:
+            # - "**" # is this needed
+            - name
+          value: "{{components_values['name']}}"
+        - type: ComponentMappingDefinition
+          field_path:
+            - primary_key
+          value: "{{ components_values['dimensions'] + (['date'] if 'date' not in components_values['dimensions'] else []) + ['site_url', 'search_type'] }}"
+        - type: ComponentMappingDefinition
+          field_path:
+            - retriever
+            - requester
+            - request_body_json
+            - dimensions
+          # `date` is a cursor field therefore should be a mandatory dimension if not already present
+          value: "{{ components_values['dimensions'] + (['date'] if 'date' not in components_values['dimensions'] else []) }}"
+        - type: ComponentMappingDefinition
+          field_path:
+            - transformations
+            - "1"
+            - dimensions
+          value: "{{ components_values['dimensions'] + (['date'] if 'date' not in components_values['dimensions'] else []) }}"
+        - type: ComponentMappingDefinition
+          field_path:
+            - schema_loader
+            - dimensions
+          value: "{{ components_values['dimensions'] + (['date'] if 'date' not in components_values['dimensions'] else []) }}"
+
 # Google Search Console has three layers of quotas that dictate rate limiting at the
 # user making requests, site being requested, and developer console key used.
 # https://developers.google.com/webmaster-tools/limits#qps-quota
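
The components_mapping entries above derive the per-report values from each item in the connector's custom_reports_array config. Below is a small sketch of how those Jinja expressions evaluate for a sample report, with plain Python standing in for the template engine; the report values are made up.

from typing import Any, Dict, List

def resolve_custom_report(report: Dict[str, Any]) -> Dict[str, Any]:
    dims: List[str] = list(report["dimensions"])
    # `date` is the cursor field, so it is appended when not already selected
    dims_with_date = dims + (["date"] if "date" not in dims else [])
    return {
        "name": report["name"],
        "primary_key": dims_with_date + ["site_url", "search_type"],
        "request_dimensions": dims_with_date,  # injected into request_body_json.dimensions
        "schema_dimensions": dims_with_date,   # injected into the CustomSchemaLoader
    }

print(resolve_custom_report({"name": "pages_by_device", "dimensions": ["page", "device"]}))
# {'name': 'pages_by_device',
#  'primary_key': ['page', 'device', 'date', 'site_url', 'search_type'],
#  'request_dimensions': ['page', 'device', 'date'],
#  'schema_dimensions': ['page', 'device', 'date']}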
@@ -905,13 +1050,20 @@ streams:
 # - Per Project Quota: 30,000,000 req/day (350 req/sec) / 40,000 req/min (60 req/sec)
 #
 # The most likely upper bound is based on the user quota since it is the lowest and the
-# same authenticated user account may hit multiple site urls.
-
-
+# same authenticated user account may hit multiple site urls.
+api_budget:
+  type: HTTPAPIBudget
+  policies:
+    - type: MovingWindowCallRatePolicy
+      rates:
+        - limit: 1200
+          interval: PT1M
+      matchers: []
+
 
 concurrency_level:
   type: ConcurrencyLevel
-  default_concurrency: "{{ config.get('num_workers',
-  max_concurrency:
+  default_concurrency: "{{ config.get('num_workers', 3) }}"
+  max_concurrency: 50
 
 schemas:
   search_analytics_all_fields:
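
The HTTPAPIBudget added above caps the connector at 1,200 calls per rolling minute across all requests (empty matchers). A rough standalone sketch of the moving-window idea it relies on is shown below; this is not the CDK class, just an illustration of the policy semantics.

import time
from collections import deque

class MovingWindowLimiter:
    """Allows at most `limit` calls within the trailing `interval` seconds."""

    def __init__(self, limit: int = 1200, interval: float = 60.0):
        self.limit = limit
        self.interval = interval
        self.calls: deque = deque()

    def acquire(self) -> None:
        now = time.monotonic()
        # Drop timestamps that have fallen out of the trailing window
        while self.calls and now - self.calls[0] >= self.interval:
            self.calls.popleft()
        if len(self.calls) >= self.limit:
            # Sleep until the oldest call leaves the window
            time.sleep(self.interval - (now - self.calls[0]))
        self.calls.append(time.monotonic())

limiter = MovingWindowLimiter()
limiter.acquire()  # call once before each API request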
source_google_search_console/source.py

@@ -22,11 +22,16 @@ from source_google_search_console.exceptions import (
     UnidentifiedError,
 )
 from source_google_search_console.service_account_authenticator import ServiceAccountAuthenticator
-from source_google_search_console.streams import (
-    SearchAnalyticsByCustomDimensions,
-)
 
 
+DIMENSION_TO_PROPERTY_SCHEMA_MAP = {
+    "country": [{"country": {"type": ["null", "string"]}}],
+    "date": [{"date": {"type": ["null", "string"], "format": "date"}}],
+    "device": [{"device": {"type": ["null", "string"]}}],
+    "page": [{"page": {"type": ["null", "string"]}}],
+    "query": [{"query": {"type": ["null", "string"]}}],
+}
+
 custom_reports_schema = {
     "type": "array",
     "items": {
@@ -93,7 +98,7 @@ class SourceGoogleSearchConsole(YamlDeclarativeSource):
         jsonschema.validate(config["custom_reports_array"], custom_reports_schema)
         for report in config["custom_reports_array"]:
             for dimension in report["dimensions"]:
-                if dimension not in
+                if dimension not in DIMENSION_TO_PROPERTY_SCHEMA_MAP:
                     message = f"dimension: '{dimension}' not found"
                     raise AirbyteTracedException(message=message, internal_message=message, failure_type=FailureType.config_error)
         return config
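
With DIMENSION_TO_PROPERTY_SCHEMA_MAP now defined at module level, the check above rejects any custom-report dimension outside the five supported keys. A minimal sketch of that validation path follows; the sample reports are invented and a ValueError stands in for the AirbyteTracedException raised by the connector.

SUPPORTED_DIMENSIONS = {"country", "date", "device", "page", "query"}  # keys of DIMENSION_TO_PROPERTY_SCHEMA_MAP

def validate_custom_reports(custom_reports_array):
    for report in custom_reports_array:
        for dimension in report["dimensions"]:
            if dimension not in SUPPORTED_DIMENSIONS:
                # the connector raises AirbyteTracedException with failure_type=config_error here
                raise ValueError(f"dimension: '{dimension}' not found")
    return custom_reports_array

validate_custom_reports([{"name": "ok", "dimensions": ["page", "device"]}])        # passes
# validate_custom_reports([{"name": "bad", "dimensions": ["search_appearance"]}])  # would raise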
@@ -137,17 +142,7 @@ class SourceGoogleSearchConsole(YamlDeclarativeSource):
         config = self._validate_and_transform(config)
         stream_config = self.get_stream_kwargs(config)
 
-
-
-        streams = streams + self.get_custom_reports(config=config, stream_config=stream_config)
-
-        return streams
-
-    def get_custom_reports(self, config: Mapping[str, Any], stream_config: Mapping[str, Any]) -> List[Optional[Stream]]:
-        return [
-            type(report["name"], (SearchAnalyticsByCustomDimensions,), {})(dimensions=report["dimensions"], **stream_config)
-            for report in config.get("custom_reports_array", [])
-        ]
+        return super().streams(config=config)
 
     def get_stream_kwargs(self, config: Mapping[str, Any]) -> Mapping[str, Any]:
         return {
source_google_search_console/streams.py

@@ -1,347 +0,0 @@
-#
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
-#
-
-from abc import ABC
-from enum import Enum
-from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Union
-from urllib.parse import quote_plus, unquote_plus
-
-import pendulum
-import requests
-from requests.auth import AuthBase
-
-from airbyte_cdk.models import SyncMode
-from airbyte_cdk.sources.streams import CheckpointMixin
-from airbyte_cdk.sources.streams.http import HttpStream
-
-
-BASE_URL = "https://www.googleapis.com/webmasters/v3/"
-ROW_LIMIT = 25000
-
-
-class QueryAggregationType(Enum):
-    auto = "auto"
-    by_page = "byPage"
-    by_property = "byProperty"
-
-
-class GoogleSearchConsole(HttpStream, ABC):
-    url_base = BASE_URL
-    data_field = ""
-    raise_on_http_errors = True
-
-    def __init__(
-        self,
-        authenticator: AuthBase,
-        site_urls: list,
-        start_date: str,
-        end_date: str,
-        data_state: str = "final",
-    ):
-        super().__init__(authenticator=authenticator)
-        self._site_urls = self.sanitize_urls_list(site_urls)
-        self._start_date = start_date
-        self._end_date = end_date
-        self._data_state = data_state
-
-    @staticmethod
-    def sanitize_urls_list(site_urls: list) -> List[str]:
-        return list(map(quote_plus, site_urls))
-
-    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
-        return None
-
-    def stream_slices(
-        self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
-    ) -> Iterable[Optional[Mapping[str, Any]]]:
-        for site_url in self._site_urls:
-            yield {"site_url": site_url}
-
-    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
-        if not self.data_field:
-            yield response.json()
-
-        else:
-            records = response.json().get(self.data_field) or []
-            for record in records:
-                yield record
-
-    def should_retry(self, response: requests.Response) -> bool:
-        response_json = response.json()
-        if "error" in response_json:
-            error = response_json.get("error", {})
-            # handle the `HTTP-403` - insufficient permissions
-            if error.get("code", 0) == 403:
-                self.logger.error(f"Stream {self.name}. {error.get('message')}. Skipping.")
-                setattr(self, "raise_on_http_errors", False)
-                return False
-            # handle the `HTTP-400` - Bad query params with `aggregationType`
-            if error.get("code", 0) == 400:
-                self.logger.error(f"Stream `{self.name}`. {error.get('message')}. Trying with `aggregationType = auto` instead.")
-                self.aggregation_type = QueryAggregationType.auto
-                setattr(self, "raise_on_http_errors", False)
-        return response.status_code == 429 or 500 <= response.status_code < 600
-
-
-class SearchAnalytics(GoogleSearchConsole, CheckpointMixin, ABC):
-    """
-    API docs: https://developers.google.com/webmaster-tools/search-console-api-original/v3/searchanalytics
-    """
-
-    data_field = "rows"
-    aggregation_type = QueryAggregationType.auto
-    start_row = 0
-    dimensions = []
-    search_types = ["web", "news", "image", "video"]
-    range_of_days = 3
-
-    def __init__(self, authenticator: AuthBase, site_urls: list, start_date: str, end_date: str, data_state: str = "final", **kwargs):
-        super().__init__(authenticator=authenticator, site_urls=site_urls, start_date=start_date, end_date=end_date, data_state=data_state)
-        self._state = {}
-
-    def path(
-        self,
-        stream_state: Mapping[str, Any] = None,
-        stream_slice: Mapping[str, Any] = None,
-        next_page_token: Mapping[str, Any] = None,
-    ) -> str:
-        return f"sites/{stream_slice.get('site_url')}/searchAnalytics/query"
-
-    @property
-    def cursor_field(self) -> Union[str, List[str]]:
-        return "date"
-
-    @property
-    def http_method(self) -> str:
-        return "POST"
-
-    @property
-    def state(self) -> MutableMapping[str, Any]:
-        return self._state
-
-    @state.setter
-    def state(self, value: MutableMapping[str, Any]):
-        self._state = value
-
-    def stream_slices(
-        self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
-    ) -> Iterable[Optional[Mapping[str, Any]]]:
-        """
-        The `stream_slices` implements iterator functionality for `site_urls` and `searchType`. The user can pass many `site_url`,
-        and we have to process all of them, we can also pass the` searchType` parameter in the `request body` to get data using some`
-        searchType` value from [` web`, `news `,` image`, `video`, `discover`, `googleNews`].
-        It's just a double nested loop with a yield statement.
-        """
-
-        for site_url in self._site_urls:
-            for search_type in self.search_types:
-                start_date = self._get_start_date(stream_state, site_url, search_type)
-                end_date = self._get_end_date()
-
-                if start_date > end_date:
-                    start_date = end_date
-
-                next_start = start_date
-                period = pendulum.Duration(days=self.range_of_days - 1)
-                while next_start <= end_date:
-                    next_end = min(next_start + period, end_date)
-                    yield {
-                        "site_url": site_url,
-                        "search_type": search_type,
-                        "start_date": next_start.to_date_string(),
-                        "end_date": next_end.to_date_string(),
-                        "data_state": self._data_state,
-                    }
-                    # add 1 day for the next slice's start date not to duplicate data from previous slice's end date.
-                    next_start = next_end + pendulum.Duration(days=1)
-
-    def next_page_token(self, response: requests.Response) -> Optional[bool]:
-        """
-        The `next_page_token` implements pagination functionality. This method gets the response
-        and compares the number of records with the constant `ROW_LIMITS` (maximum value 25000),
-        and if they are equal, this means that we get the end of the` Page`, and we need to go further,
-        for this we simply increase the `startRow` parameter in request body by `ROW_LIMIT` value.
-        """
-
-        if len(response.json().get(self.data_field, [])) == ROW_LIMIT:
-            self.start_row += ROW_LIMIT
-            return True
-
-        self.start_row = 0
-
-    def request_headers(self, **kwargs) -> Mapping[str, Any]:
-        return {"Content-Type": "application/json"}
-
-    def request_body_json(
-        self,
-        stream_state: Mapping[str, Any] = None,
-        stream_slice: Mapping[str, Any] = None,
-        next_page_token: Mapping[str, Any] = None,
-    ) -> Optional[Union[Dict[str, Any], str]]:
-        """
-        Here is a description of the parameters and implementations of the request body:
-        1. The `startDate` is retrieved from the `_get_start_date`,
-        if` SyncMode = full_refresh` just use `start_date` from configuration, otherwise use `get_update_state`.
-        2. The `endDate` is retrieved from the `config.json`.
-        3. The `sizes` parameter is used to group the result by some dimension.
-        The following dimensions are available: `date`, `country`, `page`, `device`, `query`.
-        4. For the `type` check the paragraph stream_slices method.
-        Filter results to the following type ["web", "news", "image", "video", "discover", "googleNews"]
-        5. For the `startRow` and `rowLimit` check next_page_token method.
-        """
-
-        data = {
-            "startDate": stream_slice["start_date"],
-            "endDate": stream_slice["end_date"],
-            "dimensions": self.dimensions,
-            "type": stream_slice.get("search_type"),
-            "aggregationType": self.aggregation_type.value,
-            "startRow": self.start_row,
-            "rowLimit": ROW_LIMIT,
-            "dataState": stream_slice.get("data_state"),
-        }
-
-        return data
-
-    def _get_end_date(self) -> pendulum.date:
-        end_date = pendulum.parse(self._end_date).date()
-        # limit `end_date` value with current date
-        return min(end_date, pendulum.now().date())
-
-    def _get_start_date(self, stream_state: Mapping[str, Any] = None, site_url: str = None, search_type: str = None) -> pendulum.date:
-        start_date = pendulum.parse(self._start_date)
-
-        if start_date and stream_state:
-            if stream_state.get(unquote_plus(site_url), {}).get(search_type):
-                stream_state_value = stream_state.get(unquote_plus(site_url), {}).get(search_type)
-
-                start_date = max(
-                    pendulum.parse(stream_state_value[self.cursor_field]),
-                    start_date,
-                )
-
-        return start_date.date()
-
-    def parse_response(
-        self,
-        response: requests.Response,
-        stream_state: Mapping[str, Any],
-        stream_slice: Mapping[str, Any] = None,
-        next_page_token: Mapping[str, Any] = None,
-    ) -> Iterable[Mapping]:
-        records = response.json().get(self.data_field) or []
-
-        for record in records:
-            record["site_url"] = unquote_plus(stream_slice.get("site_url"))
-            record["search_type"] = stream_slice.get("search_type")
-
-            for dimension in self.dimensions:
-                record[dimension] = record["keys"].pop(0)
-
-            # remove unnecessary empty field
-            record.pop("keys")
-
-            yield record
-
-    def _get_updated_state(
-        self,
-        current_stream_state: MutableMapping[str, Any],
-        latest_record: Mapping[str, Any],
-    ) -> Mapping[str, Any]:
-        """
-        With the existing nested loop implementation, we have to store a `cursor_field` for each `site_url`
-        and `searchType`. This functionality is placed in `get_update_state`.
-
-        {
-          "stream": {
-            "https://domain1.com": {
-              "web": {"date": "2022-01-03"},
-              "news": {"date": "2022-01-03"},
-              "image": {"date": "2022-01-03"},
-              "video": {"date": "2022-01-03"}
-            },
-            "https://domain2.com": {
-              "web": {"date": "2022-01-03"},
-              "news": {"date": "2022-01-03"},
-              "image": {"date": "2022-01-03"},
-              "video": {"date": "2022-01-03"}
-            },
-            "date": "2022-01-03",
-          }
-        }
-        """
-
-        latest_benchmark = latest_record.get(self.cursor_field)
-
-        site_url = latest_record.get("site_url")
-        search_type = latest_record.get("search_type")
-
-        value = current_stream_state.get(site_url, {}).get(search_type, {}).get(self.cursor_field)
-        if value:
-            latest_benchmark = max(latest_benchmark, value)
-        current_stream_state.setdefault(site_url, {}).setdefault(search_type, {})[self.cursor_field] = latest_benchmark
-
-        # we need to get the max date over all searchTypes but the current acceptance test YAML format doesn't
-        # support that
-        current_stream_state[self.cursor_field] = current_stream_state[site_url][search_type][self.cursor_field]
-
-        return current_stream_state
-
-    def read_records(self, **kwargs) -> Iterable[Mapping[str, Any]]:
-        for record in super().read_records(**kwargs):
-            self.state = self._get_updated_state(self.state, record)
-            yield record
-
-
-class SearchAnalyticsByCustomDimensions(SearchAnalytics):
-    # `date` is a cursor field therefore should be mandatory
-    DEFAULT_DIMENSIONS = ["date"]
-    DIMENSION_TO_PROPERTY_SCHEMA_MAP = {
-        "country": [{"country": {"type": ["null", "string"]}}],
-        "date": [{"date": {"type": ["null", "string"], "format": "date"}}],
-        "device": [{"device": {"type": ["null", "string"]}}],
-        "page": [{"page": {"type": ["null", "string"]}}],
-        "query": [{"query": {"type": ["null", "string"]}}],
-    }
-
-    primary_key = None
-
-    def __init__(self, dimensions: List[str], *args, **kwargs):
-        super(SearchAnalyticsByCustomDimensions, self).__init__(*args, **kwargs)
-        self.dimensions = dimensions + [dimension for dimension in self.DEFAULT_DIMENSIONS if dimension not in dimensions]
-        # Assign the dimensions as PK for the custom report stream.
-        # Site URL and Search Type are included in the API call thus affect the resulting data.
-        # `site_url` is a required URL param for making API calls;
-        # `search_type` remains a query param for historical reasons, we do not want to remove it to not break existing connections.
-        self.primary_key = self.dimensions + ["site_url", "search_type"]
-
-    def get_json_schema(self) -> Mapping[str, Any]:
-        schema: Mapping[str, Any] = {
-            "$schema": "https://json-schema.org/draft-07/schema#",
-            "type": ["null", "object"],
-            "additionalProperties": True,
-            "properties": {
-                # metrics
-                "clicks": {"type": ["null", "integer"]},
-                "ctr": {"type": ["null", "number"], "multipleOf": 1e-25},
-                "impressions": {"type": ["null", "integer"]},
-                "position": {"type": ["null", "number"], "multipleOf": 1e-25},
-                # default fields
-                "search_type": {"type": ["null", "string"]},
-                "site_url": {"type": ["null", "string"]},
-            },
-        }
-
-        # dimensions
-        dimension_properties = self.dimension_to_property_schema()
-        schema["properties"].update(dimension_properties)
-        return schema
-
-    def dimension_to_property_schema(self) -> dict:
-        properties = {}
-        for dimension in sorted(self.dimensions):
-            fields = self.DIMENSION_TO_PROPERTY_SCHEMA_MAP[dimension]
-            for field in fields:
-                properties = {**properties, **field}
-        return properties
File without changes