airbyte-source-google-search-console 1.8.0__tar.gz → 1.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (14)
  1. {airbyte_source_google_search_console-1.8.0 → airbyte_source_google_search_console-1.9.0}/PKG-INFO +1 -1
  2. {airbyte_source_google_search_console-1.8.0 → airbyte_source_google_search_console-1.9.0}/pyproject.toml +1 -1
  3. airbyte_source_google_search_console-1.9.0/source_google_search_console/components.py +183 -0
  4. {airbyte_source_google_search_console-1.8.0 → airbyte_source_google_search_console-1.9.0}/source_google_search_console/manifest.yaml +129 -0
  5. {airbyte_source_google_search_console-1.8.0 → airbyte_source_google_search_console-1.9.0}/source_google_search_console/source.py +10 -15
  6. airbyte_source_google_search_console-1.8.0/source_google_search_console/components.py +0 -81
  7. airbyte_source_google_search_console-1.8.0/source_google_search_console/streams.py +0 -347
  8. {airbyte_source_google_search_console-1.8.0 → airbyte_source_google_search_console-1.9.0}/README.md +0 -0
  9. {airbyte_source_google_search_console-1.8.0 → airbyte_source_google_search_console-1.9.0}/source_google_search_console/__init__.py +0 -0
  10. {airbyte_source_google_search_console-1.8.0 → airbyte_source_google_search_console-1.9.0}/source_google_search_console/config_migrations.py +0 -0
  11. {airbyte_source_google_search_console-1.8.0 → airbyte_source_google_search_console-1.9.0}/source_google_search_console/exceptions.py +0 -0
  12. {airbyte_source_google_search_console-1.8.0 → airbyte_source_google_search_console-1.9.0}/source_google_search_console/run.py +0 -0
  13. {airbyte_source_google_search_console-1.8.0 → airbyte_source_google_search_console-1.9.0}/source_google_search_console/service_account_authenticator.py +0 -0
  14. {airbyte_source_google_search_console-1.8.0 → airbyte_source_google_search_console-1.9.0}/source_google_search_console/spec.json +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: airbyte-source-google-search-console
- Version: 1.8.0
+ Version: 1.9.0
  Summary: Source implementation for Google Search Console.
  License: Elv2
  Author: Airbyte
@@ -5,7 +5,7 @@ requires = [
  build-backend = "poetry.core.masonry.api"

  [tool.poetry]
- version = "1.8.0"
+ version = "1.9.0"
  name = "airbyte-source-google-search-console"
  description = "Source implementation for Google Search Console."
  authors = [
@@ -0,0 +1,183 @@
+ #
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+ #
+
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Mapping, Optional
+
+ from airbyte_cdk.sources.declarative.migrations.state_migration import StateMigration
+ from airbyte_cdk.sources.declarative.schema import SchemaLoader
+ from airbyte_cdk.sources.declarative.transformations import RecordTransformation
+ from airbyte_cdk.sources.types import Config, StreamSlice, StreamState
+
+
+ @dataclass
+ class NestedSubstreamStateMigration(StateMigration):
+     """
+     We require a custom state migration because SearchAnalytics streams contain two nested levels of
+     substreams. The existing LegacyToPerPartitionStateMigration only handles one level.
+
+     Legacy state format is as follows:
+     {
+       "date": "2025-05-28",
+       "https://www.example.com/": {
+         "web": {
+           "date": "2025-05-25"
+         },
+         "news": {
+           "date": "2023-05-22"
+         }
+       }
+     }
+
+     The resulting migrated per-partition state is:
+     {
+       "use_global_cursor": false,
+       "states": [
+         {
+           "partition": {
+             "search_type": "web",
+             "site_url": "https://www.example.com/"
+           },
+           "cursor": {
+             "date": "2025-05-25"
+           }
+         },
+         {
+           "partition": {
+             "search_type": "news",
+             "site_url": "https://www.example.com/"
+           },
+           "cursor": {
+             "date": "2023-05-22"
+           }
+         }],
+       "state": {
+         "date": "2025-05-25"
+       }
+     }
+     """
+
+     def should_migrate(self, stream_state: Mapping[str, Any]) -> bool:
+         return len(stream_state) > 0 and "states" not in stream_state
+
+     def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]:
+         per_partition_state = []
+         for site_url_key, search_type_state in stream_state.items():
+             if site_url_key == "date":
+                 # The legacy state also contains a global cursor value under the `date` key which equates
+                 # to global state.
+                 #
+                 # However, the Python implementation does not appear to be implemented
+                 # correctly and simply saves the state of the last seen partition. Since I don't trust the
+                 # legacy value and in the current implementation global state is applied to partitions
+                 # without an existing value, I'm making a conscious choice to not migrate the global value.
+                 continue
+             else:
+                 site_url = site_url_key
+                 for search_type_key, cursor in search_type_state.items():
+                     per_partition_state.append({"partition": {"site_url": site_url, "search_type": search_type_key}, "cursor": cursor})
+         return {
+             "use_global_cursor": False,
+             "states": per_partition_state,
+         }
+
+
+ @dataclass
+ class CustomReportExtractDimensionsFromKeys(RecordTransformation):
+     """
+     A record transformation that remaps each value in the keys array back to its associated
+     dimension. The reason this is a custom component is that we're unable to use list
+     comprehension, and enumerate() is not a valid function in our Jinja context, so we can't
+     iterate over the dimensions defined in the config to create each field transformation on the
+     stream_template for each custom report.
+
+     If we were able to, the actual ComponentMappingDefinition would look like this:
+
+     type: ComponentMappingDefinition
+     field_path:
+       - transformations
+       - "1"
+       - fields
+     value: "{{ [{'path': [dimension], 'value': '{{ record['keys'][index]} for index, dimension in enumerate(components_values['dimensions'])] }}"
+
+     or
+
+     type: ComponentMappingDefinition
+     field_path:
+       - transformations
+       - "1"
+       - fields
+     value: >
+       {% for index, dimension in enumerate(components_values["dimensions"]) %}
+       - type: AddFields
+         fields:
+           - path: [ {{ dimension }} ]
+             value: "{{ record['keys'][index] }}"
+       {% endfor %}
+     """
+
+     dimensions: List[str] = field(default_factory=lambda: [])
+
+     def transform(
+         self,
+         record: Dict[str, Any],
+         config: Optional[Config] = None,
+         stream_state: Optional[StreamState] = None,
+         stream_slice: Optional[StreamSlice] = None,
+     ) -> None:
+         for dimension in self.dimensions:
+             record[dimension] = record["keys"].pop(0)
+
+         record.pop("keys")
+
+
+ @dataclass
+ class CustomReportSchemaLoader(SchemaLoader):
+     """
+     A custom schema loader is needed for Google Search Console's custom reports streams
+     because the schema depends on which dimensions are selected in the config. Right now,
+     only DynamicSchemaLoader, which builds the schema from an API response, supports
+     remapping of types to Airbyte schema types. This CustomReportSchemaLoader functions
+     more like a static schema loader, so we must perform the remapping in this custom component.
+     """
+
+     DIMENSION_TO_PROPERTY_SCHEMA_MAP = {
+         "country": [{"country": {"type": ["null", "string"]}}],
+         "date": [{"date": {"type": ["null", "string"], "format": "date"}}],
+         "device": [{"device": {"type": ["null", "string"]}}],
+         "page": [{"page": {"type": ["null", "string"]}}],
+         "query": [{"query": {"type": ["null", "string"]}}],
+     }
+
+     dimensions: List[str]
+
+     def get_json_schema(self) -> Mapping[str, Any]:
+         schema: Mapping[str, Any] = {
+             "$schema": "https://json-schema.org/draft-07/schema#",
+             "type": ["null", "object"],
+             "additionalProperties": True,
+             "properties": {
+                 # metrics
+                 "clicks": {"type": ["null", "integer"]},
+                 "ctr": {"type": ["null", "number"], "multipleOf": 1e-25},
+                 "impressions": {"type": ["null", "integer"]},
+                 "position": {"type": ["null", "number"], "multipleOf": 1e-25},
+                 # default fields
+                 "search_type": {"type": ["null", "string"]},
+                 "site_url": {"type": ["null", "string"]},
+             },
+         }
+
+         # dimensions
+         dimension_properties = self._dimension_to_property_schema()
+         schema["properties"].update(dimension_properties)
+         return schema
+
+     def _dimension_to_property_schema(self) -> dict:
+         properties = {}
+         for dimension in sorted(self.dimensions):
+             fields = self.DIMENSION_TO_PROPERTY_SCHEMA_MAP[dimension]
+             for field in fields:
+                 properties = {**properties, **field}
+         return properties
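
The two classes above are wired in through the declarative manifest, but their behavior is easiest to see when called directly. The following sketch is illustrative only (the state and record values are made up); it shows what NestedSubstreamStateMigration.migrate() and CustomReportExtractDimensionsFromKeys.transform() produce:

    # Illustrative sketch, not part of the package diff. Sample values are invented.
    from source_google_search_console.components import (
        CustomReportExtractDimensionsFromKeys,
        NestedSubstreamStateMigration,
    )

    # Legacy nested state keyed by site_url, then search_type.
    legacy_state = {
        "date": "2025-05-28",
        "https://www.example.com/": {
            "web": {"date": "2025-05-25"},
            "news": {"date": "2023-05-22"},
        },
    }

    migration = NestedSubstreamStateMigration()
    assert migration.should_migrate(legacy_state)
    migrated = migration.migrate(legacy_state)
    # migrated == {
    #     "use_global_cursor": False,
    #     "states": [
    #         {"partition": {"site_url": "https://www.example.com/", "search_type": "web"}, "cursor": {"date": "2025-05-25"}},
    #         {"partition": {"site_url": "https://www.example.com/", "search_type": "news"}, "cursor": {"date": "2023-05-22"}},
    #     ],
    # }

    # The transformation maps each entry of the API's `keys` array back onto its dimension.
    record = {"keys": ["2025-05-25", "usa"], "clicks": 3, "impressions": 10}
    CustomReportExtractDimensionsFromKeys(dimensions=["date", "country"]).transform(record)
    # record == {"date": "2025-05-25", "country": "usa", "clicks": 3, "impressions": 10}
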
@@ -897,6 +897,135 @@ streams:
    - "#/definitions/search_analytics_keyword_site_report_by_page_stream"
    - "#/definitions/search_analytics_keyword_site_report_by_site_stream"

+ dynamic_streams:
+   - type: DynamicDeclarativeStream
+     stream_template:
+       type: DeclarativeStream
+       name: search_analytics_by_custom_dimensions # This will be replaced by the name of the custom report
+       primary_key: # This will be replaced by the dimensions of the custom report
+         - site_url
+         - search_type
+       retriever:
+         type: SimpleRetriever
+         requester:
+           type: HttpRequester
+           url_base: https://www.googleapis.com/webmasters/v3
+           path: "/sites/{{ sanitize_url(stream_partition.get('site_url')) }}/searchAnalytics/query"
+           http_method: POST
+           authenticator: "#/definitions/selective_authenticator"
+           request_headers:
+             Content-Type: "application/json"
+           request_body_json:
+             startDate: "{{ stream_interval.get('start_time') }}"
+             endDate: "{{ stream_interval.get('end_time') }}"
+             dimensions: ["date", "country"] # This will be replaced by the dimensions of the custom report
+             type: "{{ stream_partition.get('search_type') }}"
+             aggregationType: auto
+             dataState: "{{ config.get('data_state', 'final') }}"
+         paginator:
+           type: DefaultPaginator
+           page_token_option:
+             type: RequestOption
+             field_name: startRow
+             inject_into: body_json
+           page_size_option:
+             type: RequestOption
+             field_name: rowLimit
+             inject_into: body_json
+           pagination_strategy:
+             type: OffsetIncrement
+             page_size: 25000
+             inject_on_first_request: true
+         record_selector:
+           type: RecordSelector
+           extractor:
+             type: DpathExtractor
+             field_path:
+               - rows
+         partition_router:
+           - type: ListPartitionRouter
+             values: "{{ config['site_urls'] }}"
+             cursor_field: site_url
+           - type: ListPartitionRouter
+             values:
+               - web
+               - news
+               - image
+               - video
+             cursor_field: search_type
+       incremental_sync:
+         type: DatetimeBasedCursor
+         cursor_field: date
+         cursor_datetime_formats:
+           - "%Y-%m-%d"
+         datetime_format: "%Y-%m-%d"
+         start_datetime:
+           type: MinMaxDatetime
+           datetime: "{{ config.get('start_date', '2021-01-01') }}"
+           datetime_format: "%Y-%m-%d"
+         end_datetime:
+           type: MinMaxDatetime
+           datetime: "{{ config.get('end_date', today_utc()) }}"
+           datetime_format: "%Y-%m-%d"
+         step: P3D
+         cursor_granularity: P1D
+       transformations:
+         - type: AddFields
+           fields:
+             - path:
+                 - site_url
+               value: "{{ stream_partition['site_url'] }}"
+             - path:
+                 - search_type
+               value: "{{ stream_partition['search_type'] }}"
+         - type: CustomTransformation
+           class_name: source_google_search_console.components.CustomReportExtractDimensionsFromKeys
+           dimensions: # This will be replaced by the dimensions of the custom report
+             - date
+             - country
+       schema_loader:
+         type: CustomSchemaLoader
+         class_name: source_google_search_console.components.CustomReportSchemaLoader
+         dimensions: [] # This will be replaced by the dimensions of the custom report
+       state_migrations:
+         - type: CustomStateMigration
+           class_name: source_google_search_console.components.NestedSubstreamStateMigration
+     components_resolver:
+       type: ConfigComponentsResolver
+       stream_config:
+         type: StreamConfig
+         configs_pointer:
+           - custom_reports_array
+       components_mapping:
+         - type: ComponentMappingDefinition
+           field_path:
+             # - "**" # is this needed
+             - name
+           value: "{{components_values['name']}}"
+         - type: ComponentMappingDefinition
+           field_path:
+             - primary_key
+           value: "{{ components_values['dimensions'] + (['date'] if 'date' not in components_values['dimensions'] else []) + ['site_url', 'search_type'] }}"
+         - type: ComponentMappingDefinition
+           field_path:
+             - retriever
+             - requester
+             - request_body_json
+             - dimensions
+           # `date` is a cursor field therefore should be a mandatory dimension if not already present
+           value: "{{ components_values['dimensions'] + (['date'] if 'date' not in components_values['dimensions'] else []) }}"
+         - type: ComponentMappingDefinition
+           field_path:
+             - transformations
+             - "1"
+             - dimensions
+           value: "{{ components_values['dimensions'] + (['date'] if 'date' not in components_values['dimensions'] else []) }}"
+         - type: ComponentMappingDefinition
+           field_path:
+             - schema_loader
+             - dimensions
+           value: "{{ components_values['dimensions'] + (['date'] if 'date' not in components_values['dimensions'] else []) }}"
+
  # Google Search Console has three layers of quotas that dictate rate limiting at the
  # user making requests, site being requested, and developer console key used.
  # https://developers.google.com/webmaster-tools/limits#qps-quota
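
The dynamic_streams template above is instantiated once per entry of the custom_reports_array config field via the ConfigComponentsResolver. The fragment below is an illustrative config (example values only, field names taken from the manifest and spec) together with the substitutions the components_mapping would apply for that entry:

    # Illustrative config fragment; values are examples.
    config = {
        "site_urls": ["https://www.example.com/"],
        "start_date": "2024-01-01",
        "custom_reports_array": [
            {"name": "by_country_and_device", "dimensions": ["country", "device"]},
        ],
    }
    # Per the components_mapping above, this entry yields one generated stream with:
    #   name                          -> "by_country_and_device"
    #   request_body_json.dimensions  -> ["country", "device", "date"]   (date appended as cursor field)
    #   transformations[1].dimensions -> ["country", "device", "date"]
    #   schema_loader.dimensions      -> ["country", "device", "date"]
    #   primary_key                   -> ["country", "device", "date", "site_url", "search_type"]
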
@@ -22,11 +22,16 @@ from source_google_search_console.exceptions import (
      UnidentifiedError,
  )
  from source_google_search_console.service_account_authenticator import ServiceAccountAuthenticator
- from source_google_search_console.streams import (
-     SearchAnalyticsByCustomDimensions,
- )


+ DIMENSION_TO_PROPERTY_SCHEMA_MAP = {
+     "country": [{"country": {"type": ["null", "string"]}}],
+     "date": [{"date": {"type": ["null", "string"], "format": "date"}}],
+     "device": [{"device": {"type": ["null", "string"]}}],
+     "page": [{"page": {"type": ["null", "string"]}}],
+     "query": [{"query": {"type": ["null", "string"]}}],
+ }
+
  custom_reports_schema = {
      "type": "array",
      "items": {
@@ -93,7 +98,7 @@ class SourceGoogleSearchConsole(YamlDeclarativeSource):
              jsonschema.validate(config["custom_reports_array"], custom_reports_schema)
              for report in config["custom_reports_array"]:
                  for dimension in report["dimensions"]:
-                     if dimension not in SearchAnalyticsByCustomDimensions.DIMENSION_TO_PROPERTY_SCHEMA_MAP:
+                     if dimension not in DIMENSION_TO_PROPERTY_SCHEMA_MAP:
                          message = f"dimension: '{dimension}' not found"
                          raise AirbyteTracedException(message=message, internal_message=message, failure_type=FailureType.config_error)
          return config
@@ -137,17 +142,7 @@ class SourceGoogleSearchConsole(YamlDeclarativeSource):
          config = self._validate_and_transform(config)
          stream_config = self.get_stream_kwargs(config)

-         streams = super().streams(config=config)
-
-         streams = streams + self.get_custom_reports(config=config, stream_config=stream_config)
-
-         return streams
-
-     def get_custom_reports(self, config: Mapping[str, Any], stream_config: Mapping[str, Any]) -> List[Optional[Stream]]:
-         return [
-             type(report["name"], (SearchAnalyticsByCustomDimensions,), {})(dimensions=report["dimensions"], **stream_config)
-             for report in config.get("custom_reports_array", [])
-         ]
+         return super().streams(config=config)

      def get_stream_kwargs(self, config: Mapping[str, Any]) -> Mapping[str, Any]:
          return {
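
With the stream classes gone, source.py keeps only the config validation; custom report streams are now built by the declarative framework through super().streams(config=config). A small sketch of the dimension check above (the report is an example, illustrative only):

    # Illustrative only: validating a report against the new module-level map.
    from source_google_search_console.source import DIMENSION_TO_PROPERTY_SCHEMA_MAP

    report = {"name": "bad_report", "dimensions": ["country", "search_appearance"]}
    for dimension in report["dimensions"]:
        if dimension not in DIMENSION_TO_PROPERTY_SCHEMA_MAP:
            # the connector raises AirbyteTracedException(failure_type=config_error) here
            print(f"dimension: '{dimension}' not found")  # -> dimension: 'search_appearance' not found
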
@@ -1,81 +0,0 @@
- #
- # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
- #
-
- from dataclasses import dataclass
- from typing import Any, Mapping, Optional
-
- from airbyte_cdk.sources.declarative.migrations.state_migration import StateMigration
-
-
- @dataclass
- class NestedSubstreamStateMigration(StateMigration):
-     """
-     We require a custom state migration because SearchAnalytics streams contain two nested levels of
-     substreams. The existing LegacyToPerPartitionStateMigration only handles one level.
-
-     Legacy state format is as follows:
-     {
-       "date": "2025-05-28",
-       "https://www.example.com/": {
-         "web": {
-           "date": "2025-05-25"
-         },
-         "news": {
-           "date": "2023-05-22"
-         }
-       }
-     }
-
-     The resulting migrated per-partition state is:
-     {
-       "use_global_cursor": false,
-       "states": [
-         {
-           "partition": {
-             "search_type": "web",
-             "site_url": "https://www.example.com/"
-           },
-           "cursor": {
-             "date": "2025-05-25"
-           }
-         },
-         {
-           "partition": {
-             "search_type": "news",
-             "site_url": "https://www.example.com/"
-           },
-           "cursor": {
-             "date": "2023-05-22"
-           }
-         }],
-       "state": {
-         "date": "2025-05-25"
-       }
-     }
-     """
-
-     def should_migrate(self, stream_state: Mapping[str, Any]) -> bool:
-         return len(stream_state) > 0 and "states" not in stream_state
-
-     def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]:
-         global_state: Optional[Mapping[str, Any]] = None
-         per_partition_state = []
-         for site_url_key, search_type_state in stream_state.items():
-             if site_url_key == "date":
-                 # The legacy state also contains a global cursor value under the `date` key which equates
-                 # to global state.
-                 #
-                 # However, the Python implementation does not appear to be implemented
-                 # correctly and simply saves the state of the last seen partition. Since I don't trust the
-                 # legacy value and in the current implementation global state is applied to partitions
-                 # without an existing value, I'm making a conscious choice to not migrate the global value.
-                 continue
-             else:
-                 site_url = site_url_key
-                 for search_type_key, cursor in search_type_state.items():
-                     per_partition_state.append({"partition": {"site_url": site_url, "search_type": search_type_key}, "cursor": cursor})
-         return {
-             "use_global_cursor": False,
-             "states": per_partition_state,
-         }
@@ -1,347 +0,0 @@
- #
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
- #
-
- from abc import ABC
- from enum import Enum
- from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Union
- from urllib.parse import quote_plus, unquote_plus
-
- import pendulum
- import requests
- from requests.auth import AuthBase
-
- from airbyte_cdk.models import SyncMode
- from airbyte_cdk.sources.streams import CheckpointMixin
- from airbyte_cdk.sources.streams.http import HttpStream
-
-
- BASE_URL = "https://www.googleapis.com/webmasters/v3/"
- ROW_LIMIT = 25000
-
-
- class QueryAggregationType(Enum):
-     auto = "auto"
-     by_page = "byPage"
-     by_property = "byProperty"
-
-
- class GoogleSearchConsole(HttpStream, ABC):
-     url_base = BASE_URL
-     data_field = ""
-     raise_on_http_errors = True
-
-     def __init__(
-         self,
-         authenticator: AuthBase,
-         site_urls: list,
-         start_date: str,
-         end_date: str,
-         data_state: str = "final",
-     ):
-         super().__init__(authenticator=authenticator)
-         self._site_urls = self.sanitize_urls_list(site_urls)
-         self._start_date = start_date
-         self._end_date = end_date
-         self._data_state = data_state
-
-     @staticmethod
-     def sanitize_urls_list(site_urls: list) -> List[str]:
-         return list(map(quote_plus, site_urls))
-
-     def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
-         return None
-
-     def stream_slices(
-         self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
-     ) -> Iterable[Optional[Mapping[str, Any]]]:
-         for site_url in self._site_urls:
-             yield {"site_url": site_url}
-
-     def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
-         if not self.data_field:
-             yield response.json()
-
-         else:
-             records = response.json().get(self.data_field) or []
-             for record in records:
-                 yield record
-
-     def should_retry(self, response: requests.Response) -> bool:
-         response_json = response.json()
-         if "error" in response_json:
-             error = response_json.get("error", {})
-             # handle the `HTTP-403` - insufficient permissions
-             if error.get("code", 0) == 403:
-                 self.logger.error(f"Stream {self.name}. {error.get('message')}. Skipping.")
-                 setattr(self, "raise_on_http_errors", False)
-                 return False
-             # handle the `HTTP-400` - Bad query params with `aggregationType`
-             if error.get("code", 0) == 400:
-                 self.logger.error(f"Stream `{self.name}`. {error.get('message')}. Trying with `aggregationType = auto` instead.")
-                 self.aggregation_type = QueryAggregationType.auto
-                 setattr(self, "raise_on_http_errors", False)
-         return response.status_code == 429 or 500 <= response.status_code < 600
-
-
- class SearchAnalytics(GoogleSearchConsole, CheckpointMixin, ABC):
-     """
-     API docs: https://developers.google.com/webmaster-tools/search-console-api-original/v3/searchanalytics
-     """
-
-     data_field = "rows"
-     aggregation_type = QueryAggregationType.auto
-     start_row = 0
-     dimensions = []
-     search_types = ["web", "news", "image", "video"]
-     range_of_days = 3
-
-     def __init__(self, authenticator: AuthBase, site_urls: list, start_date: str, end_date: str, data_state: str = "final", **kwargs):
-         super().__init__(authenticator=authenticator, site_urls=site_urls, start_date=start_date, end_date=end_date, data_state=data_state)
-         self._state = {}
-
-     def path(
-         self,
-         stream_state: Mapping[str, Any] = None,
-         stream_slice: Mapping[str, Any] = None,
-         next_page_token: Mapping[str, Any] = None,
-     ) -> str:
-         return f"sites/{stream_slice.get('site_url')}/searchAnalytics/query"
-
-     @property
-     def cursor_field(self) -> Union[str, List[str]]:
-         return "date"
-
-     @property
-     def http_method(self) -> str:
-         return "POST"
-
-     @property
-     def state(self) -> MutableMapping[str, Any]:
-         return self._state
-
-     @state.setter
-     def state(self, value: MutableMapping[str, Any]):
-         self._state = value
-
-     def stream_slices(
-         self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
-     ) -> Iterable[Optional[Mapping[str, Any]]]:
-         """
-         The `stream_slices` implements iterator functionality for `site_urls` and `searchType`. The user can pass many `site_url`,
-         and we have to process all of them, we can also pass the` searchType` parameter in the `request body` to get data using some`
-         searchType` value from [` web`, `news `,` image`, `video`, `discover`, `googleNews`].
-         It's just a double nested loop with a yield statement.
-         """
-
-         for site_url in self._site_urls:
-             for search_type in self.search_types:
-                 start_date = self._get_start_date(stream_state, site_url, search_type)
-                 end_date = self._get_end_date()
-
-                 if start_date > end_date:
-                     start_date = end_date
-
-                 next_start = start_date
-                 period = pendulum.Duration(days=self.range_of_days - 1)
-                 while next_start <= end_date:
-                     next_end = min(next_start + period, end_date)
-                     yield {
-                         "site_url": site_url,
-                         "search_type": search_type,
-                         "start_date": next_start.to_date_string(),
-                         "end_date": next_end.to_date_string(),
-                         "data_state": self._data_state,
-                     }
-                     # add 1 day for the next slice's start date not to duplicate data from previous slice's end date.
-                     next_start = next_end + pendulum.Duration(days=1)
-
-     def next_page_token(self, response: requests.Response) -> Optional[bool]:
-         """
-         The `next_page_token` implements pagination functionality. This method gets the response
-         and compares the number of records with the constant `ROW_LIMITS` (maximum value 25000),
-         and if they are equal, this means that we get the end of the` Page`, and we need to go further,
-         for this we simply increase the `startRow` parameter in request body by `ROW_LIMIT` value.
-         """
-
-         if len(response.json().get(self.data_field, [])) == ROW_LIMIT:
-             self.start_row += ROW_LIMIT
-             return True
-
-         self.start_row = 0
-
-     def request_headers(self, **kwargs) -> Mapping[str, Any]:
-         return {"Content-Type": "application/json"}
-
-     def request_body_json(
-         self,
-         stream_state: Mapping[str, Any] = None,
-         stream_slice: Mapping[str, Any] = None,
-         next_page_token: Mapping[str, Any] = None,
-     ) -> Optional[Union[Dict[str, Any], str]]:
-         """
-         Here is a description of the parameters and implementations of the request body:
-         1. The `startDate` is retrieved from the `_get_start_date`,
-         if` SyncMode = full_refresh` just use `start_date` from configuration, otherwise use `get_update_state`.
-         2. The `endDate` is retrieved from the `config.json`.
-         3. The `sizes` parameter is used to group the result by some dimension.
-         The following dimensions are available: `date`, `country`, `page`, `device`, `query`.
-         4. For the `type` check the paragraph stream_slices method.
-         Filter results to the following type ["web", "news", "image", "video", "discover", "googleNews"]
-         5. For the `startRow` and `rowLimit` check next_page_token method.
-         """
-
-         data = {
-             "startDate": stream_slice["start_date"],
-             "endDate": stream_slice["end_date"],
-             "dimensions": self.dimensions,
-             "type": stream_slice.get("search_type"),
-             "aggregationType": self.aggregation_type.value,
-             "startRow": self.start_row,
-             "rowLimit": ROW_LIMIT,
-             "dataState": stream_slice.get("data_state"),
-         }
-
-         return data
-
-     def _get_end_date(self) -> pendulum.date:
-         end_date = pendulum.parse(self._end_date).date()
-         # limit `end_date` value with current date
-         return min(end_date, pendulum.now().date())
-
-     def _get_start_date(self, stream_state: Mapping[str, Any] = None, site_url: str = None, search_type: str = None) -> pendulum.date:
-         start_date = pendulum.parse(self._start_date)
-
-         if start_date and stream_state:
-             if stream_state.get(unquote_plus(site_url), {}).get(search_type):
-                 stream_state_value = stream_state.get(unquote_plus(site_url), {}).get(search_type)
-
-                 start_date = max(
-                     pendulum.parse(stream_state_value[self.cursor_field]),
-                     start_date,
-                 )
-
-         return start_date.date()
-
-     def parse_response(
-         self,
-         response: requests.Response,
-         stream_state: Mapping[str, Any],
-         stream_slice: Mapping[str, Any] = None,
-         next_page_token: Mapping[str, Any] = None,
-     ) -> Iterable[Mapping]:
-         records = response.json().get(self.data_field) or []
-
-         for record in records:
-             record["site_url"] = unquote_plus(stream_slice.get("site_url"))
-             record["search_type"] = stream_slice.get("search_type")
-
-             for dimension in self.dimensions:
-                 record[dimension] = record["keys"].pop(0)
-
-             # remove unnecessary empty field
-             record.pop("keys")
-
-             yield record
-
-     def _get_updated_state(
-         self,
-         current_stream_state: MutableMapping[str, Any],
-         latest_record: Mapping[str, Any],
-     ) -> Mapping[str, Any]:
-         """
-         With the existing nested loop implementation, we have to store a `cursor_field` for each `site_url`
-         and `searchType`. This functionality is placed in `get_update_state`.
-
-         {
-           "stream": {
-             "https://domain1.com": {
-               "web": {"date": "2022-01-03"},
-               "news": {"date": "2022-01-03"},
-               "image": {"date": "2022-01-03"},
-               "video": {"date": "2022-01-03"}
-             },
-             "https://domain2.com": {
-               "web": {"date": "2022-01-03"},
-               "news": {"date": "2022-01-03"},
-               "image": {"date": "2022-01-03"},
-               "video": {"date": "2022-01-03"}
-             },
-             "date": "2022-01-03",
-           }
-         }
-         """
-
-         latest_benchmark = latest_record.get(self.cursor_field)
-
-         site_url = latest_record.get("site_url")
-         search_type = latest_record.get("search_type")
-
-         value = current_stream_state.get(site_url, {}).get(search_type, {}).get(self.cursor_field)
-         if value:
-             latest_benchmark = max(latest_benchmark, value)
-         current_stream_state.setdefault(site_url, {}).setdefault(search_type, {})[self.cursor_field] = latest_benchmark
-
-         # we need to get the max date over all searchTypes but the current acceptance test YAML format doesn't
-         # support that
-         current_stream_state[self.cursor_field] = current_stream_state[site_url][search_type][self.cursor_field]
-
-         return current_stream_state
-
-     def read_records(self, **kwargs) -> Iterable[Mapping[str, Any]]:
-         for record in super().read_records(**kwargs):
-             self.state = self._get_updated_state(self.state, record)
-             yield record
-
-
- class SearchAnalyticsByCustomDimensions(SearchAnalytics):
-     # `date` is a cursor field therefore should be mandatory
-     DEFAULT_DIMENSIONS = ["date"]
-     DIMENSION_TO_PROPERTY_SCHEMA_MAP = {
-         "country": [{"country": {"type": ["null", "string"]}}],
-         "date": [{"date": {"type": ["null", "string"], "format": "date"}}],
-         "device": [{"device": {"type": ["null", "string"]}}],
-         "page": [{"page": {"type": ["null", "string"]}}],
-         "query": [{"query": {"type": ["null", "string"]}}],
-     }
-
-     primary_key = None
-
-     def __init__(self, dimensions: List[str], *args, **kwargs):
-         super(SearchAnalyticsByCustomDimensions, self).__init__(*args, **kwargs)
-         self.dimensions = dimensions + [dimension for dimension in self.DEFAULT_DIMENSIONS if dimension not in dimensions]
-         # Assign the dimensions as PK for the custom report stream.
-         # Site URL and Search Type are included in the API call thus affect the resulting data.
-         # `site_url` is a required URL param for making API calls;
-         # `search_type` remains a query param for historical reasons, we do not want to remove it to not break existing connections.
-         self.primary_key = self.dimensions + ["site_url", "search_type"]
-
-     def get_json_schema(self) -> Mapping[str, Any]:
-         schema: Mapping[str, Any] = {
-             "$schema": "https://json-schema.org/draft-07/schema#",
-             "type": ["null", "object"],
-             "additionalProperties": True,
-             "properties": {
-                 # metrics
-                 "clicks": {"type": ["null", "integer"]},
-                 "ctr": {"type": ["null", "number"], "multipleOf": 1e-25},
-                 "impressions": {"type": ["null", "integer"]},
-                 "position": {"type": ["null", "number"], "multipleOf": 1e-25},
-                 # default fields
-                 "search_type": {"type": ["null", "string"]},
-                 "site_url": {"type": ["null", "string"]},
-             },
-         }
-
-         # dimensions
-         dimension_properties = self.dimension_to_property_schema()
-         schema["properties"].update(dimension_properties)
-         return schema
-
-     def dimension_to_property_schema(self) -> dict:
-         properties = {}
-         for dimension in sorted(self.dimensions):
-             fields = self.DIMENSION_TO_PROPERTY_SCHEMA_MAP[dimension]
-             for field in fields:
-                 properties = {**properties, **field}
-         return properties
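
The deleted stream_slices above generated 3-day windows per site_url and search_type; the declarative replacement expresses the same windowing with step: P3D and cursor_granularity: P1D in the manifest. An illustrative sketch of that window arithmetic (the dates are examples):

    # Illustrative sketch of the 3-day windowing the legacy stream_slices produced:
    # each window spans range_of_days days, and the next window starts one day
    # after the previous window's end so slices do not overlap.
    import pendulum

    start, end = pendulum.date(2025, 5, 20), pendulum.date(2025, 5, 27)
    period = pendulum.Duration(days=3 - 1)  # range_of_days - 1

    windows, next_start = [], start
    while next_start <= end:
        next_end = min(next_start + period, end)
        windows.append((next_start.to_date_string(), next_end.to_date_string()))
        next_start = next_end + pendulum.Duration(days=1)

    # windows == [("2025-05-20", "2025-05-22"), ("2025-05-23", "2025-05-25"), ("2025-05-26", "2025-05-27")]
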