airbyte-source-google-search-console 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,394 +0,0 @@
- #
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
- #
-
- from abc import ABC
- from enum import Enum
- from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Union
- from urllib.parse import quote_plus, unquote_plus
-
- import pendulum
- import requests
- from requests.auth import AuthBase
-
- from airbyte_cdk.models import SyncMode
- from airbyte_cdk.sources.streams import CheckpointMixin
- from airbyte_cdk.sources.streams.http import HttpStream
-
-
- BASE_URL = "https://www.googleapis.com/webmasters/v3/"
- ROW_LIMIT = 25000
-
-
- class QueryAggregationType(Enum):
-     auto = "auto"
-     by_page = "byPage"
-     by_property = "byProperty"
-
-
- class GoogleSearchConsole(HttpStream, ABC):
-     url_base = BASE_URL
-     data_field = ""
-     raise_on_http_errors = True
-
-     def __init__(
-         self,
-         authenticator: AuthBase,
-         site_urls: list,
-         start_date: str,
-         end_date: str,
-         data_state: str = "final",
-     ):
-         super().__init__(authenticator=authenticator)
-         self._site_urls = self.sanitize_urls_list(site_urls)
-         self._start_date = start_date
-         self._end_date = end_date
-         self._data_state = data_state
-
-     @staticmethod
-     def sanitize_urls_list(site_urls: list) -> List[str]:
-         return list(map(quote_plus, site_urls))
-
-     def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
-         return None
-
-     def stream_slices(
-         self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
-     ) -> Iterable[Optional[Mapping[str, Any]]]:
-         for site_url in self._site_urls:
-             yield {"site_url": site_url}
-
-     def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
-         if not self.data_field:
-             yield response.json()
-
-         else:
-             records = response.json().get(self.data_field) or []
-             for record in records:
-                 yield record
-
-     def should_retry(self, response: requests.Response) -> bool:
-         response_json = response.json()
-         if "error" in response_json:
-             error = response_json.get("error", {})
-             # handle the `HTTP-403` - insufficient permissions
-             if error.get("code", 0) == 403:
-                 self.logger.error(f"Stream {self.name}. {error.get('message')}. Skipping.")
-                 setattr(self, "raise_on_http_errors", False)
-                 return False
-             # handle the `HTTP-400` - Bad query params with `aggregationType`
-             if error.get("code", 0) == 400:
-                 self.logger.error(f"Stream `{self.name}`. {error.get('message')}. Trying with `aggregationType = auto` instead.")
-                 self.aggregation_type = QueryAggregationType.auto
-                 setattr(self, "raise_on_http_errors", False)
-         return response.status_code == 429 or 500 <= response.status_code < 600
-
-
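For illustration, a minimal sketch of how the `HTTP-403` branch above behaves; the fake response and its payload are made up, and `stream` stands for any instance of the class:

    import json

    import requests

    # Build a fake 403 response the way tests commonly do (illustrative payload only).
    response = requests.Response()
    response.status_code = 403
    response._content = json.dumps(
        {"error": {"code": 403, "message": "User does not have sufficient permission for site"}}
    ).encode("utf-8")

    # stream.should_retry(response) would log the message, set stream.raise_on_http_errors = False,
    # and return False, so the sync skips this site instead of raising.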
- class SearchAnalytics(GoogleSearchConsole, CheckpointMixin, ABC):
-     """
-     API docs: https://developers.google.com/webmaster-tools/search-console-api-original/v3/searchanalytics
-     """
-
-     data_field = "rows"
-     aggregation_type = QueryAggregationType.auto
-     start_row = 0
-     dimensions = []
-     search_types = ["web", "news", "image", "video"]
-     range_of_days = 3
-
-     def __init__(self, authenticator: AuthBase, site_urls: list, start_date: str, end_date: str, data_state: str = "final", **kwargs):
-         super().__init__(authenticator=authenticator, site_urls=site_urls, start_date=start_date, end_date=end_date, data_state=data_state)
-         self._state = {}
-
-     def path(
-         self,
-         stream_state: Mapping[str, Any] = None,
-         stream_slice: Mapping[str, Any] = None,
-         next_page_token: Mapping[str, Any] = None,
-     ) -> str:
-         return f"sites/{stream_slice.get('site_url')}/searchAnalytics/query"
-
-     @property
-     def cursor_field(self) -> Union[str, List[str]]:
-         return "date"
-
-     @property
-     def http_method(self) -> str:
-         return "POST"
-
-     @property
-     def state(self) -> MutableMapping[str, Any]:
-         return self._state
-
-     @state.setter
-     def state(self, value: MutableMapping[str, Any]):
-         self._state = value
-
-     def stream_slices(
-         self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
-     ) -> Iterable[Optional[Mapping[str, Any]]]:
-         """
-         The `stream_slices` method iterates over `site_urls` and `searchType`. The user can pass several
-         `site_url` values, and we have to process all of them. We also pass the `searchType` parameter in the
-         request body to get data for a given `searchType` value from [`web`, `news`, `image`, `video`,
-         `discover`, `googleNews`]. It is just a double nested loop with a yield statement.
-         """
-
-         for site_url in self._site_urls:
-             for search_type in self.search_types:
-                 start_date = self._get_start_date(stream_state, site_url, search_type)
-                 end_date = self._get_end_date()
-
-                 if start_date > end_date:
-                     start_date = end_date
-
-                 next_start = start_date
-                 period = pendulum.Duration(days=self.range_of_days - 1)
-                 while next_start <= end_date:
-                     next_end = min(next_start + period, end_date)
-                     yield {
-                         "site_url": site_url,
-                         "search_type": search_type,
-                         "start_date": next_start.to_date_string(),
-                         "end_date": next_end.to_date_string(),
-                         "data_state": self._data_state,
-                     }
-                     # advance by 1 day so the next slice's start date does not duplicate the previous slice's end date.
-                     next_start = next_end + pendulum.Duration(days=1)
-
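For illustration, a standalone sketch of the 3-day windowing produced by the loop above for a single site and search type; the dates are made up:

    import pendulum

    # range_of_days = 3 means each window spans 3 calendar days (period = 2 days).
    start = pendulum.parse("2024-01-01").date()
    end = pendulum.parse("2024-01-07").date()
    period = pendulum.Duration(days=2)

    next_start = start
    while next_start <= end:
        next_end = min(next_start + period, end)
        print(next_start.to_date_string(), "->", next_end.to_date_string())
        next_start = next_end + pendulum.Duration(days=1)

    # 2024-01-01 -> 2024-01-03
    # 2024-01-04 -> 2024-01-06
    # 2024-01-07 -> 2024-01-07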
-     def next_page_token(self, response: requests.Response) -> Optional[bool]:
-         """
-         The `next_page_token` method implements pagination. It takes the response and compares the number of
-         records with the `ROW_LIMIT` constant (maximum value 25000). If they are equal, we have reached the end
-         of the page and need to go further, so we simply increase the `startRow` parameter in the request body
-         by the `ROW_LIMIT` value.
-         """
-
-         if len(response.json().get(self.data_field, [])) == ROW_LIMIT:
-             self.start_row += ROW_LIMIT
-             return True
-
-         self.start_row = 0
-
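A standalone sketch of the same `startRow` / `rowLimit` pagination; `fetch_all_rows`, `session`, `query_url`, and `payload` are illustrative names, not part of the connector:

    import requests

    ROW_LIMIT = 25000

    def fetch_all_rows(session: requests.Session, query_url: str, payload: dict) -> list:
        rows, start_row = [], 0
        while True:
            page = session.post(query_url, json={**payload, "startRow": start_row, "rowLimit": ROW_LIMIT}).json()
            batch = page.get("rows", [])
            rows.extend(batch)
            # a page shorter than ROW_LIMIT means there is nothing left to fetch
            if len(batch) < ROW_LIMIT:
                return rows
            start_row += ROW_LIMIT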
-     def request_headers(self, **kwargs) -> Mapping[str, Any]:
-         return {"Content-Type": "application/json"}
-
-     def request_body_json(
-         self,
-         stream_state: Mapping[str, Any] = None,
-         stream_slice: Mapping[str, Any] = None,
-         next_page_token: Mapping[str, Any] = None,
-     ) -> Optional[Union[Dict[str, Any], str]]:
-         """
-         Description of the request body parameters:
-         1. `startDate` is retrieved from `_get_start_date`: with `SyncMode = full_refresh` the `start_date` from
-            the configuration is used, otherwise the cursor value kept in the stream state is used.
-         2. `endDate` is retrieved from the `config.json`.
-         3. `dimensions` is used to group the result by one or more dimensions.
-            The following dimensions are available: `date`, `country`, `page`, `device`, `query`.
-         4. For `type`, see the `stream_slices` method.
-            It filters results to one of the types ["web", "news", "image", "video", "discover", "googleNews"].
-         5. For `startRow` and `rowLimit`, see the `next_page_token` method.
-         """
-
-         data = {
-             "startDate": stream_slice["start_date"],
-             "endDate": stream_slice["end_date"],
-             "dimensions": self.dimensions,
-             "type": stream_slice.get("search_type"),
-             "aggregationType": self.aggregation_type.value,
-             "startRow": self.start_row,
-             "rowLimit": ROW_LIMIT,
-             "dataState": stream_slice.get("data_state"),
-         }
-
-         return data
-
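As an illustration, the body built above for the first page of one slice of the by-date stream (`dimensions=["date"]`, default aggregation); the dates are made up:

    # Illustrative only: one request body for SearchAnalyticsByDate.
    example_body = {
        "startDate": "2024-01-01",
        "endDate": "2024-01-03",
        "dimensions": ["date"],
        "type": "web",
        "aggregationType": "auto",
        "startRow": 0,
        "rowLimit": 25000,
        "dataState": "final",
    }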
-     def _get_end_date(self) -> pendulum.date:
-         end_date = pendulum.parse(self._end_date).date()
-         # limit the `end_date` value to the current date
-         return min(end_date, pendulum.now().date())
-
-     def _get_start_date(self, stream_state: Mapping[str, Any] = None, site_url: str = None, search_type: str = None) -> pendulum.date:
-         start_date = pendulum.parse(self._start_date)
-
-         if start_date and stream_state:
-             if stream_state.get(unquote_plus(site_url), {}).get(search_type):
-                 stream_state_value = stream_state.get(unquote_plus(site_url), {}).get(search_type)
-
-                 start_date = max(
-                     pendulum.parse(stream_state_value[self.cursor_field]),
-                     start_date,
-                 )
-
-         return start_date.date()
-
-     def parse_response(
-         self,
-         response: requests.Response,
-         stream_state: Mapping[str, Any],
-         stream_slice: Mapping[str, Any] = None,
-         next_page_token: Mapping[str, Any] = None,
-     ) -> Iterable[Mapping]:
-         records = response.json().get(self.data_field) or []
-
-         for record in records:
-             record["site_url"] = unquote_plus(stream_slice.get("site_url"))
-             record["search_type"] = stream_slice.get("search_type")
-
-             for dimension in self.dimensions:
-                 record[dimension] = record["keys"].pop(0)
-
-             # remove the now-empty `keys` field
-             record.pop("keys")
-
-             yield record
-
-     def _get_updated_state(
-         self,
-         current_stream_state: MutableMapping[str, Any],
-         latest_record: Mapping[str, Any],
-     ) -> Mapping[str, Any]:
-         """
-         With the existing nested loop implementation, we have to store a `cursor_field` for each `site_url`
-         and `searchType`. This functionality is implemented in `_get_updated_state`:
-
-         {
-             "stream": {
-                 "https://domain1.com": {
-                     "web": {"date": "2022-01-03"},
-                     "news": {"date": "2022-01-03"},
-                     "image": {"date": "2022-01-03"},
-                     "video": {"date": "2022-01-03"}
-                 },
-                 "https://domain2.com": {
-                     "web": {"date": "2022-01-03"},
-                     "news": {"date": "2022-01-03"},
-                     "image": {"date": "2022-01-03"},
-                     "video": {"date": "2022-01-03"}
-                 },
-                 "date": "2022-01-03",
-             }
-         }
-         """
-
-         latest_benchmark = latest_record.get(self.cursor_field)
-
-         site_url = latest_record.get("site_url")
-         search_type = latest_record.get("search_type")
-
-         value = current_stream_state.get(site_url, {}).get(search_type, {}).get(self.cursor_field)
-         if value:
-             latest_benchmark = max(latest_benchmark, value)
-         current_stream_state.setdefault(site_url, {}).setdefault(search_type, {})[self.cursor_field] = latest_benchmark
-
-         # we need the max date over all searchTypes, but the current acceptance test YAML format doesn't support that
-         current_stream_state[self.cursor_field] = current_stream_state[site_url][search_type][self.cursor_field]
-
-         return current_stream_state
-
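A standalone sketch of the per-site, per-searchType cursor bookkeeping performed above; the record values are made up:

    state: dict = {}

    record = {"site_url": "https://example.com/", "search_type": "web", "date": "2024-01-03"}

    cursor = max(
        record["date"],
        state.get(record["site_url"], {}).get(record["search_type"], {}).get("date", record["date"]),
    )
    state.setdefault(record["site_url"], {}).setdefault(record["search_type"], {})["date"] = cursor
    state["date"] = cursor

    print(state)
    # {'https://example.com/': {'web': {'date': '2024-01-03'}}, 'date': '2024-01-03'}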
-     def read_records(self, **kwargs) -> Iterable[Mapping[str, Any]]:
-         for record in super().read_records(**kwargs):
-             self.state = self._get_updated_state(self.state, record)
-             yield record
-
-
- class SearchAnalyticsByDate(SearchAnalytics):
-     primary_key = ["site_url", "date", "search_type"]
-     search_types = ["web", "news", "image", "video", "discover", "googleNews"]
-     dimensions = ["date"]
-
-
- class SearchAnalyticsByDevice(SearchAnalytics):
-     primary_key = ["site_url", "date", "device", "search_type"]
-     search_types = ["web", "news", "image", "video", "googleNews"]
-     dimensions = ["date", "device"]
-
-
- class SearchAnalyticsByPage(SearchAnalytics):
-     primary_key = ["site_url", "date", "page", "search_type"]
-     search_types = ["web", "news", "image", "video", "discover", "googleNews"]
-     dimensions = ["date", "page"]
-
-
- class SearchAnalyticsByQuery(SearchAnalytics):
-     primary_key = ["site_url", "date", "query", "search_type"]
-     dimensions = ["date", "query"]
-
-
- class SearchAnalyticsAllFields(SearchAnalytics):
-     primary_key = ["site_url", "date", "country", "device", "query", "page", "search_type"]
-     dimensions = ["date", "country", "device", "page", "query"]
-
-
- class SearchAnalyticsSiteReportBySite(SearchAnalytics):
-     primary_key = ["site_url", "date", "country", "device", "search_type"]
-     dimensions = ["date", "country", "device"]
-     aggregation_type = QueryAggregationType.by_property
-
-
- class SearchAnalyticsSiteReportByPage(SearchAnalytics):
-     primary_key = ["site_url", "date", "country", "device", "search_type"]
-     search_types = ["web", "news", "image", "video", "googleNews"]
-     dimensions = ["date", "country", "device"]
-     aggregation_type = QueryAggregationType.by_page
-
-
- class SearchAnalyticsPageReport(SearchAnalytics):
-     primary_key = ["site_url", "date", "country", "device", "search_type", "page"]
-     search_types = ["web", "news", "image", "video", "googleNews"]
-     dimensions = ["date", "country", "device", "page"]
-
-
- class SearchAnalyticsByCustomDimensions(SearchAnalytics):
-     # `date` is the cursor field, therefore it must always be included
-     DEFAULT_DIMENSIONS = ["date"]
-     DIMENSION_TO_PROPERTY_SCHEMA_MAP = {
-         "country": [{"country": {"type": ["null", "string"]}}],
-         "date": [{"date": {"type": ["null", "string"], "format": "date"}}],
-         "device": [{"device": {"type": ["null", "string"]}}],
-         "page": [{"page": {"type": ["null", "string"]}}],
-         "query": [{"query": {"type": ["null", "string"]}}],
-     }
-
-     primary_key = None
-
-     def __init__(self, dimensions: List[str], *args, **kwargs):
-         super(SearchAnalyticsByCustomDimensions, self).__init__(*args, **kwargs)
-         self.dimensions = dimensions + [dimension for dimension in self.DEFAULT_DIMENSIONS if dimension not in dimensions]
-         # Assign the dimensions as the PK for the custom report stream.
-         # Site URL and Search Type are included in the API call and thus affect the resulting data.
-         # `site_url` is a required URL param for making API calls;
-         # `search_type` remains a query param for historical reasons; we do not remove it so as not to break existing connections.
-         self.primary_key = self.dimensions + ["site_url", "search_type"]
-
-     def get_json_schema(self) -> Mapping[str, Any]:
-         schema: Mapping[str, Any] = {
-             "$schema": "https://json-schema.org/draft-07/schema#",
-             "type": ["null", "object"],
-             "additionalProperties": True,
-             "properties": {
-                 # metrics
-                 "clicks": {"type": ["null", "integer"]},
-                 "ctr": {"type": ["null", "number"], "multipleOf": 1e-25},
-                 "impressions": {"type": ["null", "integer"]},
-                 "position": {"type": ["null", "number"], "multipleOf": 1e-25},
-                 # default fields
-                 "search_type": {"type": ["null", "string"]},
-                 "site_url": {"type": ["null", "string"]},
-             },
-         }
-
-         # dimensions
-         dimension_properties = self.dimension_to_property_schema()
-         schema["properties"].update(dimension_properties)
-         return schema
-
-     def dimension_to_property_schema(self) -> dict:
-         properties = {}
-         for dimension in sorted(self.dimensions):
-             fields = self.DIMENSION_TO_PROPERTY_SCHEMA_MAP[dimension]
-             for field in fields:
-                 properties = {**properties, **field}
-         return properties
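For illustration (the chosen dimensions are made up), a custom-dimensions stream created with `dimensions=["query", "page"]` ends up with `self.dimensions = ["query", "page", "date"]` and `primary_key = ["query", "page", "date", "site_url", "search_type"]`, and `dimension_to_property_schema` merges these properties into the JSON schema:

    # Illustrative only: the result of dimension_to_property_schema() for
    # dimensions ["query", "page", "date"] (iteration is over the sorted names).
    dimension_properties = {
        "date": {"type": ["null", "string"], "format": "date"},
        "page": {"type": ["null", "string"]},
        "query": {"type": ["null", "string"]},
    }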