fraudcrawler 0.4.6__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,664 @@
+ from abc import ABC, abstractmethod
+ from enum import Enum
+ import logging
+ from typing import Dict, List
+ from urllib.parse import quote_plus
+
+ from bs4 import BeautifulSoup
+ import httpx
+ from pydantic import BaseModel
+ from tenacity import RetryCallState
+
+ from fraudcrawler.settings import SEARCH_DEFAULT_COUNTRY_CODES
+ from fraudcrawler.base.base import Host, Language, Location, DomainUtils
+ from fraudcrawler.base.retry import get_async_retry
+
+ logger = logging.getLogger(__name__)
+
+
+ class SearchResult(BaseModel):
+     """Model for a single search result."""
+
+     url: str
+     domain: str
+     search_engine_name: str
+     filtered: bool = False
+     filtered_at_stage: str | None = None
+
+
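For orientation, a result that survives all downstream filters keeps the two filter fields at their defaults; a minimal construction might look like this (illustrative values):

result = SearchResult(
    url="https://www.example.ch/product/123",
    domain="example.ch",
    search_engine_name="google",
)
assert result.filtered is False and result.filtered_at_stage is None
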
+ class SearchEngineName(Enum):
+     """Enum for search engine names."""
+
+     GOOGLE = "google"
+     GOOGLE_SHOPPING = "google_shopping"
+     TOPPREISE = "toppreise"
+
+
+ class SearchEngine(ABC, DomainUtils):
+     """Abstract base class for search engines."""
+
+     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
+
+     @property
+     @abstractmethod
+     def _search_engine_name(self) -> str:
+         """The name of the search engine."""
+         pass
+
+     @abstractmethod
+     async def search(self, *args, **kwargs) -> List[SearchResult]:
+         """Apply the search with the given parameters and return results."""
+         pass
+
+     @classmethod
+     def _log_before(
+         cls, search_string: str, retry_state: RetryCallState | None
+     ) -> None:
+         """Context-aware logging before the request is made."""
+         if retry_state:
+             logger.debug(
+                 f'Performing search in {cls.__name__} with q="{search_string}" '
+                 f"(attempt {retry_state.attempt_number})."
+             )
+         else:
+             logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+     @classmethod
+     def _log_before_sleep(
+         cls, search_string: str, retry_state: RetryCallState | None
+     ) -> None:
+         """Context-aware logging before sleeping after a failed request."""
+         if retry_state and retry_state.outcome:
+             logger.warning(
+                 f'Attempt {retry_state.attempt_number} of {cls.__name__} search with q="{search_string}" '
+                 f"failed with error: {retry_state.outcome.exception()}. "
+                 f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+             )
+         else:
+             logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
+
+     def _create_search_result(self, url: str) -> SearchResult:
+         """Creates a :class:`SearchResult` instance from the given URL."""
+         # Extract the domain from the URL
+         domain = self._get_domain(url=url)
+
+         # Create and return the SearchResult object
+         result = SearchResult(
+             url=url,
+             domain=domain,
+             search_engine_name=self._search_engine_name,
+         )
+         return result
+
+
+ class SerpAPI(SearchEngine):
+     """Base class for SerpAPI search engines."""
+
+     _endpoint = "https://serpapi.com/search"
+
+     def __init__(self, http_client: httpx.AsyncClient, api_key: str):
+         """Initializes the SerpAPI client with the given API key.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+             api_key: The API key for SerpAPI.
+         """
+         self._http_client = http_client
+         self._api_key = api_key
+
+     @property
+     @abstractmethod
+     def _engine(self) -> str:
+         """The search engine name used in the SerpAPI request."""
+         pass
+
+     @staticmethod
+     @abstractmethod
+     def _extract_search_results_urls(data: dict) -> List[str]:
+         """Extracts search results URLs from the response.
+
+         Args:
+             data: The JSON from the SerpAPI search response.
+         """
+         pass
+
+     @staticmethod
+     def _get_search_string(search_term: str, marketplaces: List[Host] | None) -> str:
+         """Constructs the search string with site: parameters for marketplaces."""
+         search_string = search_term
+         if marketplaces:
+             sites = [dom for host in marketplaces for dom in host.domains]
+             search_string += " site:" + " OR site:".join(sites)
+         return search_string
+
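As an illustration (hypothetical domains), a search term combined with the flattened domains of the marketplace hosts yields a single query string with chained site: operators:

search_term = "vitamin c"
sites = ["galaxus.ch", "digitec.ch"]  # flattened from the marketplaces' Host.domains
search_string = search_term + " site:" + " OR site:".join(sites)
print(search_string)  # vitamin c site:galaxus.ch OR site:digitec.ch
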
+     async def _search(
+         self,
+         search_string: str,
+         language: Language,
+         location: Location,
+         num_results: int,
+     ) -> List[str]:
+         """Performs a search using SerpAPI and returns the URLs of the results.
+
+         Args:
+             search_string: The search string to use (with potentially added site: parameters).
+             language: The language to use for the query ('hl' parameter).
+             location: The location to use for the query ('gl' parameter).
+             num_results: Max number of results to return.
+
+         The SerpAPI parameters are:
+             engine: The search engine to use ('google', 'google_shopping', etc.).
+             q: The search string (with potentially added site: parameters).
+             google_domain: The Google domain to use for the search (e.g. 'google.ch').
+             location_[requested|used]: The location to use for the search.
+             tbs: The to-be-searched parameters (e.g. 'ctr:CH').
+             cr: The country code to limit the search to (e.g. 'countryCH').
+             gl: The country code to use for the search.
+             hl: The language code to use for the search.
+             num: The number of results to return.
+             api_key: The API key to use for the search.
+         """
+         engine = self._engine
+
+         # Log the search parameters
+         logger.debug(
+             f'Performing SerpAPI search with engine="{engine}", '
+             f'q="{search_string}", '
+             f'location="{location.name}", '
+             f'language="{language.code}", '
+             f"num_results={num_results}."
+         )
+
+         # Setup the parameters
+         params: Dict[str, str | int] = {
+             "engine": engine,
+             "q": search_string,
+             "google_domain": f"google.{location.code}",
+             "location_requested": location.name,
+             "location_used": location.name,
+             "tbs": f"ctr:{location.code.upper()}",
+             "cr": f"country{location.code.upper()}",
+             "gl": location.code,
+             "hl": language.code,
+             "num": num_results,
+             "api_key": self._api_key,
+         }
+         logger.debug(f"SerpAPI search with params: {params}")
+
+         # Perform the request and retry if necessary. There is some context-aware logging:
+         # - `before`: before the request is made (and before retrying)
+         # - `before_sleep`: if the request fails, before sleeping
+         retry = get_async_retry()
+         retry.before = lambda retry_state: self._log_before(
+             search_string=search_string, retry_state=retry_state
+         )
+         retry.before_sleep = lambda retry_state: self._log_before_sleep(
+             search_string=search_string, retry_state=retry_state
+         )
+         async for attempt in retry:
+             with attempt:
+                 response = await self._http_client.get(
+                     url=self._endpoint, params=params
+                 )
+                 response.raise_for_status()
+
+         # Extract the URLs from the response
+         data = response.json()
+         urls = self._extract_search_results_urls(data=data)
+
+         logger.debug(
+             f'Found a total of {len(urls)} URLs from SerpAPI search for q="{search_string}" and engine="{engine}".'
+         )
+         return urls
+
+
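`get_async_retry` comes from fraudcrawler.base.retry and is not shown in this diff; from the way it is used (`async for attempt in retry`, assignable `before`/`before_sleep` hooks), it presumably returns a tenacity `AsyncRetrying` instance. A minimal sketch of such a factory, with illustrative parameters rather than the package's actual configuration:

from tenacity import AsyncRetrying, stop_after_attempt, wait_exponential

def get_async_retry() -> AsyncRetrying:
    # Hypothetical configuration; the real one lives in fraudcrawler.base.retry.
    return AsyncRetrying(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, max=30),
        reraise=True,
    )

Assigning the `before`/`before_sleep` callbacks after construction, as the module does, lets each call site inject its own search string into the retry logs.
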
+ class SerpAPIGoogle(SerpAPI):
+     """Search engine for Google in SerpAPI."""
+
+     def __init__(self, http_client: httpx.AsyncClient, api_key: str):
+         """Initializes the SerpAPIGoogle client with the given API key.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+             api_key: The API key for SerpAPI.
+         """
+         super().__init__(http_client=http_client, api_key=api_key)
+
+     @property
+     def _search_engine_name(self) -> str:
+         """The name of the search engine."""
+         return SearchEngineName.GOOGLE.value
+
+     @property
+     def _engine(self) -> str:
+         """The search engine name used in the SerpAPI request."""
+         return "google"
+
+     @staticmethod
+     def _extract_search_results_urls(data: dict) -> List[str]:
+         """Extracts search results URLs from the response data.
+
+         Args:
+             data: The JSON data from the SerpAPI search response.
+         """
+         results = data.get("organic_results")
+         if results is not None:
+             return [url for res in results if (url := res.get("link"))]
+         return []
+
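For reference, a trimmed-down response of the kind this method consumes: SerpAPI returns organic results under `organic_results`, each with a `link`, and entries without a link are skipped by the walrus filter.

data = {
    "organic_results": [
        {"position": 1, "link": "https://www.example.ch/p/1"},
        {"position": 2},  # no "link": dropped by the walrus expression
    ]
}
assert SerpAPIGoogle._extract_search_results_urls(data) == ["https://www.example.ch/p/1"]
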
+     async def search(
+         self,
+         search_term: str,
+         language: Language,
+         location: Location,
+         num_results: int,
+         marketplaces: List[Host] | None = None,
+     ) -> List[SearchResult]:
+         """Performs a Google search using SerpAPI and returns SearchResults.
+
+         Args:
+             search_term: The search term to use for the query.
+             language: The language to use for the query ('hl' parameter).
+             location: The location to use for the query ('gl' parameter).
+             num_results: Max number of results to return.
+             marketplaces: The marketplaces to include in the search.
+         """
+         # Construct the search string
+         search_string = self._get_search_string(
+             search_term=search_term,
+             marketplaces=marketplaces,
+         )
+
+         # Perform the search
+         urls = await self._search(
+             search_string=search_string,
+             language=language,
+             location=location,
+             num_results=num_results,
+         )
+
+         # Create and return SearchResult objects from the URLs
+         results = [self._create_search_result(url=url) for url in urls]
+         logger.debug(
+             f'Produced {len(results)} results from SerpAPI with engine="{self._engine}" and q="{search_string}".'
+         )
+         return results
+
+
+ class SerpAPIGoogleShopping(SerpAPI):
+     """Search engine for Google Shopping in SerpAPI."""
+
+     def __init__(self, http_client: httpx.AsyncClient, api_key: str):
+         """Initializes the SerpAPIGoogleShopping client with the given API key.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+             api_key: The API key for SerpAPI.
+         """
+         super().__init__(http_client=http_client, api_key=api_key)
+
+     @property
+     def _search_engine_name(self) -> str:
+         """The name of the search engine."""
+         return SearchEngineName.GOOGLE_SHOPPING.value
+
+     @property
+     def _engine(self) -> str:
+         """The search engine name used in the SerpAPI request."""
+         return "google_shopping"
+
+     @staticmethod
+     def _extract_search_results_urls(data: dict) -> List[str]:
+         """Extracts search results URLs from the response data.
+
+         Args:
+             data: The JSON data from the SerpAPI search response.
+         """
+         results = data.get("shopping_results")
+         if results is not None:
+             return [url for res in results if (url := res.get("product_link"))]
+         return []
+
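The shopping variant reads the same way, only from `shopping_results`/`product_link` (illustrative payload):

data = {"shopping_results": [{"title": "Vitamin C 500mg", "product_link": "https://www.example.ch/p/2"}]}
assert SerpAPIGoogleShopping._extract_search_results_urls(data) == ["https://www.example.ch/p/2"]
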
+     async def search(
+         self,
+         search_term: str,
+         language: Language,
+         location: Location,
+         num_results: int,
+         marketplaces: List[Host] | None = None,
+     ) -> List[SearchResult]:
+         """Performs a Google Shopping search using SerpAPI and returns SearchResults.
+
+         Args:
+             search_term: The search term to use for the query.
+             language: The language to use for the query ('hl' parameter).
+             location: The location to use for the query ('gl' parameter).
+             num_results: Max number of results to return.
+             marketplaces: The marketplaces to include in the search.
+         """
+         # Construct the search string
+         search_string = self._get_search_string(
+             search_term=search_term,
+             marketplaces=marketplaces,
+         )
+
+         # Perform the search
+         urls = await self._search(
+             search_string=search_string,
+             language=language,
+             location=location,
+             num_results=num_results,
+         )
+
+         # NOTE: Google Shopping does not properly support the 'num' parameter,
+         # so we might get more results than requested. This is a known issue with SerpAPI
+         # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858),
+         # hence the explicit truncation below.
+         urls = urls[:num_results]
+
+         # Create and return SearchResult objects from the URLs
+         results = [self._create_search_result(url=url) for url in urls]
+         logger.debug(
+             f'Produced {len(results)} results from SerpAPI with engine="{self._engine}" and q="{search_string}".'
+         )
+         return results
+
+
+ class Toppreise(SearchEngine):
+     """Search engine for toppreise.ch."""
+
+     _endpoint = "https://www.toppreise.ch/produktsuche"
+     _headers = {
+         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+         "Accept-Language": "en-US,en;q=0.5",
+         "Accept-Encoding": "gzip, deflate",
+         "Connection": "keep-alive",
+         "Upgrade-Insecure-Requests": "1",
+     }
+
+     def __init__(self, http_client: httpx.AsyncClient):
+         """Initializes the Toppreise client.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+         """
+         self._http_client = http_client
+
+     @property
+     def _search_engine_name(self) -> str:
+         """The name of the search engine."""
+         return SearchEngineName.TOPPREISE.value
+
+     @staticmethod
+     def _get_external_product_urls(content: bytes) -> List[str]:
+         """Extracts external product URLs from the Toppreise search results page."""
+
+         # Parse the HTML
+         soup = BeautifulSoup(content, "html.parser")
+         links = soup.find_all("a", href=True)
+
+         # Filter links to only include external product links
+         hrefs = [
+             href
+             for link in links
+             if (
+                 hasattr(link, "get")  # Ensure we have a Tag object with a get method
+                 and (href := link.get("href"))  # Ensure href is present and non-empty
+                 and isinstance(href, str)  # Ensure href is a string before calling str methods
+                 and not href.startswith("javascript:")  # Skip javascript links
+                 and "ext_" in href  # Keep only external product links
+             )
+         ]
+
+         # Make relative URLs absolute
+         urls = []
+         for href in hrefs:
+             if href.startswith("/"):
+                 href = f"https://www.toppreise.ch{href}"
+             elif not href.startswith("http"):
+                 href = f"https://www.toppreise.ch/{href}"
+             urls.append(href)
+
+         # Return deduplicated URLs
+         urls = list(set(urls))
+         logger.debug(
+             f"Found {len(urls)} external product URLs from Toppreise search results."
+         )
+         return urls
+
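A toy page (hypothetical markup, since the real Toppreise HTML is not part of this diff) shows which anchors survive the filter:

html = b"""
<a href="/ext_redirect?id=1">Shop A</a>
<a href="ext_redirect?id=2">Shop B</a>
<a href="/produkt/123">Internal product page</a>
<a href="javascript:void(0)">Widget</a>
"""
urls = Toppreise._get_external_product_urls(content=html)
# Only the two 'ext_' links are kept and made absolute (order varies after dedup):
# sorted(urls) == ['https://www.toppreise.ch/ext_redirect?id=1',
#                  'https://www.toppreise.ch/ext_redirect?id=2']
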
+     async def _search(self, search_string: str, num_results: int) -> List[str]:
+         """Performs a search on Toppreise and returns the URLs of the results.
+
+         Args:
+             search_string: The search string to use for the query.
+             num_results: Max number of results to return.
+         """
+         # Build the search URL for Toppreise
+         encoded_search = quote_plus(search_string)
+         url = f"{self._endpoint}?q={encoded_search}"
+         logger.debug(f"Toppreise search URL: {url}")
+
+         # Perform the request and retry if necessary. There is some context-aware logging:
+         # - `before`: before the request is made (and before retrying)
+         # - `before_sleep`: if the request fails, before sleeping
+         retry = get_async_retry()
+         retry.before = lambda retry_state: self._log_before(
+             search_string=search_string, retry_state=retry_state
+         )
+         retry.before_sleep = lambda retry_state: self._log_before_sleep(
+             search_string=search_string, retry_state=retry_state
+         )
+         async for attempt in retry:
+             with attempt:
+                 response = await self._http_client.get(
+                     url=url,
+                     headers=self._headers,
+                 )
+                 response.raise_for_status()
+
+         # Get external product URLs from the content
+         content = response.content
+         urls = self._get_external_product_urls(content=content)
+         urls = urls[:num_results]  # Limit to num_results if needed
+
+         return urls
+
+     async def search(
+         self,
+         search_term: str,
+         num_results: int,
+     ) -> List[SearchResult]:
+         """Performs a Toppreise search and returns SearchResults.
+
+         Args:
+             search_term: The search term to use for the query.
+             num_results: Max number of results to return.
+         """
+         # Perform the search
+         urls = await self._search(
+             search_string=search_term,
+             num_results=num_results,
+         )
+
+         # Create and return SearchResult objects from the URLs
+         results = [self._create_search_result(url=url) for url in urls]
+         logger.debug(
+             f'Produced {len(results)} results from Toppreise search with q="{search_term}".'
+         )
+         return results
+
+
+ class Search(DomainUtils):
+     """Class to perform searches using different search engines."""
+
+     def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str):
+         """Initializes the Search class with the given SerpAPI key.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+             serpapi_key: The API key for SerpAPI.
+         """
+         self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
+         self._google_shopping = SerpAPIGoogleShopping(
+             http_client=http_client, api_key=serpapi_key
+         )
+         self._toppreise = Toppreise(http_client=http_client)
+
+     @staticmethod
+     def _domain_in_host(domain: str, host: Host) -> bool:
+         """Checks if the domain is present in the host.
+
+         Note:
+             By checking `if domain == hst_dom or domain.endswith(f".{hst_dom}")`
+             it also matches subdomains. For example, if the domain is
+             `link.springer.com` and the host domain is `springer.com`,
+             it will be detected as being present in the host.
+
+         Args:
+             domain: The domain to check.
+             host: The host to check against.
+         """
+         return any(
+             domain == hst_dom or domain.endswith(f".{hst_dom}")
+             for hst_dom in host.domains
+         )
+
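The suffix check deliberately requires a leading dot, so a lookalike domain does not match. The bare predicate, demonstrated on its own:

domain, hst_dom = "link.springer.com", "springer.com"
assert domain == hst_dom or domain.endswith(f".{hst_dom}")  # subdomain: matches
domain = "notspringer.com"
assert not (domain == hst_dom or domain.endswith(f".{hst_dom}"))  # lookalike: no match
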
+     def _domain_in_hosts(self, domain: str, hosts: List[Host]) -> bool:
+         """Checks if the domain is present in the list of hosts.
+
+         Args:
+             domain: The domain to check.
+             hosts: The list of hosts to check against.
+         """
+         return any(self._domain_in_host(domain=domain, host=hst) for hst in hosts)
+
+     @staticmethod
+     def _relevant_country_code(url: str, country_code: str) -> bool:
+         """Determines whether the URL contains a relevant country code.
+
+         Args:
+             url: The URL to investigate.
+             country_code: The country code used to filter the products.
+         """
+         url = url.lower()
+         country_code_relevance = f".{country_code}" in url
+         default_relevance = any(cc in url for cc in SEARCH_DEFAULT_COUNTRY_CODES)
+         return country_code_relevance or default_relevance
+
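So for country_code="ch", a URL passes if it contains ".ch" anywhere in the lowercased string, or matches one of the defaults in SEARCH_DEFAULT_COUNTRY_CODES (not shown in this diff). Note that this is a plain substring test over the whole URL, not a parse of the hostname:

assert ".ch" in "https://www.example.ch/produkt/123".lower()   # host TLD: relevant
assert ".ch" in "https://example.com/video.channel".lower()    # path substring also matches
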
+     def _is_excluded_url(self, domain: str, excluded_urls: List[Host]) -> bool:
+         """Checks if the domain is in the excluded URLs.
+
+         Args:
+             domain: The domain to check.
+             excluded_urls: The list of excluded URLs.
+         """
+         return self._domain_in_hosts(domain=domain, hosts=excluded_urls)
+
+     def _apply_filters(
+         self,
+         result: SearchResult,
+         location: Location,
+         marketplaces: List[Host] | None = None,
+         excluded_urls: List[Host] | None = None,
+     ) -> SearchResult:
+         """Checks for filters and updates the SearchResult accordingly.
+
+         Args:
+             result: The SearchResult object to check.
+             location: The location to use for the query.
+             marketplaces: The list of marketplaces to compare the URL against.
+             excluded_urls: The list of excluded URLs.
+         """
+         domain = result.domain
+
+         # Check if the URL is in the marketplaces (if yes, keep the result untouched)
+         if marketplaces and self._domain_in_hosts(domain=domain, hosts=marketplaces):
+             return result
+
+         # Check if the URL has a relevant country_code
+         if not self._relevant_country_code(url=result.url, country_code=location.code):
+             result.filtered = True
+             result.filtered_at_stage = "Search (country code filtering)"
+             return result
+
+         # Check if the URL is in the excluded URLs
+         if excluded_urls and self._is_excluded_url(
+             domain=domain, excluded_urls=excluded_urls
+         ):
+             result.filtered = True
+             result.filtered_at_stage = "Search (excluded URLs filtering)"
+             return result
+
+         return result
+
+     async def apply(
+         self,
+         search_term: str,
+         language: Language,
+         location: Location,
+         num_results: int,
+         marketplaces: List[Host] | None = None,
+         excluded_urls: List[Host] | None = None,
+         search_engines: List[SearchEngineName | str] | None = None,
+     ) -> List[SearchResult]:
+         """Performs a search and returns SearchResults.
+
+         Args:
+             search_term: The search term to use for the query.
+             language: The language to use for the query ('hl' parameter).
+             location: The location to use for the query ('gl' parameter).
+             num_results: Max number of results per search engine.
+             marketplaces: The marketplaces to include in the search.
+             excluded_urls: The URLs to exclude from the search.
+             search_engines: The list of search engines to use for the search.
+         """
+         if search_engines is None:
+             search_engines = list(SearchEngineName)
+         else:
+             search_engines = [
+                 SearchEngineName(sen) if isinstance(sen, str) else sen
+                 for sen in search_engines
+             ]
+         results: List[SearchResult] = []
+
+         # Run the SerpAPI Google search
+         if SearchEngineName.GOOGLE in search_engines:
+             res = await self._google.search(
+                 search_term=search_term,
+                 language=language,
+                 location=location,
+                 num_results=num_results,
+                 marketplaces=marketplaces,
+             )
+             results.extend(res)
+
+         # Run the SerpAPI Google Shopping search
+         if SearchEngineName.GOOGLE_SHOPPING in search_engines:
+             res = await self._google_shopping.search(
+                 search_term=search_term,
+                 language=language,
+                 location=location,
+                 num_results=num_results,
+                 marketplaces=marketplaces,
+             )
+             results.extend(res)
+
+         # Run the Toppreise search
+         if SearchEngineName.TOPPREISE in search_engines:
+             res = await self._toppreise.search(
+                 search_term=search_term,
+                 num_results=num_results,
+             )
+             results.extend(res)
+
+         # Apply filters
+         results = [
+             self._apply_filters(
+                 result=res,
+                 location=location,
+                 marketplaces=marketplaces,
+                 excluded_urls=excluded_urls,
+             )
+             for res in results
+         ]
+
+         logger.debug(f"Search produced a total of {len(results)} results.")
+         return results
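
To close, a hedged usage sketch of the new module. The Language and Location constructors live in fraudcrawler.base.base and are not part of this diff, so the keyword arguments shown for them are assumptions:

import asyncio
import httpx

async def main() -> None:
    async with httpx.AsyncClient() as http_client:
        search = Search(http_client=http_client, serpapi_key="YOUR_SERPAPI_KEY")
        results = await search.apply(
            search_term="vitamin c",
            language=Language(code="de"),                       # assumed constructor
            location=Location(name="Switzerland", code="ch"),   # assumed constructor
            num_results=10,
            search_engines=["google", "toppreise"],  # strings are coerced to SearchEngineName
        )
        for res in results:
            print(res.url, res.filtered, res.filtered_at_stage)

asyncio.run(main())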