fraudcrawler 0.4.7__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of fraudcrawler might be problematic.

@@ -1,515 +0,0 @@
- from enum import Enum
- import logging
- from pydantic import BaseModel
- from typing import List
- from urllib.parse import urlparse
- import re
-
- from tenacity import RetryCallState
-
- from fraudcrawler.settings import SERP_DEFAULT_COUNTRY_CODES
- from fraudcrawler.base.base import Host, Language, Location, AsyncClient
- from fraudcrawler.base.retry import get_async_retry
-
- logger = logging.getLogger(__name__)
-
-
- class SerpResult(BaseModel):
-     """Model for a single search result from SerpApi."""
-
-     url: str
-     domain: str
-     marketplace_name: str
-     filtered: bool = False
-     filtered_at_stage: str | None = None
-
-
- class SearchEngine(Enum):
-     """Enum for the supported search engines."""
-
-     GOOGLE = "google"
-     GOOGLE_SHOPPING = "google_shopping"
-
-
- class SerpApi(AsyncClient):
-     """A client to interact with the SerpApi for performing searches."""
-
-     _endpoint = "https://serpapi.com/search"
-     _engine_marketplace_names = {
-         SearchEngine.GOOGLE.value: "Google",
-         SearchEngine.GOOGLE_SHOPPING.value: "Google Shopping",
-     }
-     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
-
-     def __init__(
-         self,
-         api_key: str,
-     ):
-         """Initializes the SerpApiClient with the given API key.
-
-         Args:
-             api_key: The API key for SerpApi.
-         """
-         super().__init__()
-         self._api_key = api_key
-
-     def _get_domain(self, url: str) -> str:
-         """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
-
-         Args:
-             url: The URL to be processed.
-
-         """
-         # Add scheme (if needed -> urlparse requires it)
-         if not url.startswith(("http://", "https://")):
-             url = "http://" + url
-
-         # Get the hostname
-         hostname = urlparse(url).hostname
-         if hostname is None and (match := re.search(self._hostname_pattern, url)):
-             hostname = match.group(1)
-         if hostname is None:
-             logger.warning(
-                 f'Failed to extract domain from url="{url}"; full url is returned'
-             )
-             return url.lower()
-
-         # Remove www. prefix
-         if hostname and hostname.startswith("www."):
-             hostname = hostname[4:]
-         return hostname.lower()
-
-     @staticmethod
-     def _extract_search_results(response: dict, engine: str) -> List[str]:
-         """Extracts search results from the response based on the engine type.
-
-         Args:
-             response: The response from the SerpApi search.
-             engine: The search engine used.
-
-         Returns:
-             A list of URLs extracted from the response.
-         """
-         urls = []
-         if engine == SearchEngine.GOOGLE.value:
-             # Get the organic_results
-             results = response.get("organic_results")
-             if results is None:
-                 logger.warning(f'No SerpAPI results for engine="{engine}".')
-             else:
-                 urls = [url for res in results if (url := res.get("link"))]
-
-         elif engine == SearchEngine.GOOGLE_SHOPPING.value:
-             # Get the shopping_results
-             results = response.get("shopping_results")
-             if results is None:
-                 logger.warning(f'No SerpAPI results for engine="{engine}".')
-             else:
-                 urls = [url for res in results if (url := res.get("product_link"))]
-
-         else:
-             raise ValueError(f"Invalid SerpAPI search engine: {engine}")
-
-         return urls
-
-     @staticmethod
-     def _log_before(search_string: str, retry_state: RetryCallState | None) -> None:
-         """Context aware logging before the request is made."""
-         if retry_state:
-             logger.debug(
-                 f'Performing SerpAPI search with q="{search_string}" '
-                 f"(attempt {retry_state.attempt_number})."
-             )
-         else:
-             logger.debug(f"retry_state is {retry_state}, not logging before.")
-
-     @staticmethod
-     def _log_before_sleep(
-         search_string: str, retry_state: RetryCallState | None
-     ) -> None:
-         """Context aware logging before sleeping after a failed request."""
-         if retry_state and retry_state.outcome:
-             logger.warning(
-                 f'Attempt {retry_state.attempt_number} of SerpAPI search with q="{search_string}" '
-                 f"failed with error: {retry_state.outcome.exception()}. "
-                 f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
-             )
-         else:
-             logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
-
-     async def _search(
-         self,
-         engine: str,
-         search_string: str,
-         language: Language,
-         location: Location,
-         num_results: int,
-     ) -> List[str]:
-         """Performs a search using SerpApi and returns the URLs of the results.
-
-         Args:
-             engine: The search engine to use.
-             search_string: The search string (with potentially added site: parameters).
-             language: The language to use for the query ('hl' parameter).
-             location: The location to use for the query ('gl' parameter).
-             num_results: Max number of results to return.
-
-         The SerpAPI parameters are:
-             engine: The search engine to use ('google' NOT 'google_shopping').
-             q: The search string (with potentially added site: parameters).
-             google_domain: The Google domain to use for the search (e.g. google.[com]).
-             location_[requested|used]: The location to use for the search.
-             tbs: The to-be-searched parameters (e.g. 'ctr:CH').
-             cr: The country code to limit the search to (e.g. 'countryCH').
-             gl: The country code to use for the search.
-             hl: The language code to use for the search.
-             num: The number of results to return.
-             api_key: The API key to use for the search.
-         """
-         if engine not in self._engine_marketplace_names:
-             raise ValueError(
-                 f"Invalid SerpAPI search engine: {engine}. "
-                 f"Supported engines are: {list(self._engine_marketplace_names.keys())}."
-             )
-         logger.debug(
-             f'Performing SerpAPI search with engine="{engine}", '
-             f'q="{search_string}", '
-             f'location="{location.name}", '
-             f'language="{language.code}", '
-             f"num_results={num_results}."
-         )
-
-         # Setup the parameters
-         params = {
-             "engine": engine,
-             "q": search_string,
-             "google_domain": f"google.{location.code}",
-             "location_requested": location.name,
-             "location_used": location.name,
-             "tbs": f"ctr:{location.code.upper()}",
-             "cr": f"country{location.code.upper()}",
-             "gl": location.code,
-             "hl": language.code,
-             "num": num_results,
-             "api_key": self._api_key,
-         }
-         logger.debug(f"SerpAPI search with params: {params}")
-
-         # Perform the request and retry if necessary. There is some context aware logging:
-         # - `before`: before the request is made (and before retrying)
-         # - `before_sleep`: if the request fails before sleeping
-         retry = get_async_retry()
-         retry.before = lambda retry_state: self._log_before(
-             search_string=search_string, retry_state=retry_state
-         )
-         retry.before_sleep = lambda retry_state: self._log_before_sleep(
-             search_string=search_string, retry_state=retry_state
-         )
-         async for attempt in retry:
-             with attempt:
-                 response = await self.get(url=self._endpoint, params=params)
-
-         # Extract the URLs from the response
-         urls = self._extract_search_results(response=response, engine=engine)
-
-         logger.debug(
-             f'Found total of {len(urls)} URLs from SerpApi search for q="{search_string}" and engine="{engine}".'
-         )
-         return urls
-
-     @staticmethod
-     def _relevant_country_code(url: str, country_code: str) -> bool:
-         """Determines whether the url shows relevant country codes.
-
-         Args:
-             url: The URL to investigate.
-             country_code: The country code used to filter the products.
-         """
-         url = url.lower()
-         country_code_relevance = f".{country_code}" in url
-         default_relevance = any(cc in url for cc in SERP_DEFAULT_COUNTRY_CODES)
-         return country_code_relevance or default_relevance
-
-     @staticmethod
-     def _domain_in_host(domain: str, host: Host) -> bool:
-         """Checks if the domain is present in the host.
-
-         Args:
-             domain: The domain to check.
-             host: The host to check against.
-         """
-         return any(
-             domain == hst_dom or domain.endswith(f".{hst_dom}")
-             for hst_dom in host.domains
-         )
-
-     def _domain_in_hosts(self, domain: str, hosts: List[Host]) -> bool:
-         """Checks if the domain is present in the list of hosts.
-
-         Note:
-             By checking `if domain == hst_dom or domain.endswith(f".{hst_dom}")`
-             it also checks for subdomains. For example, if the domain is
-             `link.springer.com` and the host domain is `springer.com`,
-             it will be detected as being present in the hosts.
-
-         Args:
-             domain: The domain to check.
-             hosts: The list of hosts to check against.
-         """
-         return any(self._domain_in_host(domain=domain, host=hst) for hst in hosts)
-
-     def _is_excluded_url(self, domain: str, excluded_urls: List[Host]) -> bool:
-         """Checks if the domain is in the excluded URLs.
-
-         Args:
-             domain: The domain to check.
-             excluded_urls: The list of excluded URLs.
-         """
-         return self._domain_in_hosts(domain=domain, hosts=excluded_urls)
-
-     def _apply_filters(
-         self,
-         result: SerpResult,
-         location: Location,
-         marketplaces: List[Host] | None = None,
-         excluded_urls: List[Host] | None = None,
-     ) -> SerpResult:
-         """Checks for filters and updates the SerpResult accordingly.
-
-         Args:
-             result: The SerpResult object to check.
-             location: The location to use for the query.
-             marketplaces: The list of marketplaces to compare the URL against.
-             excluded_urls: The list of excluded URLs.
-         """
-         domain = result.domain
-         # Check if the URL is in the marketplaces (if yes, keep the result un-touched)
-         if marketplaces:
-             if self._domain_in_hosts(domain=domain, hosts=marketplaces):
-                 return result
-
-         # Check if the URL has a relevant country_code
-         if not self._relevant_country_code(url=result.url, country_code=location.code):
-             result.filtered = True
-             result.filtered_at_stage = "SerpAPI (country code filtering)"
-             return result
-
-         # Check if the URL is in the excluded URLs
-         if excluded_urls and self._is_excluded_url(result.domain, excluded_urls):
-             result.filtered = True
-             result.filtered_at_stage = "SerpAPI (excluded URLs filtering)"
-             return result
-
-         return result
-
-     def _create_serp_result(
-         self,
-         engine: str,
-         url: str,
-         location: Location,
-         marketplaces: List[Host] | None = None,
-         excluded_urls: List[Host] | None = None,
-     ) -> SerpResult:
-         """From a given url it creates the class:`SerpResult` instance.
-
-         If marketplaces is None or the domain can not be extracted, the default marketplace name is used.
-
-         Args:
-             engine: The search engine used.
-             url: The URL to be processed.
-             location: The location to use for the query.
-             marketplaces: The list of marketplaces to compare the URL against.
-             excluded_urls: The list of excluded URLs.
-         """
-         # Get marketplace name
-         domain = self._get_domain(url=url)
-
-         # Select marketplace name based on engine
-         marketplace_name = self._engine_marketplace_names[engine]
-
-         if marketplaces:
-             try:
-                 marketplace_name = next(
-                     mp.name
-                     for mp in marketplaces
-                     if self._domain_in_host(domain=domain, host=mp)
-                 )
-             except StopIteration:
-                 logger.warning(f'Failed to find marketplace for domain="{domain}".')
-
-         # Create the SerpResult object
-         result = SerpResult(
-             url=url,
-             domain=domain,
-             marketplace_name=marketplace_name,
-         )
-
-         # Apply filters
-         result = self._apply_filters(
-             result=result,
-             location=location,
-             marketplaces=marketplaces,
-             excluded_urls=excluded_urls,
-         )
-         return result
-
-     async def _search_google(
-         self,
-         search_string: str,
-         language: Language,
-         location: Location,
-         num_results: int,
-         marketplaces: List[Host] | None = None,
-         excluded_urls: List[Host] | None = None,
-     ) -> List[SerpResult]:
-         """Performs a google search using SerpApi and returns SerpResults.
-
-         Args:
-             search_string: The search string (with potentially added site: parameters).
-             language: The language to use for the query ('hl' parameter).
-             location: The location to use for the query ('gl' parameter).
-             num_results: Max number of results to return.
-             marketplaces: The marketplaces to include in the search.
-             excluded_urls: The URLs to exclude from the search.
-         """
-         engine = SearchEngine.GOOGLE.value
-
-         # Perform the search
-         urls = await self._search(
-             engine=engine,
-             search_string=search_string,
-             language=language,
-             location=location,
-             num_results=num_results,
-         )
-
-         # Create SerpResult objects from the URLs
-         results = [
-             self._create_serp_result(
-                 url=url,
-                 location=location,
-                 marketplaces=marketplaces,
-                 excluded_urls=excluded_urls,
-                 engine=engine,
-             )
-             for url in urls
-         ]
-
-         logger.debug(
-             f'Produced {len(results)} results from google search with q="{search_string}".'
-         )
-         return results
-
-     async def _search_google_shopping(
-         self,
-         search_string: str,
-         language: Language,
-         location: Location,
-         num_results: int,
-         marketplaces: List[Host] | None = None,
-         excluded_urls: List[Host] | None = None,
-     ) -> List[SerpResult]:
-         """Performs a google search using SerpApi and returns SerpResults.
-
-         Args:
-             search_string: The search string (with potentially added site: parameters).
-             language: The language to use for the query ('hl' parameter).
-             location: The location to use for the query ('gl' parameter).
-             num_results: Max number of results to return.
-             marketplaces: The marketplaces to include in the search.
-             excluded_urls: The URLs to exclude from the search.
-         """
-         engine = SearchEngine.GOOGLE_SHOPPING.value
-
-         # Perform the search
-         urls = await self._search(
-             engine=engine,
-             search_string=search_string,
-             language=language,
-             location=location,
-             num_results=num_results,
-         )
-
-         # !!! NOTE !!!: Google Shopping results do not properly support the 'num' parameter,
-         # so we might get more results than requested. This is a known issue with SerpAPI
-         # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
-         urls = urls[:num_results]
-
-         # Create SerpResult objects from the URLs
-         results = [
-             self._create_serp_result(
-                 url=url,
-                 location=location,
-                 marketplaces=marketplaces,
-                 excluded_urls=excluded_urls,
-                 engine=engine,
-             )
-             for url in urls
-         ]
-
-         logger.debug(
-             f'Produced {len(results)} results from google shopping search with q="{search_string}".'
-         )
-         return results
-
-     async def apply(
-         self,
-         search_term: str,
-         search_engines: List[SearchEngine],
-         language: Language,
-         location: Location,
-         num_results: int,
-         marketplaces: List[Host] | None = None,
-         excluded_urls: List[Host] | None = None,
-     ) -> List[SerpResult]:
-         """Performs a search using SerpApi, filters based on country code and returns the URLs.
-
-         Args:
-             search_term: The search term to use for the query.
-             language: The language to use for the query.
-             location: The location to use for the query.
-             num_results: Max number of results to return (default: 10).
-             marketplaces: The marketplaces to include in the search.
-             excluded_urls: The URLs to exclude from the search.
-         """
-         # Setup the parameters
-         logger.info(f'Performing SerpAPI search for search_term="{search_term}".')
-
-         # Setup the search string
-         search_string = search_term
-         if marketplaces:
-             sites = [dom for host in marketplaces for dom in host.domains]
-             search_string += " site:" + " OR site:".join(s for s in sites)
-
-         # Initialize the results list
-         results: List[SerpResult] = []
-
-         # Perform the google search
-         if SearchEngine.GOOGLE in search_engines:
-             ggl_res = await self._search_google(
-                 search_string=search_string,
-                 language=language,
-                 location=location,
-                 num_results=num_results,
-                 marketplaces=marketplaces,
-                 excluded_urls=excluded_urls,
-             )
-             results.extend(ggl_res)
-
-         # Perform the google shopping search
-         if SearchEngine.GOOGLE_SHOPPING in search_engines:
-             shp_res = await self._search_google_shopping(
-                 search_string=search_string,
-                 language=language,
-                 location=location,
-                 num_results=num_results,
-                 marketplaces=marketplaces,
-                 excluded_urls=excluded_urls,
-             )
-             results.extend(shp_res)
-
-         num_non_filtered = len([res for res in results if not res.filtered])
-         logger.info(
-             f'Produced a total of {num_non_filtered} results from SerpApi search with q="{search_string}".'
-         )
-         return results
@@ -1,22 +0,0 @@
- fraudcrawler/__init__.py,sha256=zAqnJ9Mewq0qzSfOjyaICyqDRQZE_Z3FmyF2IPdOhXo,788
- fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/base/base.py,sha256=pYGdRV_Ssw5fA6tLVhlZwAO0OLQl6qn6LgJPCzOCrpc,6258
- fraudcrawler/base/client.py,sha256=FibiYycjUys-c4sv66Y2JqJu5y15be2MYd2_9yB3wG8,4936
- fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
- fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
- fraudcrawler/base/orchestrator.py,sha256=UzqEtC7Szw1-Ic31lex04Mgpf2f7MM-odwhC0gTxN2Q,23566
- fraudcrawler/base/retry.py,sha256=OKdOed7mP2VLYJLi1zo0MC8ISMm7k3gZgtNuqn50NhI,995
- fraudcrawler/launch_demo_pipeline.py,sha256=No9KYMbJWTdW1GIZjXVBDb-Xn_QNdDquHQ70BzAVxfc,4655
- fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/processing/processor.py,sha256=-QdLiAhdPLdYWcMvbKmuPQ_WlvFEDpmEXNps1QGChvQ,7421
- fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/scraping/enrich.py,sha256=1vRGUtF9F8aw46qjKSUiVqGXLdRPaUmI8e5Bu-ZYt8Y,12398
- fraudcrawler/scraping/serp.py,sha256=aTsrH9R9yOpEH_ga-h1BylAtVl4sf9eHIaCv798GLEE,18782
- fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
- fraudcrawler/scraping/zyte.py,sha256=Pv0i2Ni6oamIo_aFdG9c-Kon0PM6oTmMgVYdT3KwvYo,7602
- fraudcrawler/settings.py,sha256=zoNd4LCBL1JNfICiYlLkggw8rGr_tkFc7rrE1morLKI,3442
- fraudcrawler-0.4.7.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
- fraudcrawler-0.4.7.dist-info/METADATA,sha256=zK5_0rJh2ewfhg_PyqRpfgI1Ge5BZSb6Gexqc5vNaiw,5922
- fraudcrawler-0.4.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- fraudcrawler-0.4.7.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
- fraudcrawler-0.4.7.dist-info/RECORD,,
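
For orientation, the removed fraudcrawler/scraping/serp.py exposed SerpApi.apply as its public entry point. Below is a minimal usage sketch, not taken from the package: the API key and search values are placeholders, and the keyword arguments for Language, Location and Host (code, name, domains) are only inferred from the attributes the module reads, so they may not match the actual model definitions in fraudcrawler.base.base.

import asyncio

from fraudcrawler.base.base import Host, Language, Location
from fraudcrawler.scraping.serp import SearchEngine, SerpApi


async def main() -> None:
    # Hypothetical values; Language/Location/Host constructor fields are assumptions.
    client = SerpApi(api_key="YOUR_SERPAPI_KEY")
    results = await client.apply(
        search_term="example product",
        search_engines=[SearchEngine.GOOGLE, SearchEngine.GOOGLE_SHOPPING],
        language=Language(code="de"),
        location=Location(name="Switzerland", code="ch"),
        num_results=10,
        marketplaces=[Host(name="Example Shop", domains=["example-shop.ch"])],
        excluded_urls=[Host(name="Wikipedia", domains=["wikipedia.org"])],
    )
    # Keep only results that survived the country-code and exclusion filters.
    for res in results:
        if not res.filtered:
            print(res.marketplace_name, res.domain, res.url)


asyncio.run(main())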