fraudcrawler 0.5.6-py3-none-any.whl → 0.5.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



fraudcrawler/base/base.py CHANGED
@@ -9,7 +9,7 @@ from pydantic import (
 from pydantic_settings import BaseSettings
 from urllib.parse import urlparse
 import re
-from typing import Any, Dict, List
+from typing import Any, Dict, List, TYPE_CHECKING
 
 import httpx
 
@@ -23,6 +23,9 @@ from fraudcrawler.settings import (
     DEFAULT_HTTPX_REDIRECTS,
 )
 
+if TYPE_CHECKING:
+    from fraudcrawler.scraping.zyte import ZyteAPI
+
 logger = logging.getLogger(__name__)
 
 # Load google locations and languages
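
The import is guarded with TYPE_CHECKING so that ZyteAPI is visible to static type checkers but never imported at runtime, presumably to avoid a circular import between base.py and zyte.py while still allowing the quoted annotation on the new helper below. A minimal sketch of the pattern (the function name is illustrative, not part of the package):

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by type checkers (mypy, pyright), never at runtime,
    # so a module that itself imports this one does not create an import cycle.
    from fraudcrawler.scraping.zyte import ZyteAPI


async def example_helper(zyte_api: "ZyteAPI") -> None:
    # The quoted annotation keeps the name unresolved at runtime.
    ...
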
@@ -241,3 +244,35 @@ class DomainUtils:
         if hostname and hostname.startswith("www."):
             hostname = hostname[4:]
         return hostname.lower()
+
+    async def _unblock_url(self, url: str, zyte_api: "ZyteAPI") -> bytes | None:
+        """Attempts to unblock a URL using Zyte proxy mode when direct access fails.
+
+        This method is specifically designed to handle 403 Forbidden errors for domains
+        that may be blocking requests from certain IP ranges (like cloud providers).
+
+        Args:
+            url: The URL to fetch using Zyte proxy mode.
+            zyte_api: An instance of ZyteAPI to use for the request.
+
+        Returns:
+            The HTML content as bytes if successful, None if failed.
+        """
+        try:
+            logger.info(f"Attempting to unblock URL using Zyte proxy: {url}")
+            details = await zyte_api.details(url)
+
+            if details and "httpResponseBody" in details:
+                # Decode the base64 content
+                import base64
+
+                html_content = base64.b64decode(details["httpResponseBody"])
+                logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
+                return html_content
+            else:
+                logger.warning(f"Zyte proxy request failed for URL: {url}")
+                return None
+
+        except Exception as e:
+            logger.error(f"Error unblocking URL with Zyte proxy: {url}, error: {e}")
+            return None
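
For reference, the Zyte API returns httpResponseBody as a base64-encoded string, which is why _unblock_url decodes it before returning. A small sketch of that step, using a hypothetical payload in place of the real zyte_api.details() response:

import base64

# Hypothetical response shaped like the payload _unblock_url expects.
details = {
    "url": "https://example.com/product",
    "httpResponseBody": base64.b64encode(b"<html>...</html>").decode(),
}

if details and "httpResponseBody" in details:
    html_content = base64.b64decode(details["httpResponseBody"])
    print(html_content)  # b"<html>...</html>"
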

fraudcrawler/base/orchestrator.py CHANGED
@@ -114,8 +114,13 @@ class Orchestrator(ABC):
             self._owns_http_client = True
 
         # Setup the clients
+        self._zyteapi = ZyteAPI(
+            http_client=self._http_client, api_key=self._zyteapi_key
+        )
         self._search = Search(
-            http_client=self._http_client, serpapi_key=self._serpapi_key
+            http_client=self._http_client,
+            serpapi_key=self._serpapi_key,
+            zyte_api=self._zyteapi,
         )
         self._enricher = Enricher(
             http_client=self._http_client,
@@ -123,9 +128,6 @@
             pwd=self._dataforseo_pwd,
         )
         self._url_collector = URLCollector()
-        self._zyteapi = ZyteAPI(
-            http_client=self._http_client, api_key=self._zyteapi_key
-        )
         self._processor = Processor(
             http_client=self._http_client,
             api_key=self._openaiapi_key,
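
The ZyteAPI client is now constructed before Search (and removed from its old position further down) so that a single instance can be handed to Search and, through it, to Toppreise as the 403 fallback. A rough wiring sketch under that assumption, with placeholder keys and module paths inferred from the RECORD diff below:

import httpx

from fraudcrawler.scraping.search import Search  # path inferred from the package layout
from fraudcrawler.scraping.zyte import ZyteAPI

http_client = httpx.AsyncClient()
zyteapi = ZyteAPI(http_client=http_client, api_key="ZYTE_KEY")  # now built first
search = Search(
    http_client=http_client,
    serpapi_key="SERPAPI_KEY",
    zyte_api=zyteapi,  # same instance is passed down to Toppreise for the 403 fallback
)
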

fraudcrawler/scraping/search.py CHANGED
@@ -376,13 +376,15 @@ class Toppreise(SearchEngine):
         "Upgrade-Insecure-Requests": "1",
     }
 
-    def __init__(self, http_client: httpx.AsyncClient):
+    def __init__(self, http_client: httpx.AsyncClient, zyte_api=None):
         """Initializes the Toppreise client.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
+            zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
         """
         self._http_client = http_client
+        self._zyte_api = zyte_api
 
     @property
     def _search_engine_name(self) -> str:
@@ -448,16 +450,32 @@
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
             search_string=search_string, retry_state=retry_state
         )
-        async for attempt in retry:
-            with attempt:
-                response = await self._http_client.get(
-                    url=url,
-                    headers=self._headers,
+
+        content = None
+        try:
+            async for attempt in retry:
+                with attempt:
+                    response = await self._http_client.get(
+                        url=url,
+                        headers=self._headers,
+                    )
+                    response.raise_for_status()
+                    content = response.content
+        except httpx.HTTPStatusError as e:
+            if e.response.status_code == 403 and self._zyte_api:
+                logger.warning(
+                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
                 )
-                response.raise_for_status()
+                content = await self._unblock_url(url, self._zyte_api)
+                if content is None:
+                    raise e  # Re-raise if zyte fallback also failed
+            else:
+                raise e
+
+        if content is None:
+            raise httpx.HTTPError("Failed to fetch content")
 
         # Get external product urls from the content
-        content = response.content
         urls = self._get_external_product_urls(content=content)
         urls = urls[:num_results]  # Limit to num_results if needed
 
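
Reduced to its essentials, the new flow tries the direct request first and falls back to the Zyte proxy only when the response is a 403 and a fallback client is configured; any other status, or a failed fallback, still raises. A self-contained sketch of that control flow (the function and the unblock callable are illustrative, not the package's API, and the tenacity retry loop is omitted):

import httpx


async def fetch_with_zyte_fallback(client: httpx.AsyncClient, url: str, unblock=None) -> bytes:
    try:
        response = await client.get(url)
        response.raise_for_status()
        return response.content
    except httpx.HTTPStatusError as exc:
        # Only a 403 with a configured fallback is rescued; everything else re-raises.
        if exc.response.status_code == 403 and unblock is not None:
            content = await unblock(url)
            if content is None:
                raise  # fallback also failed
            return content
        raise
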
@@ -491,18 +509,19 @@
 class Search(DomainUtils):
     """Class to perform searches using different search engines."""
 
-    def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str):
+    def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str, zyte_api=None):
         """Initializes the Search class with the given SerpAPI key.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
             serpapi_key: The API key for SERP API.
+            zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
         """
         self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
         self._google_shopping = SerpAPIGoogleShopping(
             http_client=http_client, api_key=serpapi_key
         )
-        self._toppreise = Toppreise(http_client=http_client)
+        self._toppreise = Toppreise(http_client=http_client, zyte_api=zyte_api)
 
     @staticmethod
     def _domain_in_host(domain: str, host: Host) -> bool:

fraudcrawler-0.5.6.dist-info/METADATA → fraudcrawler-0.5.8.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fraudcrawler
-Version: 0.5.6
+Version: 0.5.8
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT

fraudcrawler-0.5.6.dist-info/RECORD → fraudcrawler-0.5.8.dist-info/RECORD
@@ -1,22 +1,22 @@
 fraudcrawler/__init__.py,sha256=Kr19jWhtbC1shVoB9fHvBSeoG1IyQB9re1kCZ4YIAi0,842
 fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fraudcrawler/base/base.py,sha256=suQMnvLIsZO_R0eHZKDWS4u9qnd1ryzPhjGlwcaMD5A,7295
+fraudcrawler/base/base.py,sha256=NOJC12qw-iSkHScPnxFLfzUvg0w57qGaID6OAzHRXeo,8695
 fraudcrawler/base/client.py,sha256=yhkNrhL2SuJXTknLf-8P81fv01FnFMahREZgem-Z-f0,5832
 fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
 fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
-fraudcrawler/base/orchestrator.py,sha256=AKEETrYwKbMy_6YgTdgc6L-VA1iHYOtj3wIqEN3ngO4,26990
+fraudcrawler/base/orchestrator.py,sha256=28X45XLPlJe2hvff8HTLo-V08LNeS0zMWBHe5W3hk4c,27039
 fraudcrawler/base/retry.py,sha256=9VyVrbYR_0YnfxFhUrvcM3aWCYR6oR4iZE4A3zzVZUs,1015
 fraudcrawler/launch_demo_pipeline.py,sha256=j5lu8lLl8QrkVU1MJH25uKtyYk_6lBSeoouCo30aRXg,4634
 fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fraudcrawler/processing/processor.py,sha256=Qq8QcTlqfnzFi1t-1KkncXxaIszUO7pGK3LXTdHkDnM,7638
 fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
-fraudcrawler/scraping/search.py,sha256=nHMYaSkq9o6Hr4yUDEPguj8IHVcOpws3_XWiAbCVgLg,24062
+fraudcrawler/scraping/search.py,sha256=ZjxOj95ih6o6bOWA0JnBwjFlMzGS-8Sb1P-yvHI5aO0,24957
 fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
 fraudcrawler/scraping/zyte.py,sha256=GqvVWA1AWVoClAwd-hQ9iynsT0dOb7R0ntaLK5XVivM,8340
 fraudcrawler/settings.py,sha256=uwXMOQpuwyWkuMU0asYGtBlL_qJj8F-Xkg4dUaCmDxE,3670
-fraudcrawler-0.5.6.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
-fraudcrawler-0.5.6.dist-info/METADATA,sha256=JXYbk0Qo24eBAByL_alhmDCTXTpzHycJeGnpZrGnOYg,6642
-fraudcrawler-0.5.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-fraudcrawler-0.5.6.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
-fraudcrawler-0.5.6.dist-info/RECORD,,
+fraudcrawler-0.5.8.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+fraudcrawler-0.5.8.dist-info/METADATA,sha256=-e9xqpIk0EjO6fqwhmQZ5gsDrl6eJKU7VQdp8MeN0R4,6642
+fraudcrawler-0.5.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+fraudcrawler-0.5.8.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+fraudcrawler-0.5.8.dist-info/RECORD,,