fraudcrawler 0.5.6__tar.gz → 0.5.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of fraudcrawler might be problematic.

Files changed (21)
  1. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/PKG-INFO +1 -1
  2. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/base/base.py +35 -1
  3. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/base/orchestrator.py +1 -1
  4. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/scraping/search.py +28 -11
  5. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/pyproject.toml +1 -1
  6. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/LICENSE +0 -0
  7. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/README.md +0 -0
  8. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/__init__.py +0 -0
  9. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/base/__init__.py +0 -0
  10. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/base/client.py +0 -0
  11. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/base/google-languages.json +0 -0
  12. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/base/google-locations.json +0 -0
  13. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/base/retry.py +0 -0
  14. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/launch_demo_pipeline.py +0 -0
  15. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/processing/__init__.py +0 -0
  16. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/processing/processor.py +0 -0
  17. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/scraping/__init__.py +0 -0
  18. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/scraping/enrich.py +0 -0
  19. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/scraping/url.py +0 -0
  20. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/scraping/zyte.py +0 -0
  21. {fraudcrawler-0.5.6 → fraudcrawler-0.5.7}/fraudcrawler/settings.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fraudcrawler
-Version: 0.5.6
+Version: 0.5.7
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
fraudcrawler/base/base.py
@@ -9,7 +9,7 @@ from pydantic import (
 from pydantic_settings import BaseSettings
 from urllib.parse import urlparse
 import re
-from typing import Any, Dict, List
+from typing import Any, Dict, List, TYPE_CHECKING
 
 import httpx
 
@@ -23,6 +23,9 @@ from fraudcrawler.settings import (
     DEFAULT_HTTPX_REDIRECTS,
 )
 
+if TYPE_CHECKING:
+    from fraudcrawler.scraping.zyte import ZyteAPI
+
 logger = logging.getLogger(__name__)
 
 # Load google locations and languages
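The TYPE_CHECKING guard keeps ZyteAPI visible to static type checkers without importing fraudcrawler.scraping.zyte at runtime, which avoids paying the import cost on startup and sidesteps any circular import between the two modules. A minimal sketch of the pattern (the function name below is illustrative, not from the package):

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated only by static type checkers (mypy, pyright), never at runtime.
        from fraudcrawler.scraping.zyte import ZyteAPI

    async def fetch_details(zyte_api: "ZyteAPI") -> None:
        # The quoted annotation is a forward reference, so no runtime import is needed.
        ...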
@@ -241,3 +244,34 @@ class DomainUtils:
         if hostname and hostname.startswith("www."):
             hostname = hostname[4:]
         return hostname.lower()
+
+    async def _unblock_url(self, url: str, zyte_api: "ZyteAPI") -> bytes | None:
+        """Attempts to unblock a URL using Zyte proxy mode when direct access fails.
+
+        This method is specifically designed to handle 403 Forbidden errors for domains
+        that may be blocking requests from certain IP ranges (like cloud providers).
+
+        Args:
+            url: The URL to fetch using Zyte proxy mode.
+            zyte_api: An instance of ZyteAPI to use for the request.
+
+        Returns:
+            The HTML content as bytes if successful, None if failed.
+        """
+        try:
+            logger.info(f"Attempting to unblock URL using Zyte proxy: {url}")
+            details = await zyte_api.details(url)
+
+            if details and "httpResponseBody" in details:
+                # Decode the base64 content
+                import base64
+                html_content = base64.b64decode(details["httpResponseBody"])
+                logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
+                return html_content
+            else:
+                logger.warning(f"Zyte proxy request failed for URL: {url}")
+                return None
+
+        except Exception as e:
+            logger.error(f"Error unblocking URL with Zyte proxy: {url}, error: {e}")
+            return None
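The new method relies on Zyte returning the raw page body base64-encoded under the httpResponseBody key, which is what the b64decode call above unpacks. A self-contained sketch of just that decode step, with a stubbed payload standing in for the dict that `await zyte_api.details(url)` produces in base.py:

    import base64

    # Stubbed payload; in base.py this dict comes from `await zyte_api.details(url)`.
    details = {"httpResponseBody": base64.b64encode(b"<html>ok</html>").decode("ascii")}

    if details and "httpResponseBody" in details:
        html_content = base64.b64decode(details["httpResponseBody"])
        assert html_content == b"<html>ok</html>"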
fraudcrawler/base/orchestrator.py
@@ -115,7 +115,7 @@ class Orchestrator(ABC):
 
         # Setup the clients
         self._search = Search(
-            http_client=self._http_client, serpapi_key=self._serpapi_key
+            http_client=self._http_client, serpapi_key=self._serpapi_key, zyte_api=self._zyteapi
         )
         self._enricher = Enricher(
             http_client=self._http_client,
fraudcrawler/scraping/search.py
@@ -376,13 +376,15 @@ class Toppreise(SearchEngine):
         "Upgrade-Insecure-Requests": "1",
     }
 
-    def __init__(self, http_client: httpx.AsyncClient):
+    def __init__(self, http_client: httpx.AsyncClient, zyte_api=None):
         """Initializes the Toppreise client.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
+            zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
         """
         self._http_client = http_client
+        self._zyte_api = zyte_api
 
     @property
     def _search_engine_name(self) -> str:
@@ -448,16 +450,30 @@ class Toppreise(SearchEngine):
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
             search_string=search_string, retry_state=retry_state
         )
-        async for attempt in retry:
-            with attempt:
-                response = await self._http_client.get(
-                    url=url,
-                    headers=self._headers,
-                )
-                response.raise_for_status()
+
+        content = None
+        try:
+            async for attempt in retry:
+                with attempt:
+                    response = await self._http_client.get(
+                        url=url,
+                        headers=self._headers,
+                    )
+                    response.raise_for_status()
+                    content = response.content
+        except httpx.HTTPStatusError as e:
+            if e.response.status_code == 403 and self._zyte_api:
+                logger.warning(f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy")
+                content = await self._unblock_url(url, self._zyte_api)
+                if content is None:
+                    raise e  # Re-raise if zyte fallback also failed
+            else:
+                raise e
+
+        if content is None:
+            raise httpx.HTTPStatusError("Failed to fetch content", request=None, response=None)
 
         # Get external product urls from the content
-        content = response.content
         urls = self._get_external_product_urls(content=content)
         urls = urls[:num_results]  # Limit to num_results if needed
 
@@ -491,18 +507,19 @@
 class Search(DomainUtils):
     """Class to perform searches using different search engines."""
 
-    def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str):
+    def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str, zyte_api=None):
         """Initializes the Search class with the given SerpAPI key.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
             serpapi_key: The API key for SERP API.
+            zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
         """
         self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
         self._google_shopping = SerpAPIGoogleShopping(
             http_client=http_client, api_key=serpapi_key
         )
-        self._toppreise = Toppreise(http_client=http_client)
+        self._toppreise = Toppreise(http_client=http_client, zyte_api=zyte_api)
 
     @staticmethod
     def _domain_in_host(domain: str, host: Host) -> bool:
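Taken together, 0.5.7 threads one optional ZyteAPI instance from the orchestrator through Search into Toppreise. A hedged construction sketch; the ZyteAPI constructor arguments shown here are assumptions, since this diff never shows them:

    import httpx

    from fraudcrawler.scraping.search import Search
    from fraudcrawler.scraping.zyte import ZyteAPI

    http_client = httpx.AsyncClient()
    zyte_api = ZyteAPI(http_client=http_client, api_key="ZYTE_API_KEY")  # hypothetical signature
    search = Search(
        http_client=http_client,
        serpapi_key="SERPAPI_KEY",
        zyte_api=zyte_api,  # optional; omit it to keep the pre-0.5.7 behaviour
    )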
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "fraudcrawler"
-version = "0.5.6"
+version = "0.5.7"
 description = "Intelligent Market Monitoring"
 authors = [
     "Domingo Bertus <hello@veanu.ch>",