fraudcrawler 0.5.6__tar.gz → 0.5.8__tar.gz
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic. Click here for more details.
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/PKG-INFO +1 -1
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/base/base.py +36 -1
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/base/orchestrator.py +6 -4
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/scraping/search.py +29 -10
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/pyproject.toml +1 -1
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/LICENSE +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/README.md +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/__init__.py +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/base/client.py +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/base/retry.py +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/launch_demo_pipeline.py +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/processing/processor.py +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/scraping/enrich.py +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/scraping/url.py +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/scraping/zyte.py +0 -0
- {fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/settings.py +0 -0
|
@@ -9,7 +9,7 @@ from pydantic import (
|
|
|
9
9
|
from pydantic_settings import BaseSettings
|
|
10
10
|
from urllib.parse import urlparse
|
|
11
11
|
import re
|
|
12
|
-
from typing import Any, Dict, List
|
|
12
|
+
from typing import Any, Dict, List, TYPE_CHECKING
|
|
13
13
|
|
|
14
14
|
import httpx
|
|
15
15
|
|
|
@@ -23,6 +23,9 @@ from fraudcrawler.settings import (
|
|
|
23
23
|
DEFAULT_HTTPX_REDIRECTS,
|
|
24
24
|
)
|
|
25
25
|
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from fraudcrawler.scraping.zyte import ZyteAPI
|
|
28
|
+
|
|
26
29
|
logger = logging.getLogger(__name__)
|
|
27
30
|
|
|
28
31
|
# Load google locations and languages
|
|
@@ -241,3 +244,35 @@ class DomainUtils:
|
|
|
241
244
|
if hostname and hostname.startswith("www."):
|
|
242
245
|
hostname = hostname[4:]
|
|
243
246
|
return hostname.lower()
|
|
247
|
+
|
|
248
|
+
async def _unblock_url(self, url: str, zyte_api: "ZyteAPI") -> bytes | None:
|
|
249
|
+
"""Attempts to unblock a URL using Zyte proxy mode when direct access fails.
|
|
250
|
+
|
|
251
|
+
This method is specifically designed to handle 403 Forbidden errors for domains
|
|
252
|
+
that may be blocking requests from certain IP ranges (like cloud providers).
|
|
253
|
+
|
|
254
|
+
Args:
|
|
255
|
+
url: The URL to fetch using Zyte proxy mode.
|
|
256
|
+
zyte_api: An instance of ZyteAPI to use for the request.
|
|
257
|
+
|
|
258
|
+
Returns:
|
|
259
|
+
The HTML content as bytes if successful, None if failed.
|
|
260
|
+
"""
|
|
261
|
+
try:
|
|
262
|
+
logger.info(f"Attempting to unblock URL using Zyte proxy: {url}")
|
|
263
|
+
details = await zyte_api.details(url)
|
|
264
|
+
|
|
265
|
+
if details and "httpResponseBody" in details:
|
|
266
|
+
# Decode the base64 content
|
|
267
|
+
import base64
|
|
268
|
+
|
|
269
|
+
html_content = base64.b64decode(details["httpResponseBody"])
|
|
270
|
+
logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
|
|
271
|
+
return html_content
|
|
272
|
+
else:
|
|
273
|
+
logger.warning(f"Zyte proxy request failed for URL: {url}")
|
|
274
|
+
return None
|
|
275
|
+
|
|
276
|
+
except Exception as e:
|
|
277
|
+
logger.error(f"Error unblocking URL with Zyte proxy: {url}, error: {e}")
|
|
278
|
+
return None
|
|
@@ -114,8 +114,13 @@ class Orchestrator(ABC):
|
|
|
114
114
|
self._owns_http_client = True
|
|
115
115
|
|
|
116
116
|
# Setup the clients
|
|
117
|
+
self._zyteapi = ZyteAPI(
|
|
118
|
+
http_client=self._http_client, api_key=self._zyteapi_key
|
|
119
|
+
)
|
|
117
120
|
self._search = Search(
|
|
118
|
-
http_client=self._http_client,
|
|
121
|
+
http_client=self._http_client,
|
|
122
|
+
serpapi_key=self._serpapi_key,
|
|
123
|
+
zyte_api=self._zyteapi,
|
|
119
124
|
)
|
|
120
125
|
self._enricher = Enricher(
|
|
121
126
|
http_client=self._http_client,
|
|
@@ -123,9 +128,6 @@ class Orchestrator(ABC):
|
|
|
123
128
|
pwd=self._dataforseo_pwd,
|
|
124
129
|
)
|
|
125
130
|
self._url_collector = URLCollector()
|
|
126
|
-
self._zyteapi = ZyteAPI(
|
|
127
|
-
http_client=self._http_client, api_key=self._zyteapi_key
|
|
128
|
-
)
|
|
129
131
|
self._processor = Processor(
|
|
130
132
|
http_client=self._http_client,
|
|
131
133
|
api_key=self._openaiapi_key,
|
|
@@ -376,13 +376,15 @@ class Toppreise(SearchEngine):
|
|
|
376
376
|
"Upgrade-Insecure-Requests": "1",
|
|
377
377
|
}
|
|
378
378
|
|
|
379
|
-
def __init__(self, http_client: httpx.AsyncClient):
|
|
379
|
+
def __init__(self, http_client: httpx.AsyncClient, zyte_api=None):
|
|
380
380
|
"""Initializes the Toppreise client.
|
|
381
381
|
|
|
382
382
|
Args:
|
|
383
383
|
http_client: An httpx.AsyncClient to use for the async requests.
|
|
384
|
+
zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
|
|
384
385
|
"""
|
|
385
386
|
self._http_client = http_client
|
|
387
|
+
self._zyte_api = zyte_api
|
|
386
388
|
|
|
387
389
|
@property
|
|
388
390
|
def _search_engine_name(self) -> str:
|
|
@@ -448,16 +450,32 @@ class Toppreise(SearchEngine):
|
|
|
448
450
|
retry.before_sleep = lambda retry_state: self._log_before_sleep(
|
|
449
451
|
search_string=search_string, retry_state=retry_state
|
|
450
452
|
)
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
453
|
+
|
|
454
|
+
content = None
|
|
455
|
+
try:
|
|
456
|
+
async for attempt in retry:
|
|
457
|
+
with attempt:
|
|
458
|
+
response = await self._http_client.get(
|
|
459
|
+
url=url,
|
|
460
|
+
headers=self._headers,
|
|
461
|
+
)
|
|
462
|
+
response.raise_for_status()
|
|
463
|
+
content = response.content
|
|
464
|
+
except httpx.HTTPStatusError as e:
|
|
465
|
+
if e.response.status_code == 403 and self._zyte_api:
|
|
466
|
+
logger.warning(
|
|
467
|
+
f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
|
|
456
468
|
)
|
|
457
|
-
|
|
469
|
+
content = await self._unblock_url(url, self._zyte_api)
|
|
470
|
+
if content is None:
|
|
471
|
+
raise e # Re-raise if zyte fallback also failed
|
|
472
|
+
else:
|
|
473
|
+
raise e
|
|
474
|
+
|
|
475
|
+
if content is None:
|
|
476
|
+
raise httpx.HTTPError("Failed to fetch content")
|
|
458
477
|
|
|
459
478
|
# Get external product urls from the content
|
|
460
|
-
content = response.content
|
|
461
479
|
urls = self._get_external_product_urls(content=content)
|
|
462
480
|
urls = urls[:num_results] # Limit to num_results if needed
|
|
463
481
|
|
|
@@ -491,18 +509,19 @@ class Toppreise(SearchEngine):
|
|
|
491
509
|
class Search(DomainUtils):
|
|
492
510
|
"""Class to perform searches using different search engines."""
|
|
493
511
|
|
|
494
|
-
def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str):
|
|
512
|
+
def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str, zyte_api=None):
|
|
495
513
|
"""Initializes the Search class with the given SerpAPI key.
|
|
496
514
|
|
|
497
515
|
Args:
|
|
498
516
|
http_client: An httpx.AsyncClient to use for the async requests.
|
|
499
517
|
serpapi_key: The API key for SERP API.
|
|
518
|
+
zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
|
|
500
519
|
"""
|
|
501
520
|
self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
|
|
502
521
|
self._google_shopping = SerpAPIGoogleShopping(
|
|
503
522
|
http_client=http_client, api_key=serpapi_key
|
|
504
523
|
)
|
|
505
|
-
self._toppreise = Toppreise(http_client=http_client)
|
|
524
|
+
self._toppreise = Toppreise(http_client=http_client, zyte_api=zyte_api)
|
|
506
525
|
|
|
507
526
|
@staticmethod
|
|
508
527
|
def _domain_in_host(domain: str, host: Host) -> bool:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|