fraudcrawler 0.5.7__tar.gz → 0.5.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic. Click here for more details.
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/PKG-INFO +1 -1
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/base/base.py +6 -5
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/base/orchestrator.py +6 -4
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/scraping/search.py +5 -3
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/pyproject.toml +1 -1
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/LICENSE +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/README.md +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/__init__.py +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/base/client.py +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/base/retry.py +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/launch_demo_pipeline.py +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/processing/processor.py +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/scraping/enrich.py +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/scraping/url.py +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/scraping/zyte.py +0 -0
- {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/settings.py +0 -0
|
@@ -247,31 +247,32 @@ class DomainUtils:
|
|
|
247
247
|
|
|
248
248
|
async def _unblock_url(self, url: str, zyte_api: "ZyteAPI") -> bytes | None:
|
|
249
249
|
"""Attempts to unblock a URL using Zyte proxy mode when direct access fails.
|
|
250
|
-
|
|
250
|
+
|
|
251
251
|
This method is specifically designed to handle 403 Forbidden errors for domains
|
|
252
252
|
that may be blocking requests from certain IP ranges (like cloud providers).
|
|
253
|
-
|
|
253
|
+
|
|
254
254
|
Args:
|
|
255
255
|
url: The URL to fetch using Zyte proxy mode.
|
|
256
256
|
zyte_api: An instance of ZyteAPI to use for the request.
|
|
257
|
-
|
|
257
|
+
|
|
258
258
|
Returns:
|
|
259
259
|
The HTML content as bytes if successful, None if failed.
|
|
260
260
|
"""
|
|
261
261
|
try:
|
|
262
262
|
logger.info(f"Attempting to unblock URL using Zyte proxy: {url}")
|
|
263
263
|
details = await zyte_api.details(url)
|
|
264
|
-
|
|
264
|
+
|
|
265
265
|
if details and "httpResponseBody" in details:
|
|
266
266
|
# Decode the base64 content
|
|
267
267
|
import base64
|
|
268
|
+
|
|
268
269
|
html_content = base64.b64decode(details["httpResponseBody"])
|
|
269
270
|
logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
|
|
270
271
|
return html_content
|
|
271
272
|
else:
|
|
272
273
|
logger.warning(f"Zyte proxy request failed for URL: {url}")
|
|
273
274
|
return None
|
|
274
|
-
|
|
275
|
+
|
|
275
276
|
except Exception as e:
|
|
276
277
|
logger.error(f"Error unblocking URL with Zyte proxy: {url}, error: {e}")
|
|
277
278
|
return None
|
|
@@ -114,8 +114,13 @@ class Orchestrator(ABC):
|
|
|
114
114
|
self._owns_http_client = True
|
|
115
115
|
|
|
116
116
|
# Setup the clients
|
|
117
|
+
self._zyteapi = ZyteAPI(
|
|
118
|
+
http_client=self._http_client, api_key=self._zyteapi_key
|
|
119
|
+
)
|
|
117
120
|
self._search = Search(
|
|
118
|
-
http_client=self._http_client,
|
|
121
|
+
http_client=self._http_client,
|
|
122
|
+
serpapi_key=self._serpapi_key,
|
|
123
|
+
zyte_api=self._zyteapi,
|
|
119
124
|
)
|
|
120
125
|
self._enricher = Enricher(
|
|
121
126
|
http_client=self._http_client,
|
|
@@ -123,9 +128,6 @@ class Orchestrator(ABC):
|
|
|
123
128
|
pwd=self._dataforseo_pwd,
|
|
124
129
|
)
|
|
125
130
|
self._url_collector = URLCollector()
|
|
126
|
-
self._zyteapi = ZyteAPI(
|
|
127
|
-
http_client=self._http_client, api_key=self._zyteapi_key
|
|
128
|
-
)
|
|
129
131
|
self._processor = Processor(
|
|
130
132
|
http_client=self._http_client,
|
|
131
133
|
api_key=self._openaiapi_key,
|
|
@@ -450,7 +450,7 @@ class Toppreise(SearchEngine):
|
|
|
450
450
|
retry.before_sleep = lambda retry_state: self._log_before_sleep(
|
|
451
451
|
search_string=search_string, retry_state=retry_state
|
|
452
452
|
)
|
|
453
|
-
|
|
453
|
+
|
|
454
454
|
content = None
|
|
455
455
|
try:
|
|
456
456
|
async for attempt in retry:
|
|
@@ -463,7 +463,9 @@ class Toppreise(SearchEngine):
|
|
|
463
463
|
content = response.content
|
|
464
464
|
except httpx.HTTPStatusError as e:
|
|
465
465
|
if e.response.status_code == 403 and self._zyte_api:
|
|
466
|
-
logger.warning(
|
|
466
|
+
logger.warning(
|
|
467
|
+
f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
|
|
468
|
+
)
|
|
467
469
|
content = await self._unblock_url(url, self._zyte_api)
|
|
468
470
|
if content is None:
|
|
469
471
|
raise e # Re-raise if zyte fallback also failed
|
|
@@ -471,7 +473,7 @@ class Toppreise(SearchEngine):
|
|
|
471
473
|
raise e
|
|
472
474
|
|
|
473
475
|
if content is None:
|
|
474
|
-
raise httpx.
|
|
476
|
+
raise httpx.HTTPError("Failed to fetch content")
|
|
475
477
|
|
|
476
478
|
# Get external product urls from the content
|
|
477
479
|
urls = self._get_external_product_urls(content=content)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|