fraudcrawler 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic. Click here for more details.
- fraudcrawler/base/base.py +35 -1
- fraudcrawler/base/orchestrator.py +1 -1
- fraudcrawler/scraping/search.py +28 -11
- {fraudcrawler-0.5.6.dist-info → fraudcrawler-0.5.7.dist-info}/METADATA +1 -1
- {fraudcrawler-0.5.6.dist-info → fraudcrawler-0.5.7.dist-info}/RECORD +8 -8
- {fraudcrawler-0.5.6.dist-info → fraudcrawler-0.5.7.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.5.6.dist-info → fraudcrawler-0.5.7.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.5.6.dist-info → fraudcrawler-0.5.7.dist-info}/entry_points.txt +0 -0
fraudcrawler/base/base.py
CHANGED
|
@@ -9,7 +9,7 @@ from pydantic import (
|
|
|
9
9
|
from pydantic_settings import BaseSettings
|
|
10
10
|
from urllib.parse import urlparse
|
|
11
11
|
import re
|
|
12
|
-
from typing import Any, Dict, List
|
|
12
|
+
from typing import Any, Dict, List, TYPE_CHECKING
|
|
13
13
|
|
|
14
14
|
import httpx
|
|
15
15
|
|
|
@@ -23,6 +23,9 @@ from fraudcrawler.settings import (
|
|
|
23
23
|
DEFAULT_HTTPX_REDIRECTS,
|
|
24
24
|
)
|
|
25
25
|
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from fraudcrawler.scraping.zyte import ZyteAPI
|
|
28
|
+
|
|
26
29
|
logger = logging.getLogger(__name__)
|
|
27
30
|
|
|
28
31
|
# Load google locations and languages
|
|
@@ -241,3 +244,34 @@ class DomainUtils:
|
|
|
241
244
|
if hostname and hostname.startswith("www."):
|
|
242
245
|
hostname = hostname[4:]
|
|
243
246
|
return hostname.lower()
|
|
247
|
+
|
|
248
|
+
async def _unblock_url(self, url: str, zyte_api: "ZyteAPI") -> bytes | None:
|
|
249
|
+
"""Attempts to unblock a URL using Zyte proxy mode when direct access fails.
|
|
250
|
+
|
|
251
|
+
This method is specifically designed to handle 403 Forbidden errors for domains
|
|
252
|
+
that may be blocking requests from certain IP ranges (like cloud providers).
|
|
253
|
+
|
|
254
|
+
Args:
|
|
255
|
+
url: The URL to fetch using Zyte proxy mode.
|
|
256
|
+
zyte_api: An instance of ZyteAPI to use for the request.
|
|
257
|
+
|
|
258
|
+
Returns:
|
|
259
|
+
The HTML content as bytes if successful, None if failed.
|
|
260
|
+
"""
|
|
261
|
+
try:
|
|
262
|
+
logger.info(f"Attempting to unblock URL using Zyte proxy: {url}")
|
|
263
|
+
details = await zyte_api.details(url)
|
|
264
|
+
|
|
265
|
+
if details and "httpResponseBody" in details:
|
|
266
|
+
# Decode the base64 content
|
|
267
|
+
import base64
|
|
268
|
+
html_content = base64.b64decode(details["httpResponseBody"])
|
|
269
|
+
logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
|
|
270
|
+
return html_content
|
|
271
|
+
else:
|
|
272
|
+
logger.warning(f"Zyte proxy request failed for URL: {url}")
|
|
273
|
+
return None
|
|
274
|
+
|
|
275
|
+
except Exception as e:
|
|
276
|
+
logger.error(f"Error unblocking URL with Zyte proxy: {url}, error: {e}")
|
|
277
|
+
return None
|
|
@@ -115,7 +115,7 @@ class Orchestrator(ABC):
|
|
|
115
115
|
|
|
116
116
|
# Setup the clients
|
|
117
117
|
self._search = Search(
|
|
118
|
-
http_client=self._http_client, serpapi_key=self._serpapi_key
|
|
118
|
+
http_client=self._http_client, serpapi_key=self._serpapi_key, zyte_api=self._zyteapi
|
|
119
119
|
)
|
|
120
120
|
self._enricher = Enricher(
|
|
121
121
|
http_client=self._http_client,
|
fraudcrawler/scraping/search.py
CHANGED
|
@@ -376,13 +376,15 @@ class Toppreise(SearchEngine):
|
|
|
376
376
|
"Upgrade-Insecure-Requests": "1",
|
|
377
377
|
}
|
|
378
378
|
|
|
379
|
-
def __init__(self, http_client: httpx.AsyncClient):
|
|
379
|
+
def __init__(self, http_client: httpx.AsyncClient, zyte_api=None):
|
|
380
380
|
"""Initializes the Toppreise client.
|
|
381
381
|
|
|
382
382
|
Args:
|
|
383
383
|
http_client: An httpx.AsyncClient to use for the async requests.
|
|
384
|
+
zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
|
|
384
385
|
"""
|
|
385
386
|
self._http_client = http_client
|
|
387
|
+
self._zyte_api = zyte_api
|
|
386
388
|
|
|
387
389
|
@property
|
|
388
390
|
def _search_engine_name(self) -> str:
|
|
@@ -448,16 +450,30 @@ class Toppreise(SearchEngine):
|
|
|
448
450
|
retry.before_sleep = lambda retry_state: self._log_before_sleep(
|
|
449
451
|
search_string=search_string, retry_state=retry_state
|
|
450
452
|
)
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
453
|
+
|
|
454
|
+
content = None
|
|
455
|
+
try:
|
|
456
|
+
async for attempt in retry:
|
|
457
|
+
with attempt:
|
|
458
|
+
response = await self._http_client.get(
|
|
459
|
+
url=url,
|
|
460
|
+
headers=self._headers,
|
|
461
|
+
)
|
|
462
|
+
response.raise_for_status()
|
|
463
|
+
content = response.content
|
|
464
|
+
except httpx.HTTPStatusError as e:
|
|
465
|
+
if e.response.status_code == 403 and self._zyte_api:
|
|
466
|
+
logger.warning(f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy")
|
|
467
|
+
content = await self._unblock_url(url, self._zyte_api)
|
|
468
|
+
if content is None:
|
|
469
|
+
raise e # Re-raise if zyte fallback also failed
|
|
470
|
+
else:
|
|
471
|
+
raise e
|
|
472
|
+
|
|
473
|
+
if content is None:
|
|
474
|
+
raise httpx.HTTPStatusError("Failed to fetch content", request=None, response=None)
|
|
458
475
|
|
|
459
476
|
# Get external product urls from the content
|
|
460
|
-
content = response.content
|
|
461
477
|
urls = self._get_external_product_urls(content=content)
|
|
462
478
|
urls = urls[:num_results] # Limit to num_results if needed
|
|
463
479
|
|
|
@@ -491,18 +507,19 @@ class Toppreise(SearchEngine):
|
|
|
491
507
|
class Search(DomainUtils):
|
|
492
508
|
"""Class to perform searches using different search engines."""
|
|
493
509
|
|
|
494
|
-
def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str):
|
|
510
|
+
def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str, zyte_api=None):
|
|
495
511
|
"""Initializes the Search class with the given SerpAPI key.
|
|
496
512
|
|
|
497
513
|
Args:
|
|
498
514
|
http_client: An httpx.AsyncClient to use for the async requests.
|
|
499
515
|
serpapi_key: The API key for SERP API.
|
|
516
|
+
zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
|
|
500
517
|
"""
|
|
501
518
|
self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
|
|
502
519
|
self._google_shopping = SerpAPIGoogleShopping(
|
|
503
520
|
http_client=http_client, api_key=serpapi_key
|
|
504
521
|
)
|
|
505
|
-
self._toppreise = Toppreise(http_client=http_client)
|
|
522
|
+
self._toppreise = Toppreise(http_client=http_client, zyte_api=zyte_api)
|
|
506
523
|
|
|
507
524
|
@staticmethod
|
|
508
525
|
def _domain_in_host(domain: str, host: Host) -> bool:
|
|
@@ -1,22 +1,22 @@
|
|
|
1
1
|
fraudcrawler/__init__.py,sha256=Kr19jWhtbC1shVoB9fHvBSeoG1IyQB9re1kCZ4YIAi0,842
|
|
2
2
|
fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
fraudcrawler/base/base.py,sha256=
|
|
3
|
+
fraudcrawler/base/base.py,sha256=94HTs8RpdpEics9d6o_uDniTRG1CCSO35LDsjY4hp5E,8750
|
|
4
4
|
fraudcrawler/base/client.py,sha256=yhkNrhL2SuJXTknLf-8P81fv01FnFMahREZgem-Z-f0,5832
|
|
5
5
|
fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
|
|
6
6
|
fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
|
|
7
|
-
fraudcrawler/base/orchestrator.py,sha256=
|
|
7
|
+
fraudcrawler/base/orchestrator.py,sha256=lyrdX_pEq2y3VguXMRMmyEJviGEr5-SnqeIxoJmqWKc,27014
|
|
8
8
|
fraudcrawler/base/retry.py,sha256=9VyVrbYR_0YnfxFhUrvcM3aWCYR6oR4iZE4A3zzVZUs,1015
|
|
9
9
|
fraudcrawler/launch_demo_pipeline.py,sha256=j5lu8lLl8QrkVU1MJH25uKtyYk_6lBSeoouCo30aRXg,4634
|
|
10
10
|
fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
11
|
fraudcrawler/processing/processor.py,sha256=Qq8QcTlqfnzFi1t-1KkncXxaIszUO7pGK3LXTdHkDnM,7638
|
|
12
12
|
fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
13
|
fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
|
|
14
|
-
fraudcrawler/scraping/search.py,sha256=
|
|
14
|
+
fraudcrawler/scraping/search.py,sha256=JQ4nbylYdAk65yDDAatv-qGekRRRNy769VHQgzhqN8Y,24962
|
|
15
15
|
fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
|
|
16
16
|
fraudcrawler/scraping/zyte.py,sha256=GqvVWA1AWVoClAwd-hQ9iynsT0dOb7R0ntaLK5XVivM,8340
|
|
17
17
|
fraudcrawler/settings.py,sha256=uwXMOQpuwyWkuMU0asYGtBlL_qJj8F-Xkg4dUaCmDxE,3670
|
|
18
|
-
fraudcrawler-0.5.
|
|
19
|
-
fraudcrawler-0.5.
|
|
20
|
-
fraudcrawler-0.5.
|
|
21
|
-
fraudcrawler-0.5.
|
|
22
|
-
fraudcrawler-0.5.
|
|
18
|
+
fraudcrawler-0.5.7.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
|
|
19
|
+
fraudcrawler-0.5.7.dist-info/METADATA,sha256=tMdND63UPo5x2s49o_RMzQzqTSEBdrsv1TqQPL65DaM,6642
|
|
20
|
+
fraudcrawler-0.5.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
21
|
+
fraudcrawler-0.5.7.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
|
|
22
|
+
fraudcrawler-0.5.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|