fraudcrawler-0.5.7-py3-none-any.whl → fraudcrawler-0.5.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic. Click here for more details.
- fraudcrawler/base/base.py +8 -12
- fraudcrawler/base/orchestrator.py +6 -4
- fraudcrawler/launch_demo_pipeline.py +1 -1
- fraudcrawler/scraping/search.py +24 -8
- {fraudcrawler-0.5.7.dist-info → fraudcrawler-0.5.9.dist-info}/METADATA +1 -1
- {fraudcrawler-0.5.7.dist-info → fraudcrawler-0.5.9.dist-info}/RECORD +9 -9
- {fraudcrawler-0.5.7.dist-info → fraudcrawler-0.5.9.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.5.7.dist-info → fraudcrawler-0.5.9.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.5.7.dist-info → fraudcrawler-0.5.9.dist-info}/entry_points.txt +0 -0
fraudcrawler/base/base.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
|
+
import base64
|
|
3
4
|
from pydantic import (
|
|
4
5
|
BaseModel,
|
|
5
6
|
Field,
|
|
@@ -9,7 +10,7 @@ from pydantic import (
|
|
|
9
10
|
from pydantic_settings import BaseSettings
|
|
10
11
|
from urllib.parse import urlparse
|
|
11
12
|
import re
|
|
12
|
-
from typing import Any, Dict, List, TYPE_CHECKING
|
|
13
|
+
from typing import Any, Dict, List
|
|
13
14
|
|
|
14
15
|
import httpx
|
|
15
16
|
|
|
@@ -23,9 +24,6 @@ from fraudcrawler.settings import (
|
|
|
23
24
|
DEFAULT_HTTPX_REDIRECTS,
|
|
24
25
|
)
|
|
25
26
|
|
|
26
|
-
if TYPE_CHECKING:
|
|
27
|
-
from fraudcrawler.scraping.zyte import ZyteAPI
|
|
28
|
-
|
|
29
27
|
logger = logging.getLogger(__name__)
|
|
30
28
|
|
|
31
29
|
# Load google locations and languages
|
|
@@ -245,33 +243,31 @@ class DomainUtils:
|
|
|
245
243
|
hostname = hostname[4:]
|
|
246
244
|
return hostname.lower()
|
|
247
245
|
|
|
248
|
-
async def _unblock_url(self, url: str, zyte_api:
|
|
246
|
+
async def _unblock_url(self, url: str, zyte_api: Any) -> bytes | None:
|
|
249
247
|
"""Attempts to unblock a URL using Zyte proxy mode when direct access fails.
|
|
250
|
-
|
|
248
|
+
|
|
251
249
|
This method is specifically designed to handle 403 Forbidden errors for domains
|
|
252
250
|
that may be blocking requests from certain IP ranges (like cloud providers).
|
|
253
|
-
|
|
251
|
+
|
|
254
252
|
Args:
|
|
255
253
|
url: The URL to fetch using Zyte proxy mode.
|
|
256
254
|
zyte_api: An instance of ZyteAPI to use for the request.
|
|
257
|
-
|
|
255
|
+
|
|
258
256
|
Returns:
|
|
259
257
|
The HTML content as bytes if successful, None if failed.
|
|
260
258
|
"""
|
|
261
259
|
try:
|
|
262
260
|
logger.info(f"Attempting to unblock URL using Zyte proxy: {url}")
|
|
263
261
|
details = await zyte_api.details(url)
|
|
264
|
-
|
|
262
|
+
|
|
265
263
|
if details and "httpResponseBody" in details:
|
|
266
|
-
# Decode the base64 content
|
|
267
|
-
import base64
|
|
268
264
|
html_content = base64.b64decode(details["httpResponseBody"])
|
|
269
265
|
logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
|
|
270
266
|
return html_content
|
|
271
267
|
else:
|
|
272
268
|
logger.warning(f"Zyte proxy request failed for URL: {url}")
|
|
273
269
|
return None
|
|
274
|
-
|
|
270
|
+
|
|
275
271
|
except Exception as e:
|
|
276
272
|
logger.error(f"Error unblocking URL with Zyte proxy: {url}, error: {e}")
|
|
277
273
|
return None
|
fraudcrawler/base/orchestrator.py
CHANGED
|
@@ -114,8 +114,13 @@ class Orchestrator(ABC):
|
|
|
114
114
|
self._owns_http_client = True
|
|
115
115
|
|
|
116
116
|
# Setup the clients
|
|
117
|
+
self._zyteapi = ZyteAPI(
|
|
118
|
+
http_client=self._http_client, api_key=self._zyteapi_key
|
|
119
|
+
)
|
|
117
120
|
self._search = Search(
|
|
118
|
-
http_client=self._http_client,
|
|
121
|
+
http_client=self._http_client,
|
|
122
|
+
serpapi_key=self._serpapi_key,
|
|
123
|
+
zyte_api=self._zyteapi,
|
|
119
124
|
)
|
|
120
125
|
self._enricher = Enricher(
|
|
121
126
|
http_client=self._http_client,
|
|
@@ -123,9 +128,6 @@ class Orchestrator(ABC):
|
|
|
123
128
|
pwd=self._dataforseo_pwd,
|
|
124
129
|
)
|
|
125
130
|
self._url_collector = URLCollector()
|
|
126
|
-
self._zyteapi = ZyteAPI(
|
|
127
|
-
http_client=self._http_client, api_key=self._zyteapi_key
|
|
128
|
-
)
|
|
129
131
|
self._processor = Processor(
|
|
130
132
|
http_client=self._http_client,
|
|
131
133
|
api_key=self._openaiapi_key,
|
fraudcrawler/scraping/search.py
CHANGED
|
@@ -131,6 +131,17 @@ class SerpAPI(SearchEngine):
|
|
|
131
131
|
search_string += " site:" + " OR site:".join(s for s in sites)
|
|
132
132
|
return search_string
|
|
133
133
|
|
|
134
|
+
@staticmethod
|
|
135
|
+
def _get_google_domain(location: Location) -> str:
|
|
136
|
+
"""Gets the Google domain for the given location if they do not use the default pattern google.tld"""
|
|
137
|
+
if location.name == "Brazil":
|
|
138
|
+
return "google.com.br"
|
|
139
|
+
elif location.name == "United Kingdom":
|
|
140
|
+
return "google.co.uk"
|
|
141
|
+
elif location.name == "Argentina":
|
|
142
|
+
return "google.com.ar"
|
|
143
|
+
return f"google.{location.code}"
|
|
144
|
+
|
|
134
145
|
async def _search(
|
|
135
146
|
self,
|
|
136
147
|
search_string: str,
|
|
@@ -169,16 +180,19 @@ class SerpAPI(SearchEngine):
|
|
|
169
180
|
f"num_results={num_results}."
|
|
170
181
|
)
|
|
171
182
|
|
|
172
|
-
#
|
|
183
|
+
# Get Google domain and country code
|
|
184
|
+
google_domain = self._get_google_domain(location)
|
|
185
|
+
country_code = location.code
|
|
186
|
+
|
|
173
187
|
params: Dict[str, str | int] = {
|
|
174
188
|
"engine": engine,
|
|
175
189
|
"q": search_string,
|
|
176
|
-
"google_domain":
|
|
190
|
+
"google_domain": google_domain,
|
|
177
191
|
"location_requested": location.name,
|
|
178
192
|
"location_used": location.name,
|
|
179
|
-
"tbs": f"ctr:{
|
|
180
|
-
"cr": f"country{
|
|
181
|
-
"gl":
|
|
193
|
+
"tbs": f"ctr:{country_code.upper()}",
|
|
194
|
+
"cr": f"country{country_code.upper()}",
|
|
195
|
+
"gl": country_code,
|
|
182
196
|
"hl": language.code,
|
|
183
197
|
"num": num_results,
|
|
184
198
|
"api_key": self._api_key,
|
|
@@ -450,7 +464,7 @@ class Toppreise(SearchEngine):
|
|
|
450
464
|
retry.before_sleep = lambda retry_state: self._log_before_sleep(
|
|
451
465
|
search_string=search_string, retry_state=retry_state
|
|
452
466
|
)
|
|
453
|
-
|
|
467
|
+
|
|
454
468
|
content = None
|
|
455
469
|
try:
|
|
456
470
|
async for attempt in retry:
|
|
@@ -463,7 +477,9 @@ class Toppreise(SearchEngine):
|
|
|
463
477
|
content = response.content
|
|
464
478
|
except httpx.HTTPStatusError as e:
|
|
465
479
|
if e.response.status_code == 403 and self._zyte_api:
|
|
466
|
-
logger.warning(
|
|
480
|
+
logger.warning(
|
|
481
|
+
f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
|
|
482
|
+
)
|
|
467
483
|
content = await self._unblock_url(url, self._zyte_api)
|
|
468
484
|
if content is None:
|
|
469
485
|
raise e # Re-raise if zyte fallback also failed
|
|
@@ -471,7 +487,7 @@ class Toppreise(SearchEngine):
|
|
|
471
487
|
raise e
|
|
472
488
|
|
|
473
489
|
if content is None:
|
|
474
|
-
raise httpx.
|
|
490
|
+
raise httpx.HTTPError("Failed to fetch content")
|
|
475
491
|
|
|
476
492
|
# Get external product urls from the content
|
|
477
493
|
urls = self._get_external_product_urls(content=content)
|
{fraudcrawler-0.5.7.dist-info → fraudcrawler-0.5.9.dist-info}/RECORD
CHANGED
|
@@ -1,22 +1,22 @@
|
|
|
1
1
|
fraudcrawler/__init__.py,sha256=Kr19jWhtbC1shVoB9fHvBSeoG1IyQB9re1kCZ4YIAi0,842
|
|
2
2
|
fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
fraudcrawler/base/base.py,sha256=
|
|
3
|
+
fraudcrawler/base/base.py,sha256=S5VP1PD2tymZMmVcVmP4FI0VlCQxDCB2gp0vVD6eN8g,8543
|
|
4
4
|
fraudcrawler/base/client.py,sha256=yhkNrhL2SuJXTknLf-8P81fv01FnFMahREZgem-Z-f0,5832
|
|
5
5
|
fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
|
|
6
6
|
fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
|
|
7
|
-
fraudcrawler/base/orchestrator.py,sha256=
|
|
7
|
+
fraudcrawler/base/orchestrator.py,sha256=28X45XLPlJe2hvff8HTLo-V08LNeS0zMWBHe5W3hk4c,27039
|
|
8
8
|
fraudcrawler/base/retry.py,sha256=9VyVrbYR_0YnfxFhUrvcM3aWCYR6oR4iZE4A3zzVZUs,1015
|
|
9
|
-
fraudcrawler/launch_demo_pipeline.py,sha256=
|
|
9
|
+
fraudcrawler/launch_demo_pipeline.py,sha256=hTzGFQDEwchDSwUx0HgG_TW5h9J7BXM7jn_iB8iI838,4636
|
|
10
10
|
fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
11
|
fraudcrawler/processing/processor.py,sha256=Qq8QcTlqfnzFi1t-1KkncXxaIszUO7pGK3LXTdHkDnM,7638
|
|
12
12
|
fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
13
|
fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
|
|
14
|
-
fraudcrawler/scraping/search.py,sha256=
|
|
14
|
+
fraudcrawler/scraping/search.py,sha256=5fTRo7Tkkz9REU-u50oZu4kbynjIcIigp9HZ0cS_UCg,25510
|
|
15
15
|
fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
|
|
16
16
|
fraudcrawler/scraping/zyte.py,sha256=GqvVWA1AWVoClAwd-hQ9iynsT0dOb7R0ntaLK5XVivM,8340
|
|
17
17
|
fraudcrawler/settings.py,sha256=uwXMOQpuwyWkuMU0asYGtBlL_qJj8F-Xkg4dUaCmDxE,3670
|
|
18
|
-
fraudcrawler-0.5.
|
|
19
|
-
fraudcrawler-0.5.
|
|
20
|
-
fraudcrawler-0.5.
|
|
21
|
-
fraudcrawler-0.5.
|
|
22
|
-
fraudcrawler-0.5.
|
|
18
|
+
fraudcrawler-0.5.9.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
|
|
19
|
+
fraudcrawler-0.5.9.dist-info/METADATA,sha256=sMszTwcTVxuMGfRhJJZKLAPShnY4zDU6gShAa6k3tPg,6642
|
|
20
|
+
fraudcrawler-0.5.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
21
|
+
fraudcrawler-0.5.9.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
|
|
22
|
+
fraudcrawler-0.5.9.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|