fraudcrawler 0.5.8__tar.gz → 0.5.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of fraudcrawler has been flagged as potentially problematic.
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/PKG-INFO +1 -1
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/base/base.py +3 -8
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/launch_demo_pipeline.py +1 -1
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/scraping/search.py +19 -5
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/pyproject.toml +1 -1
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/LICENSE +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/README.md +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/__init__.py +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/base/client.py +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/base/orchestrator.py +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/base/retry.py +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/processing/processor.py +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/scraping/enrich.py +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/scraping/url.py +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/scraping/zyte.py +0 -0
- {fraudcrawler-0.5.8 → fraudcrawler-0.5.9}/fraudcrawler/settings.py +0 -0
fraudcrawler/base/base.py

@@ -1,5 +1,6 @@
 import json
 import logging
+import base64
 from pydantic import (
     BaseModel,
     Field,
@@ -9,7 +10,7 @@ from pydantic import (
 from pydantic_settings import BaseSettings
 from urllib.parse import urlparse
 import re
-from typing import Any, Dict, List, TYPE_CHECKING
+from typing import Any, Dict, List
 
 import httpx
 
@@ -23,9 +24,6 @@ from fraudcrawler.settings import (
     DEFAULT_HTTPX_REDIRECTS,
 )
 
-if TYPE_CHECKING:
-    from fraudcrawler.scraping.zyte import ZyteAPI
-
 logger = logging.getLogger(__name__)
 
 # Load google locations and languages
@@ -245,7 +243,7 @@ class DomainUtils:
         hostname = hostname[4:]
         return hostname.lower()
 
-    async def _unblock_url(self, url: str, zyte_api: "ZyteAPI") -> bytes | None:
+    async def _unblock_url(self, url: str, zyte_api: Any) -> bytes | None:
         """Attempts to unblock a URL using Zyte proxy mode when direct access fails.
 
         This method is specifically designed to handle 403 Forbidden errors for domains
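For context: an `if TYPE_CHECKING:` import is evaluated only by static type checkers, never at runtime, which is why 0.5.8 had to annotate the parameter with the string `"ZyteAPI"` (that annotation is reconstructed here from the guarded import above; the diff view truncates the removed line). A minimal sketch of the before/after trade-off:

```python
from typing import TYPE_CHECKING, Any

# 0.5.8 pattern: import the Zyte client only during static type checking and
# refer to it via a forward-reference string at runtime.
if TYPE_CHECKING:
    from fraudcrawler.scraping.zyte import ZyteAPI

async def unblock_url_v058(url: str, zyte_api: "ZyteAPI") -> bytes | None:
    ...

# 0.5.9 pattern: the guarded import is dropped and the parameter becomes Any,
# removing the base.py -> zyte.py import edge at the cost of static type
# safety on this one parameter.
async def unblock_url_v059(url: str, zyte_api: Any) -> bytes | None:
    ...
```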
@@ -263,9 +261,6 @@ class DomainUtils:
             details = await zyte_api.details(url)
 
             if details and "httpResponseBody" in details:
-                # Decode the base64 content
-                import base64
-
                 html_content = base64.b64decode(details["httpResponseBody"])
                 logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
                 return html_content
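Taken together, the base.py hunks hoist `import base64` to module scope instead of importing it inside the method on every unblock attempt. A minimal sketch of the resulting flow, assuming (as the hunk implies) that `zyte_api.details()` returns a dict whose `"httpResponseBody"` value is base64-encoded:

```python
import base64
import logging
from typing import Any

logger = logging.getLogger(__name__)

async def unblock_url(url: str, zyte_api: Any) -> bytes | None:
    """Fetch a blocked URL through a Zyte-style client and decode the body."""
    details = await zyte_api.details(url)
    if details and "httpResponseBody" in details:
        # base64 is now imported once at module level (0.5.9) rather than
        # inside this branch on every call (0.5.8).
        html_content = base64.b64decode(details["httpResponseBody"])
        logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
        return html_content
    return None
```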
fraudcrawler/scraping/search.py

@@ -131,6 +131,17 @@ class SerpAPI(SearchEngine):
         search_string += " site:" + " OR site:".join(s for s in sites)
         return search_string
 
+    @staticmethod
+    def _get_google_domain(location: Location) -> str:
+        """Gets the Google domain for the given location if they do not use the default pattern google.tld"""
+        if location.name == "Brazil":
+            return "google.com.br"
+        elif location.name == "United Kingdom":
+            return "google.co.uk"
+        elif location.name == "Argentina":
+            return "google.com.ar"
+        return f"google.{location.code}"
+
     async def _search(
         self,
         search_string: str,
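The new helper special-cases the few locations whose Google domain does not follow the default `google.<country code>` pattern. An equivalent table-driven sketch, using a pared-down, hypothetical `Location` stand-in for fraudcrawler's model:

```python
from dataclasses import dataclass

@dataclass
class Location:
    """Hypothetical stand-in exposing only the two fields the helper reads."""
    name: str
    code: str

# Locations whose Google domain deviates from the google.<code> default.
_DOMAIN_EXCEPTIONS = {
    "Brazil": "google.com.br",
    "United Kingdom": "google.co.uk",
    "Argentina": "google.com.ar",
}

def get_google_domain(location: Location) -> str:
    return _DOMAIN_EXCEPTIONS.get(location.name, f"google.{location.code}")

print(get_google_domain(Location("Switzerland", "ch")))     # -> google.ch
print(get_google_domain(Location("United Kingdom", "uk")))  # -> google.co.uk
```

A dict lookup keeps the exceptions in one place, so adding further non-default domains (Google runs several, e.g. google.co.jp) does not grow an elif chain.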
@@ -169,16 +180,19 @@ class SerpAPI(SearchEngine):
             f"num_results={num_results}."
         )
 
-        #
+        # Get Google domain and country code
+        google_domain = self._get_google_domain(location)
+        country_code = location.code
+
         params: Dict[str, str | int] = {
             "engine": engine,
             "q": search_string,
-            "google_domain": f"google.{location.code}",
+            "google_domain": google_domain,
             "location_requested": location.name,
             "location_used": location.name,
-            "tbs": f"ctr:{location.code.upper()}",
-            "cr": f"country{location.code.upper()}",
-            "gl": location.code,
+            "tbs": f"ctr:{country_code.upper()}",
+            "cr": f"country{country_code.upper()}",
+            "gl": country_code,
             "hl": language.code,
             "num": num_results,
             "api_key": self._api_key,
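For context, here is how the refactored parameters come together in a request. The parameter names mirror the diff; the endpoint URL and response handling are generic SerpAPI usage assumed for illustration, not code from fraudcrawler, and the sketch reuses `Location` and `get_google_domain` from above:

```python
import httpx

async def serpapi_search(api_key: str, query: str, location: Location,
                         language_code: str, num_results: int) -> dict:
    google_domain = get_google_domain(location)
    country_code = location.code
    params: dict[str, str | int] = {
        "engine": "google",
        "q": query,
        "google_domain": google_domain,
        "location_requested": location.name,
        "location_used": location.name,
        "tbs": f"ctr:{country_code.upper()}",    # country-restrict filter
        "cr": f"country{country_code.upper()}",  # country restriction
        "gl": country_code,                      # geolocation
        "hl": language_code,                     # interface language
        "num": num_results,
        "api_key": api_key,
    }
    async with httpx.AsyncClient() as client:
        resp = await client.get("https://serpapi.com/search", params=params)
        resp.raise_for_status()
        return resp.json()
```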