fraudcrawler 0.3.8__tar.gz → 0.3.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic. Click here for more details.
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/PKG-INFO +1 -1
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/scraping/serp.py +28 -16
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/settings.py +3 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/pyproject.toml +1 -1
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/LICENSE +0 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/README.md +0 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/__init__.py +0 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/base/base.py +0 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/base/client.py +0 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/base/orchestrator.py +0 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/launch_demo_pipeline.py +0 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/processing/processor.py +0 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/scraping/enrich.py +0 -0
- {fraudcrawler-0.3.8 → fraudcrawler-0.3.9}/fraudcrawler/scraping/zyte.py +0 -0
|
@@ -4,7 +4,7 @@ from pydantic import BaseModel
|
|
|
4
4
|
from typing import List
|
|
5
5
|
from urllib.parse import urlparse
|
|
6
6
|
|
|
7
|
-
from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY
|
|
7
|
+
from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
|
|
8
8
|
from fraudcrawler.base.base import Host, Language, Location, AsyncClient
|
|
9
9
|
import re
|
|
10
10
|
|
|
@@ -148,17 +148,32 @@ class SerpApi(AsyncClient):
|
|
|
148
148
|
return urls
|
|
149
149
|
|
|
150
150
|
@staticmethod
|
|
151
|
-
def
|
|
152
|
-
"""Determines whether
|
|
151
|
+
def _relevant_country_code(url: str, country_code: str) -> bool:
|
|
152
|
+
"""Determines whether the url shows relevant country codes.
|
|
153
153
|
|
|
154
154
|
Args:
|
|
155
155
|
url: The URL to investigate.
|
|
156
156
|
country_code: The country code used to filter the products.
|
|
157
157
|
"""
|
|
158
|
-
|
|
158
|
+
url = url.lower()
|
|
159
|
+
country_code_relevance = f".{country_code}" in url
|
|
160
|
+
default_relevance = any(cc in url for cc in SERP_DEFAULT_COUNTRY_CODES)
|
|
161
|
+
return country_code_relevance or default_relevance
|
|
159
162
|
|
|
160
163
|
@staticmethod
|
|
161
|
-
def
|
|
164
|
+
def _domain_in_host(domain: str, host: Host) -> bool:
|
|
165
|
+
"""Checks if the domain is present in the host.
|
|
166
|
+
|
|
167
|
+
Args:
|
|
168
|
+
domain: The domain to check.
|
|
169
|
+
host: The host to check against.
|
|
170
|
+
"""
|
|
171
|
+
return any(
|
|
172
|
+
domain == hst_dom or domain.endswith(f".{hst_dom}")
|
|
173
|
+
for hst_dom in host.domains
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
def _domain_in_hosts(self, domain: str, hosts: List[Host]) -> bool:
|
|
162
177
|
"""Checks if the domain is present in the list of hosts.
|
|
163
178
|
|
|
164
179
|
Note:
|
|
@@ -171,10 +186,7 @@ class SerpApi(AsyncClient):
|
|
|
171
186
|
domain: The domain to check.
|
|
172
187
|
hosts: The list of hosts to check against.
|
|
173
188
|
"""
|
|
174
|
-
|
|
175
|
-
if domain == hst_dom or domain.endswith(f".{hst_dom}"):
|
|
176
|
-
return True
|
|
177
|
-
return False
|
|
189
|
+
return any(self._domain_in_host(domain=domain, host=hst) for hst in hosts)
|
|
178
190
|
|
|
179
191
|
def _is_excluded_url(self, domain: str, excluded_urls: List[Host]) -> bool:
|
|
180
192
|
"""Checks if the domain is in the excluded URLs.
|
|
@@ -183,7 +195,7 @@ class SerpApi(AsyncClient):
|
|
|
183
195
|
domain: The domain to check.
|
|
184
196
|
excluded_urls: The list of excluded URLs.
|
|
185
197
|
"""
|
|
186
|
-
return self.
|
|
198
|
+
return self._domain_in_hosts(domain=domain, hosts=excluded_urls)
|
|
187
199
|
|
|
188
200
|
def _apply_filters(
|
|
189
201
|
self,
|
|
@@ -203,13 +215,11 @@ class SerpApi(AsyncClient):
|
|
|
203
215
|
domain = result.domain
|
|
204
216
|
# Check if the URL is in the marketplaces (if yes, keep the result un-touched)
|
|
205
217
|
if marketplaces:
|
|
206
|
-
if self.
|
|
218
|
+
if self._domain_in_hosts(domain=domain, hosts=marketplaces):
|
|
207
219
|
return result
|
|
208
220
|
|
|
209
|
-
# Check if the URL has
|
|
210
|
-
if not self.
|
|
211
|
-
url=result.url, country_code=location.code
|
|
212
|
-
):
|
|
221
|
+
# Check if the URL has a relevant country_code
|
|
222
|
+
if not self._relevant_country_code(url=result.url, country_code=location.code):
|
|
213
223
|
result.filtered = True
|
|
214
224
|
result.filtered_at_stage = "SerpAPI (country code filtering)"
|
|
215
225
|
return result
|
|
@@ -244,7 +254,9 @@ class SerpApi(AsyncClient):
|
|
|
244
254
|
if marketplaces:
|
|
245
255
|
try:
|
|
246
256
|
marketplace_name = next(
|
|
247
|
-
mp.name
|
|
257
|
+
mp.name
|
|
258
|
+
for mp in marketplaces
|
|
259
|
+
if self._domain_in_host(domain=domain, host=mp)
|
|
248
260
|
)
|
|
249
261
|
except StopIteration:
|
|
250
262
|
logger.warning(f'Failed to find marketplace for domain="{domain}".')
|
|
@@ -8,6 +8,9 @@ ROOT_DIR = Path(__file__).parents[1]
|
|
|
8
8
|
# Serp settings
|
|
9
9
|
GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
|
|
10
10
|
GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
|
|
11
|
+
SERP_DEFAULT_COUNTRY_CODES = [
|
|
12
|
+
".com",
|
|
13
|
+
]
|
|
11
14
|
|
|
12
15
|
# Enrichment settings
|
|
13
16
|
ENRICHMENT_DEFAULT_LIMIT = 10
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|