fraudcrawler 0.3.7__tar.gz → 0.3.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of fraudcrawler might be problematic.
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/PKG-INFO +1 -1
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/base.py +2 -2
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/scraping/serp.py +102 -46
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/settings.py +3 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/pyproject.toml +1 -1
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/LICENSE +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/README.md +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/__init__.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/client.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/orchestrator.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/launch_demo_pipeline.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/processing/processor.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/scraping/enrich.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/scraping/zyte.py +0 -0
fraudcrawler/base/base.py:

```diff
@@ -51,8 +51,8 @@ class Host(BaseModel):
     @field_validator("domains", mode="before")
     def split_domains_if_str(cls, val):
         if isinstance(val, str):
-
-            return val
+            val = val.split(",")
+            return [dom.strip().lower() for dom in val]
 
 
 class Location(BaseModel):
```
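Taken together, the two new lines make the string form of `domains` both split and normalized. A minimal sketch of the new behavior (the `Host` stand-in below keeps only the `domains` field, and the trailing `return val` for non-string input is an assumption, since the removed lines are truncated in this diff):

```python
from typing import List

from pydantic import BaseModel, field_validator


class Host(BaseModel):
    # Minimal stand-in for fraudcrawler's Host model; other fields omitted.
    domains: List[str]

    @field_validator("domains", mode="before")
    def split_domains_if_str(cls, val):
        if isinstance(val, str):
            # New in 0.3.9: split, then strip and lowercase each entry.
            val = val.split(",")
            return [dom.strip().lower() for dom in val]
        return val  # assumed pass-through for list input


print(Host(domains="Amazon.COM, shop.Example.ch ").domains)
# ['amazon.com', 'shop.example.ch']
```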
fraudcrawler/scraping/serp.py:

```diff
@@ -4,7 +4,7 @@ from pydantic import BaseModel
 from typing import List
 from urllib.parse import urlparse
 
-from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY
+from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
 from fraudcrawler.base.base import Host, Language, Location, AsyncClient
 import re
 
```
```diff
@@ -66,12 +66,12 @@ class SerpApi(AsyncClient):
             logger.warning(
                 f'Failed to extract domain from url="{url}"; full url is returned'
             )
-            return url
+            return url.lower()
 
         # Remove www. prefix
         if hostname and hostname.startswith("www."):
             hostname = hostname[4:]
-        return hostname
+        return hostname.lower()
 
     async def _search(
         self,
```
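The net effect of this hunk is that `_get_domain` now returns a lowercased value on both paths: the fallback that returns the full URL and the normal path that returns the hostname. A standalone sketch inferred from the context lines (the function name and the body outside the hunk are assumptions):

```python
from urllib.parse import urlparse


def get_domain(url: str) -> str:
    # Sketch of SerpApi._get_domain after 0.3.9; error handling simplified.
    hostname = urlparse(url).hostname
    if hostname is None:
        return url.lower()  # fallback: the full URL, now lowercased

    # Remove www. prefix
    if hostname and hostname.startswith("www."):
        hostname = hostname[4:]
    # urlparse already lowercases hostname; the explicit .lower() mirrors the diff.
    return hostname.lower()


print(get_domain("https://WWW.Example.CH/product/123"))  # example.ch
```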
```diff
@@ -148,20 +148,96 @@ class SerpApi(AsyncClient):
         return urls
 
     @staticmethod
-    def
-        """Determines whether
+    def _relevant_country_code(url: str, country_code: str) -> bool:
+        """Determines whether the url shows relevant country codes.
 
         Args:
             url: The URL to investigate.
             country_code: The country code used to filter the products.
         """
-
+        url = url.lower()
+        country_code_relevance = f".{country_code}" in url
+        default_relevance = any(cc in url for cc in SERP_DEFAULT_COUNTRY_CODES)
+        return country_code_relevance or default_relevance
+
+    @staticmethod
+    def _domain_in_host(domain: str, host: Host) -> bool:
+        """Checks if the domain is present in the host.
+
+        Args:
+            domain: The domain to check.
+            host: The host to check against.
+        """
+        return any(
+            domain == hst_dom or domain.endswith(f".{hst_dom}")
+            for hst_dom in host.domains
+        )
+
+    def _domain_in_hosts(self, domain: str, hosts: List[Host]) -> bool:
+        """Checks if the domain is present in the list of hosts.
+
+        Note:
+            By checking `if domain == hst_dom or domain.endswith(f".{hst_dom}")`
+            it also checks for subdomains. For example, if the domain is
+            `link.springer.com` and the host domain is `springer.com`,
+            it will be detected as being present in the hosts.
+
+        Args:
+            domain: The domain to check.
+            hosts: The list of hosts to check against.
+        """
+        return any(self._domain_in_host(domain=domain, host=hst) for hst in hosts)
+
+    def _is_excluded_url(self, domain: str, excluded_urls: List[Host]) -> bool:
+        """Checks if the domain is in the excluded URLs.
+
+        Args:
+            domain: The domain to check.
+            excluded_urls: The list of excluded URLs.
+        """
+        return self._domain_in_hosts(domain=domain, hosts=excluded_urls)
+
+    def _apply_filters(
+        self,
+        result: SerpResult,
+        location: Location,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
+    ) -> SerpResult:
+        """Checks for filters and updates the SerpResult accordingly.
+
+        Args:
+            result: The SerpResult object to check.
+            location: The location to use for the query.
+            marketplaces: The list of marketplaces to compare the URL against.
+            excluded_urls: The list of excluded URLs.
+        """
+        domain = result.domain
+        # Check if the URL is in the marketplaces (if yes, keep the result un-touched)
+        if marketplaces:
+            if self._domain_in_hosts(domain=domain, hosts=marketplaces):
+                return result
+
+        # Check if the URL has a relevant country_code
+        if not self._relevant_country_code(url=result.url, country_code=location.code):
+            result.filtered = True
+            result.filtered_at_stage = "SerpAPI (country code filtering)"
+            return result
+
+        # Check if the URL is in the excluded URLs
+        if excluded_urls and self._is_excluded_url(result.domain, excluded_urls):
+            result.filtered = True
+            result.filtered_at_stage = "SerpAPI (excluded URLs filtering)"
+            return result
+
+        return result
 
     def _create_serp_result(
         self,
         url: str,
         location: Location,
-        marketplaces: List[Host] | None,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
     ) -> SerpResult:
         """From a given url it creates the :class:`SerpResult` instance.
 
```
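The `Note:` in `_domain_in_hosts` is the heart of the new matching rule: a domain counts as present if it equals a host domain or is a dot-separated subdomain of one. The predicate can be exercised in isolation; this sketch simply lifts `_domain_in_host` out of the class:

```python
from typing import List


def domain_in_host(domain: str, host_domains: List[str]) -> bool:
    # Same predicate as the new SerpApi._domain_in_host: exact match,
    # or subdomain match via the ".<host domain>" suffix check.
    return any(
        domain == hst_dom or domain.endswith(f".{hst_dom}")
        for hst_dom in host_domains
    )


print(domain_in_host("link.springer.com", ["springer.com"]))  # True  (subdomain)
print(domain_in_host("springer.com", ["springer.com"]))       # True  (exact)
print(domain_in_host("notspringer.com", ["springer.com"]))    # False (suffix needs a dot)
```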
```diff
@@ -172,50 +248,34 @@ class SerpApi(AsyncClient):
             location: The location to use for the query.
             marketplaces: The list of marketplaces to compare the URL against.
         """
-        # Filter for county code
-        filtered = not self._keep_url(url=url, country_code=location.code)
-        filtered_at_stage = "country code filtering" if filtered else None
-
         # Get marketplace name
         domain = self._get_domain(url=url)
         marketplace_name = self._default_marketplace_name
-        if
+        if marketplaces:
             try:
                 marketplace_name = next(
                     mp.name
                     for mp in marketplaces
-                    if
+                    if self._domain_in_host(domain=domain, host=mp)
                 )
             except StopIteration:
                 logger.warning(f'Failed to find marketplace for domain="{domain}".')
-
+
+        # Create the SerpResult object
+        result = SerpResult(
             url=url,
             domain=domain,
             marketplace_name=marketplace_name,
-            filtered=filtered,
-            filtered_at_stage=filtered_at_stage,
         )
 
-
-
-
-
-
-
-
-
-        it will be excluded.
-
-        Args:
-            domain: The domain to check.
-            excluded_urls: The list of excluded URLs.
-        """
-        dom = domain.lower()
-        excl_doms = [dom.lower() for excl in excluded_urls for dom in excl.domains]
-        for excl in excl_doms:
-            if dom == excl or dom.endswith(f".{excl}"):
-                return True
-        return False
+        # Apply filters
+        result = self._apply_filters(
+            result=result,
+            location=location,
+            marketplaces=marketplaces,
+            excluded_urls=excluded_urls,
+        )
+        return result
 
     async def apply(
         self,
```
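One consequence of the `_apply_filters` ordering is that a marketplace hit short-circuits the other two checks: a URL whose domain matches a configured marketplace is never flagged, even if it would fail the country-code or exclusion filters. A plain-function distillation of that precedence (the function and flag names here are illustrative, not from the package):

```python
from typing import Optional


def filtered_stage(in_marketplaces: bool, country_ok: bool, excluded: bool) -> Optional[str]:
    # Precedence of the three checks in SerpApi._apply_filters (0.3.9).
    if in_marketplaces:
        return None  # marketplace hit: result kept untouched
    if not country_ok:
        return "SerpAPI (country code filtering)"
    if excluded:
        return "SerpAPI (excluded URLs filtering)"
    return None


print(filtered_stage(True, False, True))    # None -- marketplace wins
print(filtered_stage(False, False, False))  # SerpAPI (country code filtering)
print(filtered_stage(False, True, True))    # SerpAPI (excluded URLs filtering)
```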
```diff
@@ -256,20 +316,16 @@ class SerpApi(AsyncClient):
         # Form the SerpResult objects
         results = [
             self._create_serp_result(
-                url=url,
+                url=url,
+                location=location,
+                marketplaces=marketplaces,
+                excluded_urls=excluded_urls,
             )
             for url in urls
         ]
 
-
-        if excluded_urls:
-            results = [
-                res
-                for res in results
-                if not self._is_excluded(res.domain, excluded_urls)
-            ]
-
+        num_non_filtered = len([res for res in results if not res.filtered])
         logger.info(
-            f'Produced {
+            f'Produced {num_non_filtered} results from SerpApi search with q="{search_string}".'
         )
         return results
```
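Where 0.3.7 dropped excluded results from `apply` entirely, 0.3.9 keeps every `SerpResult` and marks the filtered ones, so the log line now counts survivors rather than the whole list. A rough sketch of the downstream effect, with a dataclass standing in for the real `SerpResult` model (whose full field list is not shown in this diff):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class SerpResult:
    # Stand-in carrying only the fields visible in the diff.
    url: str
    domain: str
    marketplace_name: str
    filtered: bool = False
    filtered_at_stage: Optional[str] = None


results = [
    SerpResult(url="https://shop.example.ch/p/1", domain="shop.example.ch",
               marketplace_name="default"),
    SerpResult(url="https://example.de/p/2", domain="example.de",
               marketplace_name="default", filtered=True,
               filtered_at_stage="SerpAPI (country code filtering)"),
]

# Mirrors the new logging in apply(): count survivors, keep everything.
num_non_filtered = len([res for res in results if not res.filtered])
print(num_non_filtered, len(results))  # 1 2
```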
fraudcrawler/settings.py:

```diff
@@ -8,6 +8,9 @@ ROOT_DIR = Path(__file__).parents[1]
 # Serp settings
 GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
 GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
+SERP_DEFAULT_COUNTRY_CODES = [
+    ".com",
+]
 
 # Enrichment settings
 ENRICHMENT_DEFAULT_LIMIT = 10
```
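With `SERP_DEFAULT_COUNTRY_CODES` containing only `".com"`, the country-code filter accepts a URL that contains either `.<location code>` or `.com` anywhere in the lowercased URL; note this is a substring test, not a TLD parse. A quick sketch mirroring the new method as a free function:

```python
SERP_DEFAULT_COUNTRY_CODES = [".com"]


def relevant_country_code(url: str, country_code: str) -> bool:
    # Mirrors the new SerpApi._relevant_country_code.
    url = url.lower()
    country_code_relevance = f".{country_code}" in url
    default_relevance = any(cc in url for cc in SERP_DEFAULT_COUNTRY_CODES)
    return country_code_relevance or default_relevance


print(relevant_country_code("https://shop.example.ch/p/1", "ch"))  # True
print(relevant_country_code("https://example.com/p/1", "ch"))      # True (via .com default)
print(relevant_country_code("https://example.de/p/1", "ch"))       # False
```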
All remaining files (listed above with +0 -0) are unchanged between 0.3.7 and 0.3.9.