fraudcrawler 0.3.7__tar.gz → 0.3.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic. Click here for more details.
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/PKG-INFO +1 -1
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/base/base.py +2 -2
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/scraping/serp.py +89 -45
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/pyproject.toml +1 -1
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/LICENSE +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/README.md +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/__init__.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/base/client.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/base/orchestrator.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/launch_demo_pipeline.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/processing/processor.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/scraping/enrich.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/scraping/zyte.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.8}/fraudcrawler/settings.py +0 -0
|
@@ -51,8 +51,8 @@ class Host(BaseModel):
|
|
|
51
51
|
@field_validator("domains", mode="before")
|
|
52
52
|
def split_domains_if_str(cls, val):
|
|
53
53
|
if isinstance(val, str):
|
|
54
|
-
|
|
55
|
-
return val
|
|
54
|
+
val = val.split(",")
|
|
55
|
+
return [dom.strip().lower() for dom in val]
|
|
56
56
|
|
|
57
57
|
|
|
58
58
|
class Location(BaseModel):
|
|
@@ -66,12 +66,12 @@ class SerpApi(AsyncClient):
|
|
|
66
66
|
logger.warning(
|
|
67
67
|
f'Failed to extract domain from url="{url}"; full url is returned'
|
|
68
68
|
)
|
|
69
|
-
return url
|
|
69
|
+
return url.lower()
|
|
70
70
|
|
|
71
71
|
# Remove www. prefix
|
|
72
72
|
if hostname and hostname.startswith("www."):
|
|
73
73
|
hostname = hostname[4:]
|
|
74
|
-
return hostname
|
|
74
|
+
return hostname.lower()
|
|
75
75
|
|
|
76
76
|
async def _search(
|
|
77
77
|
self,
|
|
@@ -148,7 +148,7 @@ class SerpApi(AsyncClient):
|
|
|
148
148
|
return urls
|
|
149
149
|
|
|
150
150
|
@staticmethod
|
|
151
|
-
def _keep_url(url: str, country_code: str) -> bool:
|
|
151
|
+
def _has_included_country_code(url: str, country_code: str) -> bool:
|
|
152
152
|
"""Determines whether to keep the url based on the country_code.
|
|
153
153
|
|
|
154
154
|
Args:
|
|
@@ -157,11 +157,77 @@ class SerpApi(AsyncClient):
|
|
|
157
157
|
"""
|
|
158
158
|
return f".{country_code}" in url.lower() or ".com" in url.lower()
|
|
159
159
|
|
|
160
|
+
@staticmethod
|
|
161
|
+
def _domain_is_present(domain: str, hosts: List[Host]) -> bool:
|
|
162
|
+
"""Checks if the domain is present in the list of hosts.
|
|
163
|
+
|
|
164
|
+
Note:
|
|
165
|
+
By checking `if domain == hst_dom or domain.endswith(f".{hst_dom}")`
|
|
166
|
+
it also checks for subdomains. For example, if the domain is
|
|
167
|
+
`link.springer.com` and the host domain is `springer.com`,
|
|
168
|
+
it will be detected as being present in the hosts.
|
|
169
|
+
|
|
170
|
+
Args:
|
|
171
|
+
domain: The domain to check.
|
|
172
|
+
hosts: The list of hosts to check against.
|
|
173
|
+
"""
|
|
174
|
+
for hst_dom in [dom for hst in hosts for dom in hst.domains]:
|
|
175
|
+
if domain == hst_dom or domain.endswith(f".{hst_dom}"):
|
|
176
|
+
return True
|
|
177
|
+
return False
|
|
178
|
+
|
|
179
|
+
def _is_excluded_url(self, domain: str, excluded_urls: List[Host]) -> bool:
|
|
180
|
+
"""Checks if the domain is in the excluded URLs.
|
|
181
|
+
|
|
182
|
+
Args:
|
|
183
|
+
domain: The domain to check.
|
|
184
|
+
excluded_urls: The list of excluded URLs.
|
|
185
|
+
"""
|
|
186
|
+
return self._domain_is_present(domain=domain, hosts=excluded_urls)
|
|
187
|
+
|
|
188
|
+
def _apply_filters(
|
|
189
|
+
self,
|
|
190
|
+
result: SerpResult,
|
|
191
|
+
location: Location,
|
|
192
|
+
marketplaces: List[Host] | None = None,
|
|
193
|
+
excluded_urls: List[Host] | None = None,
|
|
194
|
+
) -> SerpResult:
|
|
195
|
+
"""Checks for filters and updates the SerpResult accordingly.
|
|
196
|
+
|
|
197
|
+
Args:
|
|
198
|
+
result: The SerpResult object to check.
|
|
199
|
+
location: The location to use for the query.
|
|
200
|
+
marketplaces: The list of marketplaces to compare the URL against.
|
|
201
|
+
excluded_urls: The list of excluded URLs.
|
|
202
|
+
"""
|
|
203
|
+
domain = result.domain
|
|
204
|
+
# Check if the URL is in the marketplaces (if yes, keep the result un-touched)
|
|
205
|
+
if marketplaces:
|
|
206
|
+
if self._domain_is_present(domain=domain, hosts=marketplaces):
|
|
207
|
+
return result
|
|
208
|
+
|
|
209
|
+
# Check if the URL has the included country code
|
|
210
|
+
if not self._has_included_country_code(
|
|
211
|
+
url=result.url, country_code=location.code
|
|
212
|
+
):
|
|
213
|
+
result.filtered = True
|
|
214
|
+
result.filtered_at_stage = "SerpAPI (country code filtering)"
|
|
215
|
+
return result
|
|
216
|
+
|
|
217
|
+
# Check if the URL is in the excluded URLs
|
|
218
|
+
if excluded_urls and self._is_excluded_url(result.domain, excluded_urls):
|
|
219
|
+
result.filtered = True
|
|
220
|
+
result.filtered_at_stage = "SerpAPI (excluded URLs filtering)"
|
|
221
|
+
return result
|
|
222
|
+
|
|
223
|
+
return result
|
|
224
|
+
|
|
160
225
|
def _create_serp_result(
|
|
161
226
|
self,
|
|
162
227
|
url: str,
|
|
163
228
|
location: Location,
|
|
164
|
-
marketplaces: List[Host] | None,
|
|
229
|
+
marketplaces: List[Host] | None = None,
|
|
230
|
+
excluded_urls: List[Host] | None = None,
|
|
165
231
|
) -> SerpResult:
|
|
166
232
|
"""From a given url it creates the class:`SerpResult` instance.
|
|
167
233
|
|
|
@@ -172,50 +238,32 @@ class SerpApi(AsyncClient):
|
|
|
172
238
|
location: The location to use for the query.
|
|
173
239
|
marketplaces: The list of marketplaces to compare the URL against.
|
|
174
240
|
"""
|
|
175
|
-
# Filter for county code
|
|
176
|
-
filtered = not self._keep_url(url=url, country_code=location.code)
|
|
177
|
-
filtered_at_stage = "country code filtering" if filtered else None
|
|
178
|
-
|
|
179
241
|
# Get marketplace name
|
|
180
242
|
domain = self._get_domain(url=url)
|
|
181
243
|
marketplace_name = self._default_marketplace_name
|
|
182
|
-
if
|
|
244
|
+
if marketplaces:
|
|
183
245
|
try:
|
|
184
246
|
marketplace_name = next(
|
|
185
|
-
mp.name
|
|
186
|
-
for mp in marketplaces
|
|
187
|
-
if domain.lower() in [d.lower() for d in mp.domains]
|
|
247
|
+
mp.name for mp in marketplaces if domain in [d for d in mp.domains]
|
|
188
248
|
)
|
|
189
249
|
except StopIteration:
|
|
190
250
|
logger.warning(f'Failed to find marketplace for domain="{domain}".')
|
|
191
|
-
|
|
251
|
+
|
|
252
|
+
# Create the SerpResult object
|
|
253
|
+
result = SerpResult(
|
|
192
254
|
url=url,
|
|
193
255
|
domain=domain,
|
|
194
256
|
marketplace_name=marketplace_name,
|
|
195
|
-
filtered=filtered,
|
|
196
|
-
filtered_at_stage=filtered_at_stage,
|
|
197
257
|
)
|
|
198
258
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
it will be excluded.
|
|
208
|
-
|
|
209
|
-
Args:
|
|
210
|
-
domain: The domain to check.
|
|
211
|
-
excluded_urls: The list of excluded URLs.
|
|
212
|
-
"""
|
|
213
|
-
dom = domain.lower()
|
|
214
|
-
excl_doms = [dom.lower() for excl in excluded_urls for dom in excl.domains]
|
|
215
|
-
for excl in excl_doms:
|
|
216
|
-
if dom == excl or dom.endswith(f".{excl}"):
|
|
217
|
-
return True
|
|
218
|
-
return False
|
|
259
|
+
# Apply filters
|
|
260
|
+
result = self._apply_filters(
|
|
261
|
+
result=result,
|
|
262
|
+
location=location,
|
|
263
|
+
marketplaces=marketplaces,
|
|
264
|
+
excluded_urls=excluded_urls,
|
|
265
|
+
)
|
|
266
|
+
return result
|
|
219
267
|
|
|
220
268
|
async def apply(
|
|
221
269
|
self,
|
|
@@ -256,20 +304,16 @@ class SerpApi(AsyncClient):
|
|
|
256
304
|
# Form the SerpResult objects
|
|
257
305
|
results = [
|
|
258
306
|
self._create_serp_result(
|
|
259
|
-
url=url,
|
|
307
|
+
url=url,
|
|
308
|
+
location=location,
|
|
309
|
+
marketplaces=marketplaces,
|
|
310
|
+
excluded_urls=excluded_urls,
|
|
260
311
|
)
|
|
261
312
|
for url in urls
|
|
262
313
|
]
|
|
263
314
|
|
|
264
|
-
|
|
265
|
-
if excluded_urls:
|
|
266
|
-
results = [
|
|
267
|
-
res
|
|
268
|
-
for res in results
|
|
269
|
-
if not self._is_excluded(res.domain, excluded_urls)
|
|
270
|
-
]
|
|
271
|
-
|
|
315
|
+
num_non_filtered = len([res for res in results if not res.filtered])
|
|
272
316
|
logger.info(
|
|
273
|
-
f'Produced {
|
|
317
|
+
f'Produced {num_non_filtered} results from SerpApi search with q="{search_string}".'
|
|
274
318
|
)
|
|
275
319
|
return results
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|