fraudcrawler 0.3.6__tar.gz → 0.3.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of fraudcrawler might be problematic.
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/PKG-INFO +1 -1
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/scraping/serp.py +26 -2
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/pyproject.toml +1 -1
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/LICENSE +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/README.md +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/__init__.py +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/base/base.py +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/base/client.py +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/base/orchestrator.py +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/launch_demo_pipeline.py +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/processing/processor.py +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/scraping/enrich.py +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/scraping/zyte.py +0 -0
- {fraudcrawler-0.3.6 → fraudcrawler-0.3.7}/fraudcrawler/settings.py +0 -0
fraudcrawler/scraping/serp.py

@@ -196,6 +196,27 @@ class SerpApi(AsyncClient):
                     filtered_at_stage=filtered_at_stage,
                 )
 
+    @staticmethod
+    def _is_excluded(domain: str, excluded_urls: List[Host]) -> bool:
+        """Checks if the domain is in the excluded URLs.
+
+        Note:
+            By checking `if dom == excl or dom.endswith(f".{excl}")` we also
+            check for subdomains. For example, if the domain is
+            `link.springer.com` and the excluded URL is `springer.com`,
+            it will be excluded.
+
+        Args:
+            domain: The domain to check.
+            excluded_urls: The list of excluded URLs.
+        """
+        dom = domain.lower()
+        excl_doms = [dom.lower() for excl in excluded_urls for dom in excl.domains]
+        for excl in excl_doms:
+            if dom == excl or dom.endswith(f".{excl}"):
+                return True
+        return False
+
     async def apply(
         self,
         search_term: str,
@@ -242,8 +263,11 @@ class SerpApi(AsyncClient):
 
         # Filter out the excluded URLs
         if excluded_urls:
-
-
+            results = [
+                res
+                for res in results
+                if not self._is_excluded(res.domain, excluded_urls)
+            ]
 
         logger.info(
             f'Produced {len(results)} results from SerpApi search with q="{search_string}".'
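The substantive change in 0.3.7 is this subdomain-aware exclusion check. Below is a minimal standalone sketch of the same logic that can be run outside the package; the `Host` dataclass here is a hypothetical stand-in, since the diff only shows that the real `Host` class exposes a `domains` list.

from dataclasses import dataclass
from typing import List


@dataclass
class Host:
    """Hypothetical stand-in: the diff only shows that Host exposes `.domains`."""
    domains: List[str]


def is_excluded(domain: str, excluded_urls: List[Host]) -> bool:
    """Return True if `domain` equals, or is a subdomain of, any excluded domain."""
    dom = domain.lower()
    excl_doms = [d.lower() for excl in excluded_urls for d in excl.domains]
    # `endswith(f".{excl}")` catches subdomains: "link.springer.com" ends
    # with ".springer.com", so it is excluded by "springer.com".
    return any(dom == excl or dom.endswith(f".{excl}") for excl in excl_doms)


excluded = [Host(domains=["springer.com"])]
assert is_excluded("link.springer.com", excluded)    # subdomain is excluded
assert is_excluded("SPRINGER.COM", excluded)         # comparison is case-insensitive
assert not is_excluded("notspringer.com", excluded)  # a bare suffix match is not enough

Matching with `dom == excl or dom.endswith(f".{excl}")` rather than a plain substring or suffix test is what keeps look-alike domains such as `notspringer.com` from being filtered out along with genuine subdomains.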