fraudcrawler-0.3.5-py3-none-any.whl → fraudcrawler-0.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fraudcrawler might be problematic. (The link to further details was not preserved in this text export.)

@@ -196,6 +196,27 @@ class SerpApi(AsyncClient):
196
196
  filtered_at_stage=filtered_at_stage,
197
197
  )
198
198
 
199
+ @staticmethod
200
+ def _is_excluded(domain: str, excluded_urls: List[Host]) -> bool:
201
+ """Checks if the domain is in the excluded URLs.
202
+
203
+ Note:
204
+ By checking `if dom == excl or dom.endswith(f".{excl}")` we also
205
+ check for subdomains. For example, if the domain is
206
+ `link.springer.com` and the excluded URL is `springer.com`,
207
+ it will be excluded.
208
+
209
+ Args:
210
+ domain: The domain to check.
211
+ excluded_urls: The list of excluded URLs.
212
+ """
213
+ dom = domain.lower()
214
+ excl_doms = [dom.lower() for excl in excluded_urls for dom in excl.domains]
215
+ for excl in excl_doms:
216
+ if dom == excl or dom.endswith(f".{excl}"):
217
+ return True
218
+ return False
219
+
199
220
  async def apply(
200
221
  self,
201
222
  search_term: str,
@@ -242,8 +263,11 @@ class SerpApi(AsyncClient):
242
263
 
243
264
  # Filter out the excluded URLs
244
265
  if excluded_urls:
245
- excluded = [dom for excl in excluded_urls for dom in excl.domains]
246
- results = [res for res in results if res.domain not in excluded]
266
+ results = [
267
+ res
268
+ for res in results
269
+ if not self._is_excluded(res.domain, excluded_urls)
270
+ ]
247
271
 
248
272
  logger.info(
249
273
  f'Produced {len(results)} results from SerpApi search with q="{search_string}".'
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: fraudcrawler
3
- Version: 0.3.5
3
+ Version: 0.3.7
4
4
  Summary: Intelligent Market Monitoring
5
- Home-page: https://github/open-veanu/fraudcrawler
5
+ Home-page: https://github.com/open-veanu/fraudcrawler
6
6
  License: MIT
7
7
  Author: Domingo Bertus
8
8
  Author-email: hello@veanu.ch
@@ -17,7 +17,7 @@ Requires-Dist: openai (>=1.68.2,<2.0.0)
17
17
  Requires-Dist: pandas (>=2.2.3,<3.0.0)
18
18
  Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
19
19
  Requires-Dist: requests (>=2.32.3,<3.0.0)
20
- Project-URL: Repository, https://github/open-veanu/fraudcrawler
20
+ Project-URL: Repository, https://github.com/open-veanu/fraudcrawler
21
21
  Description-Content-Type: text/markdown
22
22
 
23
23
  # open-veanu/fraudcrawler
@@ -10,11 +10,11 @@ fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
10
10
  fraudcrawler/processing/processor.py,sha256=IFVKIiNi0QoCAgPFkFtNDgxfhh01iDNUyIBZWACplR8,3993
11
11
  fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  fraudcrawler/scraping/enrich.py,sha256=X1BBZshdZqPmbduzhGwH0ULSzq03L_7bf7_UL8yOQ9E,10608
13
- fraudcrawler/scraping/serp.py,sha256=wT8vhk0EugcrS2CCvMuCCZrlw1MRI-ahtGYKdNUZQo8,8830
13
+ fraudcrawler/scraping/serp.py,sha256=ksN_GgneLLIW0YwAMbBVbWdFD4FvdGnR5Lq_snykLDA,9613
14
14
  fraudcrawler/scraping/zyte.py,sha256=ggI4iYG-E_UyiKgUpEFekeUd1giifEfJ_uyFUSJGSLY,6296
15
15
  fraudcrawler/settings.py,sha256=yAgGvZ9wAdaYbN5c0SBZoTUkjjLOyU2je1109qcbTzQ,723
16
- fraudcrawler-0.3.5.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
17
- fraudcrawler-0.3.5.dist-info/METADATA,sha256=z0wINs19mCGOFPrXlb4FRSivMQpWG5zTgFmCXu6pIE8,5957
18
- fraudcrawler-0.3.5.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
19
- fraudcrawler-0.3.5.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
20
- fraudcrawler-0.3.5.dist-info/RECORD,,
16
+ fraudcrawler-0.3.7.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
17
+ fraudcrawler-0.3.7.dist-info/METADATA,sha256=po2owcDHfVOmYDr2GnINKbnVlWBrzY77tbB7K_btIFU,5965
18
+ fraudcrawler-0.3.7.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
19
+ fraudcrawler-0.3.7.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
20
+ fraudcrawler-0.3.7.dist-info/RECORD,,