fraudcrawler 0.3.7__tar.gz → 0.3.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of fraudcrawler might be problematic.
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/PKG-INFO +1 -1
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/base.py +2 -2
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/scraping/serp.py +102 -46
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/settings.py +3 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/pyproject.toml +1 -1
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/LICENSE +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/README.md +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/__init__.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/client.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/orchestrator.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/launch_demo_pipeline.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/processing/processor.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/scraping/enrich.py +0 -0
- {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/scraping/zyte.py +0 -0
fraudcrawler/base/base.py:

```diff
@@ -51,8 +51,8 @@ class Host(BaseModel):
     @field_validator("domains", mode="before")
     def split_domains_if_str(cls, val):
         if isinstance(val, str):
-
-            return val
+            val = val.split(",")
+            return [dom.strip().lower() for dom in val]
 
 
 class Location(BaseModel):
```
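Taken together, the two new lines make the string form of `domains` both split and normalized. A minimal sketch of the new behavior (the `Host` stand-in below keeps only the `domains` field, and the trailing `return val` for non-string input is an assumption, since the removed lines are truncated in this diff):

```python
from typing import List

from pydantic import BaseModel, field_validator


class Host(BaseModel):
    # Minimal stand-in for fraudcrawler's Host model; other fields omitted.
    domains: List[str]

    @field_validator("domains", mode="before")
    def split_domains_if_str(cls, val):
        if isinstance(val, str):
            # New in 0.3.9: split, then strip and lowercase each entry.
            val = val.split(",")
            return [dom.strip().lower() for dom in val]
        return val  # assumed pass-through for list input


print(Host(domains="Amazon.COM, shop.Example.ch ").domains)
# ['amazon.com', 'shop.example.ch']
```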
fraudcrawler/scraping/serp.py:

```diff
@@ -4,7 +4,7 @@ from pydantic import BaseModel
 from typing import List
 from urllib.parse import urlparse
 
-from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY
+from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
 from fraudcrawler.base.base import Host, Language, Location, AsyncClient
 import re
 
```
```diff
@@ -66,12 +66,12 @@ class SerpApi(AsyncClient):
             logger.warning(
                 f'Failed to extract domain from url="{url}"; full url is returned'
             )
-            return url
+            return url.lower()
 
         # Remove www. prefix
         if hostname and hostname.startswith("www."):
             hostname = hostname[4:]
-        return hostname
+        return hostname.lower()
 
     async def _search(
         self,
```
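The net effect of this hunk is that `_get_domain` now returns a lowercased value on both paths: the fallback that returns the full URL and the normal path that returns the hostname. A standalone sketch inferred from the context lines (the function name and the body outside the hunk are assumptions):

```python
from urllib.parse import urlparse


def get_domain(url: str) -> str:
    # Sketch of SerpApi._get_domain after 0.3.9; error handling simplified.
    hostname = urlparse(url).hostname
    if hostname is None:
        return url.lower()  # fallback: the full URL, now lowercased

    # Remove www. prefix
    if hostname and hostname.startswith("www."):
        hostname = hostname[4:]
    # urlparse already lowercases hostname; the explicit .lower() mirrors the diff.
    return hostname.lower()


print(get_domain("https://WWW.Example.CH/product/123"))  # example.ch
```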
```diff
@@ -148,20 +148,96 @@ class SerpApi(AsyncClient):
         return urls
 
     @staticmethod
-    def
-        """Determines whether
+    def _relevant_country_code(url: str, country_code: str) -> bool:
+        """Determines whether the url shows relevant country codes.
 
         Args:
             url: The URL to investigate.
             country_code: The country code used to filter the products.
         """
-
+        url = url.lower()
+        country_code_relevance = f".{country_code}" in url
+        default_relevance = any(cc in url for cc in SERP_DEFAULT_COUNTRY_CODES)
+        return country_code_relevance or default_relevance
+
+    @staticmethod
+    def _domain_in_host(domain: str, host: Host) -> bool:
+        """Checks if the domain is present in the host.
+
+        Args:
+            domain: The domain to check.
+            host: The host to check against.
+        """
+        return any(
+            domain == hst_dom or domain.endswith(f".{hst_dom}")
+            for hst_dom in host.domains
+        )
+
+    def _domain_in_hosts(self, domain: str, hosts: List[Host]) -> bool:
+        """Checks if the domain is present in the list of hosts.
+
+        Note:
+            By checking `if domain == hst_dom or domain.endswith(f".{hst_dom}")`
+            it also checks for subdomains. For example, if the domain is
+            `link.springer.com` and the host domain is `springer.com`,
+            it will be detected as being present in the hosts.
+
+        Args:
+            domain: The domain to check.
+            hosts: The list of hosts to check against.
+        """
+        return any(self._domain_in_host(domain=domain, host=hst) for hst in hosts)
+
+    def _is_excluded_url(self, domain: str, excluded_urls: List[Host]) -> bool:
+        """Checks if the domain is in the excluded URLs.
+
+        Args:
+            domain: The domain to check.
+            excluded_urls: The list of excluded URLs.
+        """
+        return self._domain_in_hosts(domain=domain, hosts=excluded_urls)
+
+    def _apply_filters(
+        self,
+        result: SerpResult,
+        location: Location,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
+    ) -> SerpResult:
+        """Checks for filters and updates the SerpResult accordingly.
+
+        Args:
+            result: The SerpResult object to check.
+            location: The location to use for the query.
+            marketplaces: The list of marketplaces to compare the URL against.
+            excluded_urls: The list of excluded URLs.
+        """
+        domain = result.domain
+        # Check if the URL is in the marketplaces (if yes, keep the result un-touched)
+        if marketplaces:
+            if self._domain_in_hosts(domain=domain, hosts=marketplaces):
+                return result
+
+        # Check if the URL has a relevant country_code
+        if not self._relevant_country_code(url=result.url, country_code=location.code):
+            result.filtered = True
+            result.filtered_at_stage = "SerpAPI (country code filtering)"
+            return result
+
+        # Check if the URL is in the excluded URLs
+        if excluded_urls and self._is_excluded_url(result.domain, excluded_urls):
+            result.filtered = True
+            result.filtered_at_stage = "SerpAPI (excluded URLs filtering)"
+            return result
+
+        return result
 
     def _create_serp_result(
         self,
         url: str,
         location: Location,
-        marketplaces: List[Host] | None,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
     ) -> SerpResult:
         """From a given url it creates the :class:`SerpResult` instance.
 
```
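The `Note:` in `_domain_in_hosts` is the heart of the new matching rule: a domain counts as present if it equals a host domain or is a dot-separated subdomain of one. The predicate can be exercised in isolation; this sketch simply lifts `_domain_in_host` out of the class:

```python
from typing import List


def domain_in_host(domain: str, host_domains: List[str]) -> bool:
    # Same predicate as the new SerpApi._domain_in_host: exact match,
    # or subdomain match via the ".<host domain>" suffix check.
    return any(
        domain == hst_dom or domain.endswith(f".{hst_dom}")
        for hst_dom in host_domains
    )


print(domain_in_host("link.springer.com", ["springer.com"]))  # True  (subdomain)
print(domain_in_host("springer.com", ["springer.com"]))       # True  (exact)
print(domain_in_host("notspringer.com", ["springer.com"]))    # False (suffix needs a dot)
```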
```diff
@@ -172,50 +248,34 @@ class SerpApi(AsyncClient):
             location: The location to use for the query.
             marketplaces: The list of marketplaces to compare the URL against.
         """
-        # Filter for county code
-        filtered = not self._keep_url(url=url, country_code=location.code)
-        filtered_at_stage = "country code filtering" if filtered else None
-
         # Get marketplace name
         domain = self._get_domain(url=url)
         marketplace_name = self._default_marketplace_name
-        if
+        if marketplaces:
             try:
                 marketplace_name = next(
                     mp.name
                     for mp in marketplaces
-                    if
+                    if self._domain_in_host(domain=domain, host=mp)
                 )
             except StopIteration:
                 logger.warning(f'Failed to find marketplace for domain="{domain}".')
-
+
+        # Create the SerpResult object
+        result = SerpResult(
             url=url,
             domain=domain,
             marketplace_name=marketplace_name,
-            filtered=filtered,
-            filtered_at_stage=filtered_at_stage,
         )
 
-
-
-
-
-
-
-
-
-        it will be excluded.
-
-        Args:
-            domain: The domain to check.
-            excluded_urls: The list of excluded URLs.
-        """
-        dom = domain.lower()
-        excl_doms = [dom.lower() for excl in excluded_urls for dom in excl.domains]
-        for excl in excl_doms:
-            if dom == excl or dom.endswith(f".{excl}"):
-                return True
-        return False
+        # Apply filters
+        result = self._apply_filters(
+            result=result,
+            location=location,
+            marketplaces=marketplaces,
+            excluded_urls=excluded_urls,
+        )
+        return result
 
     async def apply(
         self,
```
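One consequence of the `_apply_filters` ordering is that a marketplace hit short-circuits the other two checks: a URL whose domain matches a configured marketplace is never flagged, even if it would fail the country-code or exclusion filters. A plain-function distillation of that precedence (the function and flag names here are illustrative, not from the package):

```python
from typing import Optional


def filtered_stage(in_marketplaces: bool, country_ok: bool, excluded: bool) -> Optional[str]:
    # Precedence of the three checks in SerpApi._apply_filters (0.3.9).
    if in_marketplaces:
        return None  # marketplace hit: result kept untouched
    if not country_ok:
        return "SerpAPI (country code filtering)"
    if excluded:
        return "SerpAPI (excluded URLs filtering)"
    return None


print(filtered_stage(True, False, True))    # None -- marketplace wins
print(filtered_stage(False, False, False))  # SerpAPI (country code filtering)
print(filtered_stage(False, True, True))    # SerpAPI (excluded URLs filtering)
```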
```diff
@@ -256,20 +316,16 @@ class SerpApi(AsyncClient):
         # Form the SerpResult objects
         results = [
             self._create_serp_result(
-                url=url,
+                url=url,
+                location=location,
+                marketplaces=marketplaces,
+                excluded_urls=excluded_urls,
             )
             for url in urls
         ]
 
-
-        if excluded_urls:
-            results = [
-                res
-                for res in results
-                if not self._is_excluded(res.domain, excluded_urls)
-            ]
-
+        num_non_filtered = len([res for res in results if not res.filtered])
         logger.info(
-            f'Produced {
+            f'Produced {num_non_filtered} results from SerpApi search with q="{search_string}".'
         )
         return results
```
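Where 0.3.7 dropped excluded results from `apply` entirely, 0.3.9 keeps every `SerpResult` and marks the filtered ones, so the log line now counts survivors rather than the whole list. A rough sketch of the downstream effect, with a dataclass standing in for the real `SerpResult` model (whose full field list is not shown in this diff):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class SerpResult:
    # Stand-in carrying only the fields visible in the diff.
    url: str
    domain: str
    marketplace_name: str
    filtered: bool = False
    filtered_at_stage: Optional[str] = None


results = [
    SerpResult(url="https://shop.example.ch/p/1", domain="shop.example.ch",
               marketplace_name="default"),
    SerpResult(url="https://example.de/p/2", domain="example.de",
               marketplace_name="default", filtered=True,
               filtered_at_stage="SerpAPI (country code filtering)"),
]

# Mirrors the new logging in apply(): count survivors, keep everything.
num_non_filtered = len([res for res in results if not res.filtered])
print(num_non_filtered, len(results))  # 1 2
```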
fraudcrawler/settings.py:

```diff
@@ -8,6 +8,9 @@ ROOT_DIR = Path(__file__).parents[1]
 # Serp settings
 GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
 GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
+SERP_DEFAULT_COUNTRY_CODES = [
+    ".com",
+]
 
 # Enrichment settings
 ENRICHMENT_DEFAULT_LIMIT = 10
```
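With `SERP_DEFAULT_COUNTRY_CODES` containing only `".com"`, the country-code filter accepts a URL that contains either `.<location code>` or `.com` anywhere in the lowercased URL; note this is a substring test, not a TLD parse. A quick sketch mirroring the new method as a free function:

```python
SERP_DEFAULT_COUNTRY_CODES = [".com"]


def relevant_country_code(url: str, country_code: str) -> bool:
    # Mirrors the new SerpApi._relevant_country_code.
    url = url.lower()
    country_code_relevance = f".{country_code}" in url
    default_relevance = any(cc in url for cc in SERP_DEFAULT_COUNTRY_CODES)
    return country_code_relevance or default_relevance


print(relevant_country_code("https://shop.example.ch/p/1", "ch"))  # True
print(relevant_country_code("https://example.com/p/1", "ch"))      # True (via .com default)
print(relevant_country_code("https://example.de/p/1", "ch"))       # False
```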
All remaining files (listed above with +0 -0) are unchanged between 0.3.7 and 0.3.9.