fraudcrawler 0.3.6__tar.gz → 0.3.8__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: this version of fraudcrawler might be problematic.

Files changed (19):
  1. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/PKG-INFO +1 -1
  2. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/base/base.py +2 -2
  3. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/scraping/serp.py +90 -22
  4. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/pyproject.toml +1 -1
  5. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/LICENSE +0 -0
  6. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/README.md +0 -0
  7. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/__init__.py +0 -0
  8. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/base/__init__.py +0 -0
  9. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/base/client.py +0 -0
  10. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/base/google-languages.json +0 -0
  11. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/base/google-locations.json +0 -0
  12. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/base/orchestrator.py +0 -0
  13. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/launch_demo_pipeline.py +0 -0
  14. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/processing/__init__.py +0 -0
  15. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/processing/processor.py +0 -0
  16. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/scraping/__init__.py +0 -0
  17. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/scraping/enrich.py +0 -0
  18. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/scraping/zyte.py +0 -0
  19. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/settings.py +0 -0
```diff
--- fraudcrawler-0.3.6/PKG-INFO
+++ fraudcrawler-0.3.8/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.3.6
+Version: 0.3.8
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
```
```diff
--- fraudcrawler-0.3.6/fraudcrawler/base/base.py
+++ fraudcrawler-0.3.8/fraudcrawler/base/base.py
@@ -51,8 +51,8 @@ class Host(BaseModel):
     @field_validator("domains", mode="before")
     def split_domains_if_str(cls, val):
         if isinstance(val, str):
-            return [dom.strip() for dom in val.split(",")]
-        return val
+            val = val.split(",")
+        return [dom.strip().lower() for dom in val]
 
 
 class Location(BaseModel):
```
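Together with the serp.py hunks below, which lowercase the return value of `_get_domain`, this change makes domain matching case-insensitive end to end: `Host.domains` entries and extracted hostnames are both normalized to lowercase, which is why the marketplace lookup further down can drop its per-comparison `.lower()` calls. A minimal sketch of the new validator behavior, assuming pydantic v2 and reproducing only the `domains` field of `Host`:

```python
from typing import List

from pydantic import BaseModel, field_validator


class Host(BaseModel):
    # Sketch: only the field this hunk touches; the real Host model in
    # fraudcrawler/base/base.py also has at least a name (used as mp.name).
    domains: List[str]

    @field_validator("domains", mode="before")
    def split_domains_if_str(cls, val):
        # New in 0.3.8: splitting and normalization are separate steps, so
        # strip().lower() now also applies when a list is passed directly.
        if isinstance(val, str):
            val = val.split(",")
        return [dom.strip().lower() for dom in val]


print(Host(domains="Amazon.de, SHOP.Example.ch").domains)
# ['amazon.de', 'shop.example.ch']
print(Host(domains=["RICARDO.ch"]).domains)  # 0.3.6 returned this unchanged
# ['ricardo.ch']
```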
```diff
--- fraudcrawler-0.3.6/fraudcrawler/scraping/serp.py
+++ fraudcrawler-0.3.8/fraudcrawler/scraping/serp.py
@@ -66,12 +66,12 @@ class SerpApi(AsyncClient):
             logger.warning(
                 f'Failed to extract domain from url="{url}"; full url is returned'
             )
-            return url
+            return url.lower()
 
         # Remove www. prefix
         if hostname and hostname.startswith("www."):
             hostname = hostname[4:]
-        return hostname
+        return hostname.lower()
 
     async def _search(
         self,
@@ -148,7 +148,7 @@ class SerpApi(AsyncClient):
         return urls
 
     @staticmethod
-    def _keep_url(url: str, country_code: str) -> bool:
+    def _has_included_country_code(url: str, country_code: str) -> bool:
         """Determines whether to keep the url based on the country_code.
 
         Args:
```
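The predicate body is unchanged by the rename; it appears as context at the top of the next hunk: keep any URL that contains `.{country_code}` or `.com`. Note this is a plain substring test, not a TLD parse. A standalone sketch with hypothetical URLs:

```python
def has_included_country_code(url: str, country_code: str) -> bool:
    # Same test as SerpApi._has_included_country_code: a substring match,
    # so ".ch" anywhere in the URL counts, not only as the TLD.
    return f".{country_code}" in url.lower() or ".com" in url.lower()


print(has_included_country_code("https://shop.example.ch/item", "ch"))  # True
print(has_included_country_code("https://example.com/item", "ch"))      # True (.com always passes)
print(has_included_country_code("https://example.de/item", "ch"))       # False -> gets filtered
```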
```diff
@@ -157,11 +157,77 @@ class SerpApi(AsyncClient):
         """
         return f".{country_code}" in url.lower() or ".com" in url.lower()
 
+    @staticmethod
+    def _domain_is_present(domain: str, hosts: List[Host]) -> bool:
+        """Checks if the domain is present in the list of hosts.
+
+        Note:
+            By checking `if domain == hst_dom or domain.endswith(f".{hst_dom}")`
+            it also checks for subdomains. For example, if the domain is
+            `link.springer.com` and the host domain is `springer.com`,
+            it will be detected as being present in the hosts.
+
+        Args:
+            domain: The domain to check.
+            hosts: The list of hosts to check against.
+        """
+        for hst_dom in [dom for hst in hosts for dom in hst.domains]:
+            if domain == hst_dom or domain.endswith(f".{hst_dom}"):
+                return True
+        return False
+
+    def _is_excluded_url(self, domain: str, excluded_urls: List[Host]) -> bool:
+        """Checks if the domain is in the excluded URLs.
+
+        Args:
+            domain: The domain to check.
+            excluded_urls: The list of excluded URLs.
+        """
+        return self._domain_is_present(domain=domain, hosts=excluded_urls)
+
+    def _apply_filters(
+        self,
+        result: SerpResult,
+        location: Location,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
+    ) -> SerpResult:
+        """Checks for filters and updates the SerpResult accordingly.
+
+        Args:
+            result: The SerpResult object to check.
+            location: The location to use for the query.
+            marketplaces: The list of marketplaces to compare the URL against.
+            excluded_urls: The list of excluded URLs.
+        """
+        domain = result.domain
+        # Check if the URL is in the marketplaces (if yes, keep the result un-touched)
+        if marketplaces:
+            if self._domain_is_present(domain=domain, hosts=marketplaces):
+                return result
+
+        # Check if the URL has the included country code
+        if not self._has_included_country_code(
+            url=result.url, country_code=location.code
+        ):
+            result.filtered = True
+            result.filtered_at_stage = "SerpAPI (country code filtering)"
+            return result
+
+        # Check if the URL is in the excluded URLs
+        if excluded_urls and self._is_excluded_url(result.domain, excluded_urls):
+            result.filtered = True
+            result.filtered_at_stage = "SerpAPI (excluded URLs filtering)"
+            return result
+
+        return result
+
     def _create_serp_result(
         self,
         url: str,
         location: Location,
-        marketplaces: List[Host] | None,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
     ) -> SerpResult:
         """From a given url it creates the class:`SerpResult` instance.
 
```
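As the docstring of `_domain_is_present` notes, the dot-prefixed `endswith` test matches genuine subdomains while rejecting look-alike domains. A standalone sketch of the predicate, flattening the `List[Host]` to plain domain strings for brevity (`evilspringer.com` is a made-up counterexample):

```python
def domain_is_present(domain: str, host_domains: list[str]) -> bool:
    # Same logic as SerpApi._domain_is_present, minus the Host wrapper.
    for hst_dom in host_domains:
        if domain == hst_dom or domain.endswith(f".{hst_dom}"):
            return True
    return False


hosts = ["springer.com"]
print(domain_is_present("springer.com", hosts))       # True  (exact match)
print(domain_is_present("link.springer.com", hosts))  # True  (subdomain)
print(domain_is_present("evilspringer.com", hosts))   # False (no dot boundary)
```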
```diff
@@ -172,30 +238,33 @@ class SerpApi(AsyncClient):
             location: The location to use for the query.
             marketplaces: The list of marketplaces to compare the URL against.
         """
-        # Filter for county code
-        filtered = not self._keep_url(url=url, country_code=location.code)
-        filtered_at_stage = "country code filtering" if filtered else None
-
         # Get marketplace name
         domain = self._get_domain(url=url)
         marketplace_name = self._default_marketplace_name
-        if domain and marketplaces:
+        if marketplaces:
             try:
                 marketplace_name = next(
-                    mp.name
-                    for mp in marketplaces
-                    if domain.lower() in [d.lower() for d in mp.domains]
+                    mp.name for mp in marketplaces if domain in [d for d in mp.domains]
                 )
             except StopIteration:
                 logger.warning(f'Failed to find marketplace for domain="{domain}".')
-        return SerpResult(
+
+        # Create the SerpResult object
+        result = SerpResult(
             url=url,
             domain=domain,
             marketplace_name=marketplace_name,
-            filtered=filtered,
-            filtered_at_stage=filtered_at_stage,
         )
 
+        # Apply filters
+        result = self._apply_filters(
+            result=result,
+            location=location,
+            marketplaces=marketplaces,
+            excluded_urls=excluded_urls,
+        )
+        return result
+
     async def apply(
         self,
         search_term: str,
```
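The ordering inside `_apply_filters` is significant: a marketplace match returns early, so a known marketplace URL is kept even when its TLD would fail the country-code test or its domain also appears on the excluded list. A self-contained sketch of that precedence (the `Result` dataclass is a stand-in for `SerpResult`, with field names taken from this diff):

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Result:
    url: str
    domain: str
    filtered: bool = False
    filtered_at_stage: Optional[str] = None


def apply_filters(result: Result, country_code: str,
                  marketplaces: List[str], excluded: List[str]) -> Result:
    # Same three-stage order as SerpApi._apply_filters, with the Host
    # lists flattened to plain domain strings.
    def present(domain: str, hosts: List[str]) -> bool:
        return any(domain == h or domain.endswith(f".{h}") for h in hosts)

    if present(result.domain, marketplaces):
        return result  # stage 1: marketplace match short-circuits the rest
    if f".{country_code}" not in result.url.lower() and ".com" not in result.url.lower():
        result.filtered = True  # stage 2: country-code filter
        result.filtered_at_stage = "SerpAPI (country code filtering)"
        return result
    if present(result.domain, excluded):
        result.filtered = True  # stage 3: excluded-URLs filter
        result.filtered_at_stage = "SerpAPI (excluded URLs filtering)"
    return result


# A marketplace URL survives despite lacking ".ch" and despite also
# sitting on the excluded list:
res = apply_filters(Result("https://shop.example.de/x", "shop.example.de"),
                    country_code="ch",
                    marketplaces=["example.de"], excluded=["example.de"])
print(res.filtered)  # False
```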
```diff
@@ -235,17 +304,16 @@ class SerpApi(AsyncClient):
         # Form the SerpResult objects
         results = [
             self._create_serp_result(
-                url=url, location=location, marketplaces=marketplaces
+                url=url,
+                location=location,
+                marketplaces=marketplaces,
+                excluded_urls=excluded_urls,
             )
             for url in urls
         ]
 
-        # Filter out the excluded URLs
-        if excluded_urls:
-            excluded = [dom for excl in excluded_urls for dom in excl.domains]
-            results = [res for res in results if res.domain not in excluded]
-
+        num_non_filtered = len([res for res in results if not res.filtered])
         logger.info(
-            f'Produced {len(results)} results from SerpApi search with q="{search_string}".'
+            f'Produced {num_non_filtered} results from SerpApi search with q="{search_string}".'
         )
         return results
```
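The net behavioral change in this last hunk: excluded URLs are no longer dropped from the returned list. Every URL now comes back as a `SerpResult`, with filtered hits flagged via `filtered` / `filtered_at_stage`, and the log line counts only the survivors. A hypothetical downstream helper (not part of fraudcrawler) that restores the old drop-them semantics while keeping the flagged results for auditing:

```python
from typing import List, Tuple


def split_filtered(results: List["SerpResult"]) -> Tuple[list, list]:
    # Mirrors the new num_non_filtered count; `dropped` is what 0.3.6
    # used to discard silently.
    kept = [res for res in results if not res.filtered]
    dropped = [res for res in results if res.filtered]
    return kept, dropped
```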
```diff
--- fraudcrawler-0.3.6/pyproject.toml
+++ fraudcrawler-0.3.8/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "fraudcrawler"
-version = "0.3.6"
+version = "0.3.8"
 description = "Intelligent Market Monitoring"
 authors = [
     "Domingo Bertus <hello@veanu.ch>",
```