fraudcrawler 0.3.9__tar.gz → 0.4.0__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
Files changed (19)
  1. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/PKG-INFO +1 -1
  2. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/__init__.py +2 -1
  3. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/base.py +9 -2
  4. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/client.py +12 -0
  5. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/orchestrator.py +9 -1
  6. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/scraping/serp.py +196 -31
  7. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/settings.py +3 -2
  8. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/pyproject.toml +1 -1
  9. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/LICENSE +0 -0
  10. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/README.md +0 -0
  11. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/__init__.py +0 -0
  12. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/google-languages.json +0 -0
  13. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/google-locations.json +0 -0
  14. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/launch_demo_pipeline.py +0 -0
  15. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/processing/__init__.py +0 -0
  16. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/processing/processor.py +0 -0
  17. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/scraping/__init__.py +0 -0
  18. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/scraping/enrich.py +0 -0
  19. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/scraping/zyte.py +0 -0

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: fraudcrawler
- Version: 0.3.9
+ Version: 0.4.0
  Summary: Intelligent Market Monitoring
  Home-page: https://github.com/open-veanu/fraudcrawler
  License: MIT

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/__init__.py
@@ -1,4 +1,4 @@
- from fraudcrawler.scraping.serp import SerpApi
+ from fraudcrawler.scraping.serp import SerpApi, SearchEngine
  from fraudcrawler.scraping.enrich import Enricher
  from fraudcrawler.scraping.zyte import ZyteApi
  from fraudcrawler.processing.processor import Processor
@@ -15,6 +15,7 @@ from fraudcrawler.base.base import (

  __all__ = [
      "SerpApi",
+     "SearchEngine",
      "Enricher",
      "ZyteApi",
      "Processor",

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/base.py
@@ -6,6 +6,7 @@ from pydantic import (
      model_validator,
  )
  from pydantic_settings import BaseSettings
+ import re
  from typing import List

  import aiohttp
@@ -48,11 +49,17 @@ class Host(BaseModel):
      name: str
      domains: str | List[str]

+     @staticmethod
+     def _normalize_domain(domain: str) -> str:
+         """Make it lowercase and strip 'www.' and 'https?://' prefixes from the domain."""
+         domain = domain.strip().lower()
+         return re.sub(r"^(https?://)?(www\.)?", "", domain)
+
      @field_validator("domains", mode="before")
-     def split_domains_if_str(cls, val):
+     def normalize_domains(cls, val):
          if isinstance(val, str):
              val = val.split(",")
-         return [dom.strip().lower() for dom in val]
+         return [cls._normalize_domain(dom.strip()) for dom in val]


  class Location(BaseModel):
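
A quick sketch of what the updated validator now produces; the host name and domain strings below are purely illustrative:

    host = Host(name="Example Shop", domains="https://www.example.ch, WWW.Example-Shop.com")
    host.domains  # ['example.ch', 'example-shop.com']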

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/client.py
@@ -11,6 +11,7 @@ import pandas as pd
  from fraudcrawler.settings import ROOT_DIR
  from fraudcrawler.base.base import Setup, Language, Location, Deepness, Host, Prompt
  from fraudcrawler.base.orchestrator import Orchestrator, ProductItem
+ from fraudcrawler.scraping.serp import SearchEngine

  logger = logging.getLogger(__name__)

@@ -84,6 +85,7 @@ class FraudCrawlerClient(Orchestrator):
          prompts: List[Prompt],
          marketplaces: List[Host] | None = None,
          excluded_urls: List[Host] | None = None,
+         search_engines: List[SearchEngine | str] | None = None,
      ) -> None:
          """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.

@@ -96,6 +98,7 @@ class FraudCrawlerClient(Orchestrator):
              marketplaces: The marketplaces to include in the search.
              excluded_urls: The URLs to exclude from the search.
          """
+         # Handle results files
          timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
          filename = self._results_dir / self._filename_template.format(
              search_term=search_term,
@@ -105,9 +108,18 @@ class FraudCrawlerClient(Orchestrator):
          )
          self._results.append(Results(search_term=search_term, filename=filename))

+         # Normalize inputs
+         nrm_se: List[SearchEngine] = list(SearchEngine)
+         if search_engines:
+             nrm_se = [
+                 SearchEngine(se) if isinstance(se, str) else se for se in search_engines
+             ]
+
+         # Run the pipeline by calling the orchestrator's run method
          asyncio.run(
              super().run(
                  search_term=search_term,
+                 search_engines=nrm_se,
                  language=language,
                  location=location,
                  deepness=deepness,

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/orchestrator.py
@@ -16,7 +16,7 @@ from fraudcrawler.settings import (
      DEFAULT_N_PROC_WKRS,
  )
  from fraudcrawler.base.base import Deepness, Host, Language, Location, Prompt
- from fraudcrawler import SerpApi, Enricher, ZyteApi, Processor
+ from fraudcrawler import SerpApi, SearchEngine, Enricher, ZyteApi, Processor

  logger = logging.getLogger(__name__)

@@ -387,6 +387,7 @@ class Orchestrator(ABC):
          queue: asyncio.Queue[dict | None],
          search_term: str,
          search_term_type: str,
+         search_engines: List[SearchEngine],
          language: Language,
          location: Location,
          num_results: int,
@@ -397,6 +398,7 @@ class Orchestrator(ABC):
          item = {
              "search_term": search_term,
              "search_term_type": search_term_type,
+             "search_engines": search_engines,
              "language": language,
              "location": location,
              "num_results": num_results,
@@ -410,6 +412,7 @@ class Orchestrator(ABC):
          self,
          queue: asyncio.Queue[dict | None],
          search_term: str,
+         search_engines: List[SearchEngine],
          language: Language,
          location: Location,
          deepness: Deepness,
@@ -429,6 +432,7 @@ class Orchestrator(ABC):
          await self._add_serp_items_for_search_term(
              search_term=search_term,
              search_term_type="initial",
+             search_engines=search_engines,
              num_results=deepness.num_results,
              **common_kwargs,  # type: ignore[arg-type]
          )
@@ -450,6 +454,7 @@ class Orchestrator(ABC):
              await self._add_serp_items_for_search_term(
                  search_term=trm,
                  search_term_type="enriched",
+                 search_engines=search_engines,
                  num_results=enrichment.additional_urls_per_term,
                  **common_kwargs,  # type: ignore[arg-type]
              )
@@ -457,6 +462,7 @@ class Orchestrator(ABC):
      async def run(
          self,
          search_term: str,
+         search_engines: List[SearchEngine],
          language: Language,
          location: Location,
          deepness: Deepness,
@@ -469,6 +475,7 @@ class Orchestrator(ABC):

          Args:
              search_term: The search term for the query.
+             search_engines: The list of search engines to use for the SerpAPI query.
              language: The language to use for the query.
              location: The location to use for the query.
              deepness: The search depth and enrichment details.
@@ -523,6 +530,7 @@ class Orchestrator(ABC):
          await self._add_serp_items(
              queue=serp_queue,
              search_term=search_term,
+             search_engines=search_engines,
              language=language,
              location=location,
              deepness=deepness,

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/scraping/serp.py
@@ -1,4 +1,5 @@
  import asyncio
+ from enum import Enum
  import logging
  from pydantic import BaseModel
  from typing import List
@@ -13,7 +14,6 @@ logger = logging.getLogger(__name__)

  class SerpResult(BaseModel):
      """Model for a single search result from SerpApi."""
-
      url: str
      domain: str
      marketplace_name: str
@@ -21,12 +21,20 @@ class SerpResult(BaseModel):
      filtered_at_stage: str | None = None


+ class SearchEngine(Enum):
+     """Enum for the supported search engines."""
+     GOOGLE = "google"
+     GOOGLE_SHOPPING = "google_shopping"
+
+
  class SerpApi(AsyncClient):
      """A client to interact with the SerpApi for performing searches."""

      _endpoint = "https://serpapi.com/search"
-     _engine = "google"
-     _default_marketplace_name = "Google"
+     _engine_marketplace_names = {
+         SearchEngine.GOOGLE.value: "Google",
+         SearchEngine.GOOGLE_SHOPPING.value: "Google Shopping"
+     }
      _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"

      def __init__(
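
SearchEngine is a plain Enum keyed by the SerpApi engine strings, and the per-engine mapping replaces the single _default_marketplace_name. An illustrative lookup (class attributes accessed directly here only to show the mapping):

    from fraudcrawler import SearchEngine, SerpApi

    assert SearchEngine("google_shopping") is SearchEngine.GOOGLE_SHOPPING
    SerpApi._engine_marketplace_names[SearchEngine.GOOGLE.value]           # 'Google'
    SerpApi._engine_marketplace_names[SearchEngine.GOOGLE_SHOPPING.value]  # 'Google Shopping'
    # SearchEngine("bing")  # would raise ValueError: 'bing' is not a valid SearchEngine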
@@ -73,8 +81,42 @@
              hostname = hostname[4:]
          return hostname.lower()

+     @staticmethod
+     def _extract_search_results(response: dict, engine: str) -> List[str]:
+         """Extracts search results from the response based on the engine type.
+
+         Args:
+             response: The response from the SerpApi search.
+             engine: The search engine used.
+
+         Returns:
+             A list of URLs extracted from the response.
+         """
+         urls = []
+         if engine == SearchEngine.GOOGLE.value:
+             # Get the organic_results
+             results = response.get("organic_results")
+             if results is None:
+                 logger.warning(f'No SerpAPI results for engine="{engine}".')
+             else:
+                 urls = [url for res in results if (url := res.get("link"))]
+
+         elif engine == SearchEngine.GOOGLE_SHOPPING.value:
+             # Get the shopping_results
+             results = response.get("shopping_results")
+             if results is None:
+                 logger.warning(f'No SerpAPI results for engine="{engine}".')
+             else:
+                 urls = [url for res in results if (url := res.get("product_link"))]
+
+         else:
+             raise ValueError(f"Invalid SerpAPI search engine: {engine}")
+
+         return urls
+
      async def _search(
          self,
+         engine: str,
          search_string: str,
          language: Language,
          location: Location,
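
A minimal sketch of the new extraction helper against hand-built response dicts; real SerpApi payloads carry many more fields, only the keys read here are shown:

    organic = {"organic_results": [{"link": "https://example.ch/p/1"}, {"title": "no link here"}]}
    shopping = {"shopping_results": [{"product_link": "https://shop.example.ch/p/2"}]}

    SerpApi._extract_search_results(response=organic, engine="google")
    # -> ['https://example.ch/p/1']   (entries without a 'link' are skipped)
    SerpApi._extract_search_results(response=shopping, engine="google_shopping")
    # -> ['https://shop.example.ch/p/2']
    SerpApi._extract_search_results(response={}, engine="google")
    # -> [] and logs: No SerpAPI results for engine="google".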
@@ -83,6 +125,7 @@
          """Performs a search using SerpApi and returns the URLs of the results.

          Args:
+             engine: The search engine to use.
              search_string: The search string (with potentially added site: parameters).
              language: The language to use for the query ('hl' parameter).
              location: The location to use for the query ('gl' parameter).
@@ -93,20 +136,35 @@
              q: The search string (with potentially added site: parameters).
              google_domain: The Google domain to use for the search (e.g. google.[com]).
              location_[requested|used]: The location to use for the search.
-             tbs: The time-based search parameters (e.g. 'ctr:CH&cr:countryCH').
+             tbs: The to-be-searched parameters (e.g. 'ctr:CH').
+             cr: The country code to limit the search to (e.g. 'countryCH').
              gl: The country code to use for the search.
              hl: The language code to use for the search.
              num: The number of results to return.
              api_key: The API key to use for the search.
          """
+         if engine not in self._engine_marketplace_names:
+             raise ValueError(
+                 f"Invalid SerpAPI search engine: {engine}. "
+                 f"Supported engines are: {list(self._engine_marketplace_names.keys())}."
+             )
+         logger.debug(
+             f'Performing SerpAPI search with engine="{engine}", '
+             f'q="{search_string}", '
+             f'location="{location.name}", '
+             f'language="{language.code}", '
+             f"num_results={num_results}."
+         )
+
          # Setup the parameters
          params = {
-             "engine": self._engine,
+             "engine": engine,
              "q": search_string,
              "google_domain": f"google.{location.code}",
              "location_requested": location.name,
              "location_used": location.name,
-             "tbs": f"ctr:{location.code.upper()}&cr:country{location.code.upper()}",
+             "tbs": f"ctr:{location.code.upper()}",
+             "cr": f"country{location.code.upper()}",
              "gl": location.code,
              "hl": language.code,
              "num": num_results,
@@ -132,18 +190,11 @@
          if err is not None:
              raise err

-         # Get the organic_results
-         results = response.get("organic_results")
-         if results is None:
-             logger.warning(
-                 f'No organic_results key in SerpAPI results for search_string="{search_string}".'
-             )
-             return []
+         # Extract the URLs from the response
+         urls = self._extract_search_results(response=response, engine=engine)

-         # Extract urls
-         urls = [res.get("link") for res in results]
          logger.debug(
-             f'Found {len(urls)} URLs from SerpApi search for q="{search_string}".'
+             f'Found total of {len(urls)} URLs from SerpApi search for q="{search_string}" and engine="{engine}".'
          )
          return urls

@@ -234,6 +285,7 @@

      def _create_serp_result(
          self,
+         engine: str,
          url: str,
          location: Location,
          marketplaces: List[Host] | None = None,
@@ -244,13 +296,18 @@
          If marketplaces is None or the domain can not be extracted, the default marketplace name is used.

          Args:
+             engine: The search engine used.
              url: The URL to be processed.
              location: The location to use for the query.
              marketplaces: The list of marketplaces to compare the URL against.
+             excluded_urls: The list of excluded URLs.
          """
          # Get marketplace name
          domain = self._get_domain(url=url)
-         marketplace_name = self._default_marketplace_name
+
+         # Select marketplace name based on engine
+         marketplace_name = self._engine_marketplace_names[engine]
+
          if marketplaces:
              try:
                  marketplace_name = next(
@@ -277,9 +334,109 @@
          )
          return result

+     async def _search_google(
+         self,
+         search_string: str,
+         language: Language,
+         location: Location,
+         num_results: int,
+         marketplaces: List[Host] | None = None,
+         excluded_urls: List[Host] | None = None,
+     ) -> List[SerpResult]:
+         """Performs a google search using SerpApi and returns SerpResults.
+
+         Args:
+             search_string: The search string (with potentially added site: parameters).
+             language: The language to use for the query ('hl' parameter).
+             location: The location to use for the query ('gl' parameter).
+             num_results: Max number of results to return.
+             marketplaces: The marketplaces to include in the search.
+             excluded_urls: The URLs to exclude from the search.
+         """
+         engine = SearchEngine.GOOGLE.value
+
+         # Perform the search
+         urls = await self._search(
+             engine=engine,
+             search_string=search_string,
+             language=language,
+             location=location,
+             num_results=num_results,
+         )
+
+         # Create SerpResult objects from the URLs
+         results = [
+             self._create_serp_result(
+                 url=url,
+                 location=location,
+                 marketplaces=marketplaces,
+                 excluded_urls=excluded_urls,
+                 engine=engine,
+             )
+             for url in urls
+         ]
+
+         logger.debug(
+             f'Produced {len(results)} results from google search with q="{search_string}".'
+         )
+         return results
+
+     async def _search_google_shopping(
+         self,
+         search_string: str,
+         language: Language,
+         location: Location,
+         num_results: int,
+         marketplaces: List[Host] | None = None,
+         excluded_urls: List[Host] | None = None,
+     ) -> List[SerpResult]:
+         """Performs a google search using SerpApi and returns SerpResults.
+
+         Args:
+             search_string: The search string (with potentially added site: parameters).
+             language: The language to use for the query ('hl' parameter).
+             location: The location to use for the query ('gl' parameter).
+             num_results: Max number of results to return.
+             marketplaces: The marketplaces to include in the search.
+             excluded_urls: The URLs to exclude from the search.
+         """
+         engine = SearchEngine.GOOGLE_SHOPPING.value
+
+         # Perform the search
+         urls = await self._search(
+             engine=engine,
+             search_string=search_string,
+             language=language,
+             location=location,
+             num_results=num_results,
+         )
+
+         # !!! NOTE !!!: Google Shopping results do not properly support the 'num' parameter,
+         # so we might get more results than requested. This is a known issue with SerpAPI
+         # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
+         urls = urls[:num_results]
+
+         # Create SerpResult objects from the URLs
+         results = [
+             self._create_serp_result(
+                 url=url,
+                 location=location,
+                 marketplaces=marketplaces,
+                 excluded_urls=excluded_urls,
+                 engine=engine,
+             )
+             for url in urls
+         ]
+
+         logger.debug(
+             f'Produced {len(results)} results from google shopping search with q="{search_string}".'
+         )
+         return results
+
      async def apply(
          self,
          search_term: str,
+         search_engines: List[SearchEngine],
          language: Language,
          location: Location,
          num_results: int,
@@ -305,27 +462,35 @@
              sites = [dom for host in marketplaces for dom in host.domains]
              search_string += " site:" + " OR site:".join(s for s in sites)

-         # Perform the search
-         urls = await self._search(
-             search_string=search_string,
-             language=language,
-             location=location,
-             num_results=num_results,
-         )
+         # Initialize the results list
+         results: List[SerpResult] = []

-         # Form the SerpResult objects
-         results = [
-             self._create_serp_result(
-                 url=url,
+         # Perform the google search
+         if SearchEngine.GOOGLE in search_engines:
+             ggl_res = await self._search_google(
+                 search_string=search_string,
+                 language=language,
                  location=location,
+                 num_results=num_results,
                  marketplaces=marketplaces,
                  excluded_urls=excluded_urls,
              )
-             for url in urls
-         ]
+             results.extend(ggl_res)
+
+         # Perform the google shopping search
+         if SearchEngine.GOOGLE_SHOPPING in search_engines:
+             shp_res = await self._search_google_shopping(
+                 search_string=search_string,
+                 language=language,
+                 location=location,
+                 num_results=num_results,
+                 marketplaces=marketplaces,
+                 excluded_urls=excluded_urls,
+             )
+             results.extend(shp_res)

          num_non_filtered = len([res for res in results if not res.filtered])
          logger.info(
-             f'Produced {num_non_filtered} results from SerpApi search with q="{search_string}".'
+             f'Produced a total of {num_non_filtered} results from SerpApi search with q="{search_string}".'
          )
          return results
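
A sketch of how a caller can now restrict which engines are queried; serp_api, language, location, marketplaces, and excluded_urls stand for already-constructed instances and are not shown here:

    # inside an async function: query only Google Shopping for this search term
    results = await serp_api.apply(
        search_term="example product",
        search_engines=[SearchEngine.GOOGLE_SHOPPING],
        language=language,
        location=location,
        num_results=10,
        marketplaces=marketplaces,
        excluded_urls=excluded_urls,
    )
    # Passing [SearchEngine.GOOGLE, SearchEngine.GOOGLE_SHOPPING] runs both searches
    # and concatenates their SerpResult lists.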

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/settings.py
@@ -1,4 +1,5 @@
  from pathlib import Path
+ from typing import List

  # Generic settings
  MAX_RETRIES = 3
@@ -8,8 +9,8 @@ ROOT_DIR = Path(__file__).parents[1]
  # Serp settings
  GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
  GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
- SERP_DEFAULT_COUNTRY_CODES = [
-     ".com",
+ SERP_DEFAULT_COUNTRY_CODES: List[str] = [
+     # ".com",
  ]

  # Enrichment settings

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "fraudcrawler"
- version = "0.3.9"
+ version = "0.4.0"
  description = "Intelligent Market Monitoring"
  authors = [
      "Domingo Bertus <hello@veanu.ch>",