fraudcrawler 0.3.10-py3-none-any.whl → 0.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

fraudcrawler/__init__.py CHANGED
@@ -1,8 +1,8 @@
- from fraudcrawler.scraping.serp import SerpApi
+ from fraudcrawler.scraping.serp import SerpApi, SearchEngine
  from fraudcrawler.scraping.enrich import Enricher
  from fraudcrawler.scraping.zyte import ZyteApi
  from fraudcrawler.processing.processor import Processor
- from fraudcrawler.base.orchestrator import Orchestrator, ProductItem
+ from fraudcrawler.base.orchestrator import Orchestrator
  from fraudcrawler.base.client import FraudCrawlerClient
  from fraudcrawler.base.base import (
  Deepness,
@@ -11,10 +11,12 @@ from fraudcrawler.base.base import (
  Language,
  Location,
  Prompt,
+ ProductItem,
  )

  __all__ = [
  "SerpApi",
+ "SearchEngine",
  "Enricher",
  "ZyteApi",
  "Processor",
fraudcrawler/base/base.py CHANGED
@@ -2,12 +2,13 @@ import json
  import logging
  from pydantic import (
  BaseModel,
+ Field,
  field_validator,
  model_validator,
  )
  from pydantic_settings import BaseSettings
  import re
- from typing import List
+ from typing import List, Dict

  import aiohttp

@@ -114,12 +115,39 @@ class Deepness(BaseModel):
  enrichment: Enrichment | None = None


+ class ProductItem(BaseModel):
+ """Model representing a product item."""
+
+ # Serp/Enrich parameters
+ search_term: str
+ search_term_type: str
+ url: str
+ marketplace_name: str
+ domain: str
+
+ # Zyte parameters
+ product_name: str | None = None
+ product_price: str | None = None
+ product_description: str | None = None
+ product_images: List[str] | None = None
+ probability: float | None = None
+ html: str | None = None
+ html_clean: str | None = None
+
+ # Processor parameters are set dynamic so we must allow extra fields
+ classifications: Dict[str, int] = Field(default_factory=dict)
+
+ # Filtering parameters
+ filtered: bool = False
+ filtered_at_stage: str | None = None
+
+
  class Prompt(BaseModel):
  """Model for prompts."""

  name: str
- context: str
  system_prompt: str
+ product_item_fields: List[str]
  allowed_classes: List[int]

  @field_validator("allowed_classes", mode="before")
@@ -129,6 +157,17 @@ class Prompt(BaseModel):
  raise ValueError("all values in allowed_classes must be positive integers.")
  return val

+ @field_validator("product_item_fields", mode="before")
+ def validate_product_item_fields(cls, val):
+ """Ensure all product_item_fields are valid ProductItem attributes."""
+ valid_fields = set(ProductItem.model_fields.keys())
+ for field in val:
+ if field not in valid_fields:
+ raise ValueError(
+ f"Invalid product_item_field: '{field}'. Must be one of: {sorted(valid_fields)}"
+ )
+ return val
+

  class AsyncClient:
  """Base class for sub-classes using async HTTP requests."""
fraudcrawler/base/client.py CHANGED
@@ -9,8 +9,17 @@ from typing import List
  import pandas as pd

  from fraudcrawler.settings import ROOT_DIR
- from fraudcrawler.base.base import Setup, Language, Location, Deepness, Host, Prompt
- from fraudcrawler.base.orchestrator import Orchestrator, ProductItem
+ from fraudcrawler.base.base import (
+ Setup,
+ Language,
+ Location,
+ Deepness,
+ Host,
+ Prompt,
+ ProductItem,
+ )
+ from fraudcrawler.base.orchestrator import Orchestrator
+ from fraudcrawler.scraping.serp import SearchEngine

  logger = logging.getLogger(__name__)

@@ -84,6 +93,7 @@ class FraudCrawlerClient(Orchestrator):
  prompts: List[Prompt],
  marketplaces: List[Host] | None = None,
  excluded_urls: List[Host] | None = None,
+ search_engines: List[SearchEngine | str] | None = None,
  ) -> None:
  """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.

@@ -96,6 +106,7 @@ class FraudCrawlerClient(Orchestrator):
  marketplaces: The marketplaces to include in the search.
  excluded_urls: The URLs to exclude from the search.
  """
+ # Handle results files
  timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
  filename = self._results_dir / self._filename_template.format(
  search_term=search_term,
@@ -105,9 +116,18 @@ class FraudCrawlerClient(Orchestrator):
  )
  self._results.append(Results(search_term=search_term, filename=filename))

+ # Normalize inputs
+ nrm_se: List[SearchEngine] = list(SearchEngine)
+ if search_engines:
+ nrm_se = [
+ SearchEngine(se) if isinstance(se, str) else se for se in search_engines
+ ]
+
+ # Run the pipeline by calling the orchestrator's run method
  asyncio.run(
  super().run(
  search_term=search_term,
+ search_engines=nrm_se,
  language=language,
  location=location,
  deepness=deepness,
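
The new `search_engines` argument of `FraudCrawlerClient.run` accepts enum members or their string values and defaults to every engine when omitted. A standalone sketch of that normalization step; it mirrors the lines above, and the helper function name is made up purely for illustration:

from typing import List
from fraudcrawler import SearchEngine

def normalize_engines(search_engines=None) -> List[SearchEngine]:
    # Default: search all supported engines.
    engines: List[SearchEngine] = list(SearchEngine)
    if search_engines:
        engines = [SearchEngine(se) if isinstance(se, str) else se for se in search_engines]
    return engines

print(normalize_engines())                     # [<SearchEngine.GOOGLE: 'google'>, <SearchEngine.GOOGLE_SHOPPING: 'google_shopping'>]
print(normalize_engines(["google_shopping"]))  # [<SearchEngine.GOOGLE_SHOPPING: 'google_shopping'>]
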
fraudcrawler/base/orchestrator.py CHANGED
@@ -1,12 +1,13 @@
  from abc import ABC, abstractmethod
  import asyncio
  import logging
- from pydantic import BaseModel, Field
  from typing import Dict, List, Set, cast
+ from bs4 import BeautifulSoup

  from fraudcrawler.settings import (
  PROCESSOR_DEFAULT_MODEL,
  PROCESSOR_DEFAULT_IF_MISSING,
+ PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
  MAX_RETRIES,
  RETRY_DELAY,
  )
@@ -15,37 +16,19 @@ from fraudcrawler.settings import (
  DEFAULT_N_ZYTE_WKRS,
  DEFAULT_N_PROC_WKRS,
  )
- from fraudcrawler.base.base import Deepness, Host, Language, Location, Prompt
- from fraudcrawler import SerpApi, Enricher, ZyteApi, Processor
+ from fraudcrawler.base.base import (
+ Deepness,
+ Host,
+ Language,
+ Location,
+ Prompt,
+ ProductItem,
+ )
+ from fraudcrawler import SerpApi, SearchEngine, Enricher, ZyteApi, Processor

  logger = logging.getLogger(__name__)


- class ProductItem(BaseModel):
- """Model representing a product item."""
-
- # Serp/Enrich parameters
- search_term: str
- search_term_type: str
- url: str
- marketplace_name: str
- domain: str
-
- # Zyte parameters
- product_name: str | None = None
- product_price: str | None = None
- product_description: str | None = None
- product_images: List[str] | None = None
- probability: float | None = None
-
- # Processor parameters are set dynamic so we must allow extra fields
- classifications: Dict[str, int] = Field(default_factory=dict)
-
- # Filtering parameters
- filtered: bool = False
- filtered_at_stage: str | None = None
-
-
  class Orchestrator(ABC):
  """Abstract base class for orchestrating the different actors (crawling, processing).

@@ -231,15 +214,16 @@ class Orchestrator(ABC):
  product.probability = self._zyteapi.extract_probability(
  details=details
  )
-
+ product.html = self._zyteapi.extract_html(details=details)
+ if product.html:
+ soup = BeautifulSoup(product.html, "html.parser")
+ product.html_clean = soup.get_text(separator=" ", strip=True)
  # Filter the product based on the probability threshold
  if not self._zyteapi.keep_product(details=details):
  product.filtered = True
  product.filtered_at_stage = "Zyte probability threshold"
-
  except Exception as e:
  logger.warning(f"Error executing Zyte API search: {e}.")
-
  await queue_out.put(product)
  queue_in.task_done()

@@ -269,19 +253,26 @@ class Orchestrator(ABC):
  if not product.filtered:
  try:
  url = product.url
- name = product.product_name
- description = product.product_description
-
  # Run all the configured prompts
  for prompt in prompts:
+ # Dynamically build product_details string
+ details = []
+ for field in prompt.product_item_fields:
+ value = getattr(product, field, None)
+ if value is not None:
+ details.append(
+ PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
+ field_name=field, field_value=value
+ )
+ )
+ product_details = "\n\n".join(details)
  logger.debug(
- f"Classify product {name} with prompt {prompt.name}"
+ f"Classify product at {url} with prompt {prompt.name} and details: {product_details}"
  )
  classification = await self._processor.classify(
  prompt=prompt,
  url=url,
- name=name,
- description=description,
+ product_details=product_details,
  )
  product.classifications[prompt.name] = classification
  except Exception as e:
@@ -387,6 +378,7 @@ class Orchestrator(ABC):
  queue: asyncio.Queue[dict | None],
  search_term: str,
  search_term_type: str,
+ search_engines: List[SearchEngine],
  language: Language,
  location: Location,
  num_results: int,
@@ -397,6 +389,7 @@ class Orchestrator(ABC):
  item = {
  "search_term": search_term,
  "search_term_type": search_term_type,
+ "search_engines": search_engines,
  "language": language,
  "location": location,
  "num_results": num_results,
@@ -410,6 +403,7 @@ class Orchestrator(ABC):
  self,
  queue: asyncio.Queue[dict | None],
  search_term: str,
+ search_engines: List[SearchEngine],
  language: Language,
  location: Location,
  deepness: Deepness,
@@ -429,6 +423,7 @@ class Orchestrator(ABC):
  await self._add_serp_items_for_search_term(
  search_term=search_term,
  search_term_type="initial",
+ search_engines=search_engines,
  num_results=deepness.num_results,
  **common_kwargs,  # type: ignore[arg-type]
  )
@@ -450,6 +445,7 @@ class Orchestrator(ABC):
  await self._add_serp_items_for_search_term(
  search_term=trm,
  search_term_type="enriched",
+ search_engines=search_engines,
  num_results=enrichment.additional_urls_per_term,
  **common_kwargs,  # type: ignore[arg-type]
  )
@@ -457,6 +453,7 @@ class Orchestrator(ABC):
  async def run(
  self,
  search_term: str,
+ search_engines: List[SearchEngine],
  language: Language,
  location: Location,
  deepness: Deepness,
@@ -469,6 +466,7 @@ class Orchestrator(ABC):

  Args:
  search_term: The search term for the query.
+ search_engines: The list of search engines to use for the SerpAPI query.
  language: The language to use for the query.
  location: The location to use for the query.
  deepness: The search depth and enrichment details.
@@ -523,6 +521,7 @@ class Orchestrator(ABC):
  await self._add_serp_items(
  queue=serp_queue,
  search_term=search_term,
+ search_engines=search_engines,
  language=language,
  location=location,
  deepness=deepness,
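
The processing-worker hunk above now assembles a per-prompt `product_details` string from the fields listed in `prompt.product_item_fields`, using the two templates added to settings.py (see below). A self-contained sketch of that formatting with a hypothetical product, using a plain dict instead of a ProductItem for brevity:

PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"

product = {
    "product_name": "Fridge XY-200",
    "html_clean": "Fridge XY-200 energy class C in stock delivery in 3 days",
    "product_price": None,  # None values are skipped, as in the worker loop above
}
fields = ["product_name", "html_clean", "product_price"]

details = [
    PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(field_name=f, field_value=v)
    for f in fields
    if (v := product.get(f)) is not None
]
product_details = "\n\n".join(details)
user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(product_details=product_details)
print(user_prompt)
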
fraudcrawler/launch_demo_pipeline.py CHANGED
@@ -16,38 +16,39 @@ def main():
  search_term = "Kühlschrank"
  language = Language(name="German")
  location = Location(name="Switzerland")
- deepness = Deepness(num_results=20)
+ deepness = Deepness(num_results=10)
  prompts = [
  Prompt(
- name="relevance",
- context="This organization is interested in checking the energy efficiency of certain devices.",
+ name="availability",
  system_prompt=(
- "You are a helpful and intelligent assistant. Your task is to classify any given product "
- "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
+ "You are a helpful and intelligent assistant helping an organization that is interested in checking the availability of certain products."
+ "Your task is to classify any given product as either available (1) or not available (0), strictly based on the context and product details provided by the user. "
  "You must consider all aspects of the given context and make a binary decision accordingly. "
- "If the product aligns with the user's needs, classify it as 1 (relevant); otherwise, classify it as 0 (not relevant). "
+ "If the product can be purchased, added to a shopping basket, delivered, or is listed as available in any form, classify it as 1 (available); "
+ "if there is any mention of out of stock, not available, no longer shippable, or similar, classify it as 0 (not available). "
  "Respond only with the number 1 or 0."
  ),
+ product_item_fields=["product_name", "html_clean"],
  allowed_classes=[0, 1],
  ),
- Prompt(
- name="seriousness",
- context="This organization is interested in checking the energy efficiency of certain devices.",
- system_prompt=(
- "You are an intelligent and discerning assistant. Your task is to classify each item as either "
- "a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
- " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
- "within an online shop or marketplace.\n"
- " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
- " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
- "exact product itself, classify as 0.\n"
- " - Advertisements: Promotional content that doesn't directly sell a product.\n"
- " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
- " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
- "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
- ),
- allowed_classes=[0, 1],
- ),
+ # Prompt(
+ # name="seriousness",
+ # system_prompt=(
+ # "You are a helpful and intelligent assistant helping an organization that is interested in checking the energy efficiency of certain devices. "
+ # "Your task is to classify each item as either a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
+ # " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
+ # "within an online shop or marketplace.\n"
+ # " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
+ # " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
+ # "exact product itself, classify as 0.\n"
+ # " - Advertisements: Promotional content that doesn't directly sell a product.\n"
+ # " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
+ # " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
+ # "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
+ # ),
+ # product_item_fields=["product_name", "product_description"],
+ # allowed_classes=[0, 1],
+ # ),
  ]
  # # Optional: Add tern ENRICHEMENT
  # from fraudcrawler import Enrichment
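
The active demo prompt above selects `product_name` and `html_clean`; the `product_item_fields` validator added in base.py rejects unknown names already at `Prompt` construction time. A minimal sketch of both outcomes, assuming only the fields and keyword arguments shown in this diff (the surrounding script is illustrative, not part of the package):

from fraudcrawler import Prompt

# Valid: product_name and html_clean are ProductItem fields.
prompt = Prompt(
    name="availability",
    system_prompt="Classify the product as available (1) or not available (0).",
    product_item_fields=["product_name", "html_clean"],
    allowed_classes=[0, 1],
)

# Invalid: "price_tag" is not a ProductItem attribute; the validator raises
# ValueError, which pydantic surfaces as a ValidationError.
try:
    Prompt(
        name="broken",
        system_prompt="irrelevant",
        product_item_fields=["price_tag"],
        allowed_classes=[0, 1],
    )
except ValueError as exc:
    print(exc)
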
fraudcrawler/processing/processor.py CHANGED
@@ -52,42 +52,34 @@ class Processor:
  raise ValueError("Empty response from OpenAI API")
  return content

- async def classify(
- self, prompt: Prompt, url: str, name: str | None, description: str | None
- ) -> int:
- """A generic classification method that classified a product based on a prompt object.
+ async def classify(self, prompt: Prompt, url: str, product_details: str) -> int:
+ """A generic classification method that classifies a product based on a prompt object.

  Args:
- prompt: A dictionary with keys "system_prompt", "user_prompt", etc.
+ prompt: A dictionary with keys "system_prompt", etc.
  url: Product URL (often used in the user_prompt).
- name: Product name (often used in the user_prompt).
- description: Product description (often used in the user_prompt).
+ product_details: String with product details, formatted per prompt.product_item_fields.

  Note:
  This method returns `PROCESSOR_DEFAULT_IF_MISSING` if:
- - 'name' or 'description' is None
+ - product_details is empty
  - an error occurs during the API call
  - if the response isn't in allowed_classes.
  """
  # If required fields are missing, return the prompt's default fallback if provided.
- if name is None or description is None:
- logger.warning(
- f"Missing required fields for classification: name='{name}', description='{description}'"
- )
+ if not product_details:
+ logger.warning("Missing required product_details for classification.")
  return self._default_if_missing

  # Substitute placeholders in user_prompt with the relevant arguments
  user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
- context=prompt.context,
- url=url,
- name=name,
- description=description,
+ product_details=product_details,
  )

  # Call the OpenAI API
  try:
  logger.debug(
- f'Calling OpenAI API for classification (name="{name}", prompt="{prompt.name}")'
+ f'Calling OpenAI API for classification (url="{url}", prompt="{prompt.name}")'
  )
  content = await self._call_openai_api(
  system_prompt=prompt.system_prompt,
@@ -104,12 +96,12 @@ class Processor:
  return self._default_if_missing

  logger.info(
- f'Classification for "{name}" (prompt={prompt.name}): {classification}'
+ f'Classification for url="{url}" (prompt={prompt.name}): {classification}'
  )
  return classification

  except Exception as e:
  logger.error(
- f'Error classifying product "{name}" with prompt "{prompt.name}": {e}'
+ f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
  )
  return self._default_if_missing
fraudcrawler/scraping/serp.py CHANGED
@@ -1,4 +1,5 @@
  import asyncio
+ from enum import Enum
  import logging
  from pydantic import BaseModel
  from typing import List
@@ -21,12 +22,21 @@ class SerpResult(BaseModel):
  filtered_at_stage: str | None = None


+ class SearchEngine(Enum):
+ """Enum for the supported search engines."""
+
+ GOOGLE = "google"
+ GOOGLE_SHOPPING = "google_shopping"
+
+
  class SerpApi(AsyncClient):
  """A client to interact with the SerpApi for performing searches."""

  _endpoint = "https://serpapi.com/search"
- _engine = "google"
- _default_marketplace_name = "Google"
+ _engine_marketplace_names = {
+ SearchEngine.GOOGLE.value: "Google",
+ SearchEngine.GOOGLE_SHOPPING.value: "Google Shopping",
+ }
  _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"

  def __init__(
@@ -73,8 +83,42 @@ class SerpApi(AsyncClient):
  hostname = hostname[4:]
  return hostname.lower()

+ @staticmethod
+ def _extract_search_results(response: dict, engine: str) -> List[str]:
+ """Extracts search results from the response based on the engine type.
+
+ Args:
+ response: The response from the SerpApi search.
+ engine: The search engine used.
+
+ Returns:
+ A list of URLs extracted from the response.
+ """
+ urls = []
+ if engine == SearchEngine.GOOGLE.value:
+ # Get the organic_results
+ results = response.get("organic_results")
+ if results is None:
+ logger.warning(f'No SerpAPI results for engine="{engine}".')
+ else:
+ urls = [url for res in results if (url := res.get("link"))]
+
+ elif engine == SearchEngine.GOOGLE_SHOPPING.value:
+ # Get the shopping_results
+ results = response.get("shopping_results")
+ if results is None:
+ logger.warning(f'No SerpAPI results for engine="{engine}".')
+ else:
+ urls = [url for res in results if (url := res.get("product_link"))]
+
+ else:
+ raise ValueError(f"Invalid SerpAPI search engine: {engine}")
+
+ return urls
+
  async def _search(
  self,
+ engine: str,
  search_string: str,
  language: Language,
  location: Location,
@@ -83,6 +127,7 @@ class SerpApi(AsyncClient):
  """Performs a search using SerpApi and returns the URLs of the results.

  Args:
+ engine: The search engine to use.
  search_string: The search string (with potentially added site: parameters).
  language: The language to use for the query ('hl' parameter).
  location: The location to use for the query ('gl' parameter).
@@ -93,20 +138,35 @@ class SerpApi(AsyncClient):
  q: The search string (with potentially added site: parameters).
  google_domain: The Google domain to use for the search (e.g. google.[com]).
  location_[requested|used]: The location to use for the search.
- tbs: The time-based search parameters (e.g. 'ctr:CH&cr:countryCH').
+ tbs: The to-be-searched parameters (e.g. 'ctr:CH').
+ cr: The country code to limit the search to (e.g. 'countryCH').
  gl: The country code to use for the search.
  hl: The language code to use for the search.
  num: The number of results to return.
  api_key: The API key to use for the search.
  """
+ if engine not in self._engine_marketplace_names:
+ raise ValueError(
+ f"Invalid SerpAPI search engine: {engine}. "
+ f"Supported engines are: {list(self._engine_marketplace_names.keys())}."
+ )
+ logger.debug(
+ f'Performing SerpAPI search with engine="{engine}", '
+ f'q="{search_string}", '
+ f'location="{location.name}", '
+ f'language="{language.code}", '
+ f"num_results={num_results}."
+ )
+
  # Setup the parameters
  params = {
- "engine": self._engine,
+ "engine": engine,
  "q": search_string,
  "google_domain": f"google.{location.code}",
  "location_requested": location.name,
  "location_used": location.name,
- "tbs": f"ctr:{location.code.upper()}&cr:country{location.code.upper()}",
+ "tbs": f"ctr:{location.code.upper()}",
+ "cr": f"country{location.code.upper()}",
  "gl": location.code,
  "hl": language.code,
  "num": num_results,
@@ -132,18 +192,11 @@ class SerpApi(AsyncClient):
  if err is not None:
  raise err

- # Get the organic_results
- results = response.get("organic_results")
- if results is None:
- logger.warning(
- f'No organic_results key in SerpAPI results for search_string="{search_string}".'
- )
- return []
+ # Extract the URLs from the response
+ urls = self._extract_search_results(response=response, engine=engine)

- # Extract urls
- urls = [res.get("link") for res in results]
  logger.debug(
- f'Found {len(urls)} URLs from SerpApi search for q="{search_string}".'
+ f'Found total of {len(urls)} URLs from SerpApi search for q="{search_string}" and engine="{engine}".'
  )
  return urls

@@ -234,6 +287,7 @@ class SerpApi(AsyncClient):

  def _create_serp_result(
  self,
+ engine: str,
  url: str,
  location: Location,
  marketplaces: List[Host] | None = None,
@@ -244,13 +298,18 @@ class SerpApi(AsyncClient):
  If marketplaces is None or the domain can not be extracted, the default marketplace name is used.

  Args:
+ engine: The search engine used.
  url: The URL to be processed.
  location: The location to use for the query.
  marketplaces: The list of marketplaces to compare the URL against.
+ excluded_urls: The list of excluded URLs.
  """
  # Get marketplace name
  domain = self._get_domain(url=url)
- marketplace_name = self._default_marketplace_name
+
+ # Select marketplace name based on engine
+ marketplace_name = self._engine_marketplace_names[engine]
+
  if marketplaces:
  try:
  marketplace_name = next(
@@ -277,9 +336,109 @@ class SerpApi(AsyncClient):
  )
  return result

+ async def _search_google(
+ self,
+ search_string: str,
+ language: Language,
+ location: Location,
+ num_results: int,
+ marketplaces: List[Host] | None = None,
+ excluded_urls: List[Host] | None = None,
+ ) -> List[SerpResult]:
+ """Performs a google search using SerpApi and returns SerpResults.
+
+ Args:
+ search_string: The search string (with potentially added site: parameters).
+ language: The language to use for the query ('hl' parameter).
+ location: The location to use for the query ('gl' parameter).
+ num_results: Max number of results to return.
+ marketplaces: The marketplaces to include in the search.
+ excluded_urls: The URLs to exclude from the search.
+ """
+ engine = SearchEngine.GOOGLE.value
+
+ # Perform the search
+ urls = await self._search(
+ engine=engine,
+ search_string=search_string,
+ language=language,
+ location=location,
+ num_results=num_results,
+ )
+
+ # Create SerpResult objects from the URLs
+ results = [
+ self._create_serp_result(
+ url=url,
+ location=location,
+ marketplaces=marketplaces,
+ excluded_urls=excluded_urls,
+ engine=engine,
+ )
+ for url in urls
+ ]
+
+ logger.debug(
+ f'Produced {len(results)} results from google search with q="{search_string}".'
+ )
+ return results
+
+ async def _search_google_shopping(
+ self,
+ search_string: str,
+ language: Language,
+ location: Location,
+ num_results: int,
+ marketplaces: List[Host] | None = None,
+ excluded_urls: List[Host] | None = None,
+ ) -> List[SerpResult]:
+ """Performs a google search using SerpApi and returns SerpResults.
+
+ Args:
+ search_string: The search string (with potentially added site: parameters).
+ language: The language to use for the query ('hl' parameter).
+ location: The location to use for the query ('gl' parameter).
+ num_results: Max number of results to return.
+ marketplaces: The marketplaces to include in the search.
+ excluded_urls: The URLs to exclude from the search.
+ """
+ engine = SearchEngine.GOOGLE_SHOPPING.value
+
+ # Perform the search
+ urls = await self._search(
+ engine=engine,
+ search_string=search_string,
+ language=language,
+ location=location,
+ num_results=num_results,
+ )
+
+ # !!! NOTE !!!: Google Shopping results do not properly support the 'num' parameter,
+ # so we might get more results than requested. This is a known issue with SerpAPI
+ # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
+ urls = urls[:num_results]
+
+ # Create SerpResult objects from the URLs
+ results = [
+ self._create_serp_result(
+ url=url,
+ location=location,
+ marketplaces=marketplaces,
+ excluded_urls=excluded_urls,
+ engine=engine,
+ )
+ for url in urls
+ ]
+
+ logger.debug(
+ f'Produced {len(results)} results from google shopping search with q="{search_string}".'
+ )
+ return results
+
  async def apply(
  self,
  search_term: str,
+ search_engines: List[SearchEngine],
  language: Language,
  location: Location,
  num_results: int,
@@ -305,27 +464,35 @@ class SerpApi(AsyncClient):
  sites = [dom for host in marketplaces for dom in host.domains]
  search_string += " site:" + " OR site:".join(s for s in sites)

- # Perform the search
- urls = await self._search(
- search_string=search_string,
- language=language,
- location=location,
- num_results=num_results,
- )
+ # Initialize the results list
+ results: List[SerpResult] = []

- # Form the SerpResult objects
- results = [
- self._create_serp_result(
- url=url,
+ # Perform the google search
+ if SearchEngine.GOOGLE in search_engines:
+ ggl_res = await self._search_google(
+ search_string=search_string,
+ language=language,
  location=location,
+ num_results=num_results,
  marketplaces=marketplaces,
  excluded_urls=excluded_urls,
  )
- for url in urls
- ]
+ results.extend(ggl_res)
+
+ # Perform the google shopping search
+ if SearchEngine.GOOGLE_SHOPPING in search_engines:
+ shp_res = await self._search_google_shopping(
+ search_string=search_string,
+ language=language,
+ location=location,
+ num_results=num_results,
+ marketplaces=marketplaces,
+ excluded_urls=excluded_urls,
+ )
+ results.extend(shp_res)

  num_non_filtered = len([res for res in results if not res.filtered])
  logger.info(
- f'Produced {num_non_filtered} results from SerpApi search with q="{search_string}".'
+ f'Produced a total of {num_non_filtered} results from SerpApi search with q="{search_string}".'
  )
  return results
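
For reference, the two engines are parsed from different parts of the SerpAPI payload: organic_results[].link for google and shopping_results[].product_link for google_shopping. A small sketch against mocked payloads, reaching into the private static helper purely for illustration (real responses carry many more keys):

from fraudcrawler import SerpApi, SearchEngine

google_response = {"organic_results": [{"link": "https://shop.example.ch/fridge-xy-200"}]}
shopping_response = {"shopping_results": [{"product_link": "https://example.com/shopping/product/123"}]}

print(SerpApi._extract_search_results(response=google_response, engine=SearchEngine.GOOGLE.value))
# ['https://shop.example.ch/fridge-xy-200']
print(SerpApi._extract_search_results(response=shopping_response, engine=SearchEngine.GOOGLE_SHOPPING.value))
# ['https://example.com/shopping/product/123']
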
fraudcrawler/scraping/zyte.py CHANGED
@@ -1,6 +1,7 @@
  import asyncio
  import logging
  from typing import List
+ from base64 import b64decode

  import aiohttp

@@ -68,7 +69,8 @@ class ZyteApi(AsyncClient):
  "metadata": {
  "probability": float,
  },
- }
+ },
+ "httpResponseBody": base64
  }
  """
  logger.info(f"Fetching product details by Zyte for URL {url}.")
@@ -192,3 +194,24 @@ class ZyteApi(AsyncClient):
  }
  """
  return float(details.get("product", {}).get("metadata", {}).get("probability"))
+
+ @staticmethod
+ def extract_html(details: dict) -> str | None:
+ """Extracts the HTML from the Zyte API response.
+
+ The input argument is a dictionary of the following structure:
+ {
+ "httpResponseBody": base64
+ }
+ """
+
+ # Get the Base64-encoded content
+ encoded = details.get("httpResponseBody")
+
+ # Decode it into bytes
+ if isinstance(encoded, str):
+ decoded_bytes = b64decode(encoded)
+
+ # Convert bytes to string (assuming UTF-8 encoding)
+ decoded_string = decoded_bytes.decode("utf-8")
+ return decoded_string
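
extract_html simply base64-decodes httpResponseBody; the orchestrator then strips the markup with BeautifulSoup to populate html_clean (see the orchestrator hunk above). A hedged end-to-end sketch with a fabricated payload:

from base64 import b64encode
from bs4 import BeautifulSoup
from fraudcrawler import ZyteApi

# Fabricated Zyte-style response: httpResponseBody holds base64-encoded HTML.
payload = {
    "httpResponseBody": b64encode(
        b"<html><body><h1>Fridge XY-200</h1><p>In stock</p></body></html>"
    ).decode()
}

html = ZyteApi.extract_html(details=payload)
if html:
    html_clean = BeautifulSoup(html, "html.parser").get_text(separator=" ", strip=True)
    print(html_clean)  # Fridge XY-200 In stock
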
fraudcrawler/settings.py CHANGED
@@ -1,4 +1,5 @@
  from pathlib import Path
+ from typing import List

  # Generic settings
  MAX_RETRIES = 3
@@ -8,8 +9,8 @@ ROOT_DIR = Path(__file__).parents[1]
  # Serp settings
  GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
  GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
- SERP_DEFAULT_COUNTRY_CODES = [
- ".com",
+ SERP_DEFAULT_COUNTRY_CODES: List[str] = [
+ # ".com",
  ]

  # Enrichment settings
@@ -21,9 +22,8 @@ ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
  # Processor settings
  PROCESSOR_DEFAULT_MODEL = "gpt-4o"
  PROCESSOR_DEFAULT_IF_MISSING = -1
- PROCESSOR_USER_PROMPT_TEMPLATE = (
- "Context: {context}\n\nProduct Details: {name}\n{description}\\n\nRelevance:"
- )
+ PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
+ PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"

  # Async settings
  DEFAULT_N_SERP_WKRS = 10
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: fraudcrawler
- Version: 0.3.10
+ Version: 0.4.2
  Summary: Intelligent Market Monitoring
  Home-page: https://github.com/open-veanu/fraudcrawler
  License: MIT
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
  Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
+ Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
  Requires-Dist: openai (>=1.68.2,<2.0.0)
  Requires-Dist: pandas (>=2.2.3,<3.0.0)
  Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
@@ -80,7 +81,6 @@ deepness = Deepness(num_results=50)
  prompts = [
  Prompt(
  name="relevance",
- context="This organization is interested in medical products and drugs.",
  system_prompt=(
  "You are a helpful and intelligent assistant. Your task is to classify any given product "
  "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
@@ -0,0 +1,20 @@
+ fraudcrawler/__init__.py,sha256=yXFdQzlSLUZV4Oh0wkzghvPlICQO5TnpEtIHZaTay_c,717
+ fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/base/base.py,sha256=JWjZ3mpX4caQAsWKYqtHrUqHfHr6GXlAaEjxxHV9ODQ,6020
+ fraudcrawler/base/client.py,sha256=FibiYycjUys-c4sv66Y2JqJu5y15be2MYd2_9yB3wG8,4936
+ fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
+ fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
+ fraudcrawler/base/orchestrator.py,sha256=p1gRtj3jVaFmtwPSKruiOixu3QDuSiHjPKFi0KKsgPk,24591
+ fraudcrawler/launch_demo_pipeline.py,sha256=zQxKAekJ56iKQ5-NeM0UMS-1Wd3ui0bpeqkH1nM9A4A,4628
+ fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/processing/processor.py,sha256=An2orst0YRIav7bFuoDMgjwWz2Z9dyjVUbkNAMXNTTo,3748
+ fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/scraping/enrich.py,sha256=X1BBZshdZqPmbduzhGwH0ULSzq03L_7bf7_UL8yOQ9E,10608
+ fraudcrawler/scraping/serp.py,sha256=ix2kCs9Xo694K8vjDL104MDb2Cun1AXfStxMaR-2u7U,17865
+ fraudcrawler/scraping/zyte.py,sha256=DUF5pIwpZyQw30qURnFxtp8KYpUgBkrXjM7RaVGH92Q,7005
+ fraudcrawler/settings.py,sha256=z63Lc8LnmfG7u0F7CVlGOXMMpr7LtJC0BzXDoA8rN7Q,839
+ fraudcrawler-0.4.2.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+ fraudcrawler-0.4.2.dist-info/METADATA,sha256=M1xMdweLHpSbfEceT_5GpcDiLdDHpOHpzQ5w-ZNF4gQ,5931
+ fraudcrawler-0.4.2.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
+ fraudcrawler-0.4.2.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+ fraudcrawler-0.4.2.dist-info/RECORD,,
@@ -1,20 +0,0 @@
- fraudcrawler/__init__.py,sha256=2EgoTb2jNcQt1NxUV8za0154kb7ZnHZ_KeKgx21rdFs,679
- fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/base/base.py,sha256=woesbPztEh7tbD0ty9S37JbFrbEC-01H9etmCT2ffnc,4771
- fraudcrawler/base/client.py,sha256=GcTUMqLfvweLFdHy6CP9tgxsFQiPkc6KyiLcwLnDiw8,4412
- fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
- fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
- fraudcrawler/base/orchestrator.py,sha256=Gmryv8l8nB1QUwwjLoZGop2mwKqWYQQORT_96_w5ptA,23981
- fraudcrawler/launch_demo_pipeline.py,sha256=RIZTtdtZeJPhvSLp1IUjT_nhme_2q6mAGWKoL838E4E,4320
- fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/processing/processor.py,sha256=IFVKIiNi0QoCAgPFkFtNDgxfhh01iDNUyIBZWACplR8,3993
- fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/scraping/enrich.py,sha256=X1BBZshdZqPmbduzhGwH0ULSzq03L_7bf7_UL8yOQ9E,10608
- fraudcrawler/scraping/serp.py,sha256=xBXqBcgO25xtiV3CnTLRuUeKIpnWGaAyDqF7KeGAcks,11750
- fraudcrawler/scraping/zyte.py,sha256=ggI4iYG-E_UyiKgUpEFekeUd1giifEfJ_uyFUSJGSLY,6296
- fraudcrawler/settings.py,sha256=1SVxjwMLuZd_rr3KkwYoRozTBw2VQU-OJQkgA33k95Q,768
- fraudcrawler-0.3.10.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
- fraudcrawler-0.3.10.dist-info/METADATA,sha256=Nkr3t_4q_pejrdBFyzbOq9ePlauQwy-ZM_Njr1n6OSk,5966
- fraudcrawler-0.3.10.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
- fraudcrawler-0.3.10.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
- fraudcrawler-0.3.10.dist-info/RECORD,,