fraudcrawler-0.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fraudcrawler might be problematic.

@@ -0,0 +1,100 @@
+ import logging
+
+ from fraudcrawler import FraudCrawlerClient, Language, Location, Deepness, Prompt
+
+ LOG_FMT = "%(asctime)s | %(name)s | %(funcName)s | %(levelname)s | %(message)s"
+ LOG_LVL = "INFO"
+ DATE_FMT = "%Y-%m-%d %H:%M:%S"
+ logging.basicConfig(format=LOG_FMT, level=LOG_LVL, datefmt=DATE_FMT)
+
+
+ def main():
+     # Setup the client
+     client = FraudCrawlerClient()
+
+     # Setup the search
+     search_term = "Kühlschrank"
+     language = Language(name="German")
+     location = Location(name="Switzerland")
+     deepness = Deepness(num_results=20)
+     prompts = [
+         Prompt(
+             name="relevance",
+             context="This organization is interested in checking the energy efficiency of certain devices.",
+             system_prompt=(
+                 "You are a helpful and intelligent assistant. Your task is to classify any given product "
+                 "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
+                 "You must consider all aspects of the given context and make a binary decision accordingly. "
+                 "If the product aligns with the user's needs, classify it as 1 (relevant); otherwise, classify it as 0 (not relevant). "
+                 "Respond only with the number 1 or 0."
+             ),
+             allowed_classes=[0, 1],
+         ),
+         Prompt(
+             name="seriousness",
+             context="This organization is interested in checking the energy efficiency of certain devices.",
+             system_prompt=(
+                 "You are an intelligent and discerning assistant. Your task is to classify each item as either "
+                 "a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
+                 " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
+                 "within an online shop or marketplace.\n"
+                 " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
+                 " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
+                 "exact product itself, classify as 0.\n"
+                 " - Advertisements: Promotional content that doesn't directly sell a product.\n"
+                 " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
+                 " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
+                 "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
+             ),
+             allowed_classes=[0, 1],
+         ),
+     ]
+     # # Optional: Add search-term ENRICHMENT
+     # from fraudcrawler import Enrichment
+
+     # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)
+
+     # # Optional: Add MARKETPLACES and EXCLUDED_URLS
+     # from fraudcrawler import Host
+
+     # marketplaces = [
+     #     Host(name="International", domains="zavamed.com,apomeds.com"),
+     #     Host(name="National", domains="netdoktor.ch,nobelpharma.ch"),
+     # ]
+     # excluded_urls = [
+     #     Host(name="Digitec", domains="digitec.ch"),
+     #     Host(name="Brack", domains="brack.ch"),
+     # ]
+
+     # Execute the pipeline
+     client.execute(
+         search_term=search_term,
+         language=language,
+         location=location,
+         deepness=deepness,
+         prompts=prompts,
+         # marketplaces=marketplaces,
+         # excluded_urls=excluded_urls,
+     )
+
+     # Show results
+     print()
+     title = "Available results"
+     print(title)
+     print("=" * len(title))
+     client.print_available_results()
+     print()
+     title = f'Results for "{search_term.upper()}"'
+     print(title)
+     print("=" * len(title))
+     df = client.load_results()
+     print(f"Number of products found: {len(df)}")
+     print()
+     n_head = 10
+     print(f"First {n_head} products are:")
+     print(df.head(n=n_head))
+     print()
+
+
+ if __name__ == "__main__":
+     main()
File without changes
@@ -0,0 +1,105 @@
+ import logging
+
+ from openai import AsyncOpenAI
+
+ from fraudcrawler.base.base import Prompt
+ from fraudcrawler.settings import PROCESSOR_USER_PROMPT_TEMPLATE
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class Processor:
+     """Processes product data for classification based on a prompt configuration."""
+
+     def __init__(self, api_key: str, model: str):
+         """Initializes the Processor.
+
+         Args:
+             api_key: The OpenAI API key.
+             model: The OpenAI model to use.
+         """
+         self._client = AsyncOpenAI(api_key=api_key)
+         self._model = model
+
+     async def _call_openai_api(
+         self,
+         system_prompt: str,
+         user_prompt: str,
+         **kwargs,
+     ) -> str:
+         """Calls the OpenAI API with the given user prompt."""
+         response = await self._client.chat.completions.create(
+             model=self._model,
+             messages=[
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": user_prompt},
+             ],
+             **kwargs,
+         )
+         content = response.choices[0].message.content
+         if not content:
+             raise ValueError("Empty response from OpenAI API")
+         return content
+
+     async def classify(
+         self, prompt: Prompt, url: str, name: str | None, description: str | None
+     ) -> int:
+         """A generic classification method that classifies a product based on a prompt object.
+
+         Args:
+             prompt: The Prompt object providing the context, system_prompt, allowed_classes, and fallback.
+             url: Product URL (often used in the user_prompt).
+             name: Product name (often used in the user_prompt).
+             description: Product description (often used in the user_prompt).
+
+         Note:
+             This method returns `prompt.default_if_missing` if:
+                 - 'name' or 'description' is None,
+                 - an error occurs during the API call, or
+                 - the response isn't in allowed_classes.
+         """
+         # If required fields are missing, return the prompt's default fallback if provided.
+         if name is None or description is None:
+             logger.warning(
+                 f"Missing required fields for classification: name='{name}', description='{description}'"
+             )
+             return prompt.default_if_missing
+
+         # Substitute placeholders in user_prompt with the relevant arguments
+         user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
+             context=prompt.context,
+             url=url,
+             name=name,
+             description=description,
+         )
+
+         # Call the OpenAI API
+         try:
+             logger.debug(
+                 f'Calling OpenAI API for classification (name="{name}", prompt="{prompt.name}")'
+             )
+             content = await self._call_openai_api(
+                 system_prompt=prompt.system_prompt,
+                 user_prompt=user_prompt,
+                 max_tokens=1,
+             )
+             classification = int(content.strip())
+
+             # Enforce that the classification is in the allowed classes
+             if classification not in prompt.allowed_classes:
+                 logger.warning(
+                     f"Classification '{classification}' not in allowed classes {prompt.allowed_classes}"
+                 )
+                 return prompt.default_if_missing
+
+             logger.info(
+                 f'Classification for "{name}" (prompt={prompt.name}): {classification}'
+             )
+             return classification
+
+         except Exception as e:
+             logger.error(
+                 f'Error classifying product "{name}" with prompt "{prompt.name}": {e}'
+             )
+             return prompt.default_if_missing
File without changes
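For orientation, here is a minimal sketch of how the Processor above might be driven. It is an illustration under assumptions, not part of the package: the import path, API key, and model name are placeholders, and default_if_missing is passed explicitly only because classify() reads that attribute; the remaining Prompt fields match the sample script.

import asyncio

from fraudcrawler.base.base import Prompt
from fraudcrawler.processor import Processor  # hypothetical import path

async def demo():
    # Placeholder credentials and model name.
    processor = Processor(api_key="sk-...", model="gpt-4o-mini")
    prompt = Prompt(
        name="relevance",
        context="This organization is interested in checking the energy efficiency of certain devices.",
        system_prompt="Classify the product as relevant (1) or not relevant (0). Respond only with the number 1 or 0.",
        allowed_classes=[0, 1],
        default_if_missing=0,  # assumed field; classify() falls back to it on errors
    )
    label = await processor.classify(
        prompt=prompt,
        url="https://example.com/product/123",
        name="Kühlschrank XYZ",
        description="Energy-efficient refrigerator, 200 l.",
    )
    print(label)  # 1 or 0

asyncio.run(demo())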
@@ -0,0 +1,303 @@
+ from base64 import b64encode
+ from collections import defaultdict
+ import logging
+ from pydantic import BaseModel
+ from typing import Dict, List, Iterator
+
+ from fraudcrawler.settings import ENRICHMENT_DEFAULT_LIMIT
+ from fraudcrawler.base.base import Location, Language, AsyncClient
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class Keyword(BaseModel):
+     """Model for keyword details (e.g. `Keyword(text="sildenafil", volume=100)`)."""
+
+     text: str
+     volume: int
+
+
+ class Enricher(AsyncClient):
+     """A client to interact with the DataForSEO API for enhancing searches (producing alternative search_terms)."""
+
+     _auth_encoding = "ascii"
+     _max_retries = 3
+     _retry_delay = 2
+     _base_endpoint = "https://api.dataforseo.com"
+     _suggestions_endpoint = "/v3/dataforseo_labs/google/keyword_suggestions/live"
+     _keywords_endpoint = "/v3/dataforseo_labs/google/related_keywords/live"
+
+     def __init__(self, user: str, pwd: str):
+         """Initializes the DataForSeoApiClient with the given username and password.
+
+         Args:
+             user: The username for the DataForSEO API.
+             pwd: The password for the DataForSEO API.
+         """
+         self._user = user
+         self._pwd = pwd
+         auth = f"{user}:{pwd}"
+         auth = b64encode(auth.encode(self._auth_encoding)).decode(self._auth_encoding)
+         self._headers = {
+             "Authorization": f"Basic {auth}",
+             "Content-Encoding": "gzip",
+         }
+
+     @staticmethod
+     def _extract_items_from_data(data: dict) -> Iterator[dict]:
+         """Extracts the items from the DataForSEO response.
+
+         Args:
+             data: The response data from DataForSEO.
+         """
+         # `data.get("tasks") or []` also covers the case where data["tasks"] is
+         # explicitly set to None, which `data.get("tasks", [])` would not.
+         tasks = data.get("tasks") or []
+         for task in tasks:
+             results = task.get("result") or []
+             for result in results:
+                 items = result.get("items") or []
+                 yield from items
+
+     @staticmethod
+     def _parse_suggested_keyword(item: dict) -> Keyword:
+         """Parses a keyword from an item in the DataForSEO suggested keyword search response.
+
+         Args:
+             item: An item from the DataForSEO response.
+         """
+         text = item["keyword"]
+         volume = item["keyword_info"]["search_volume"]
+         return Keyword(text=text, volume=volume)
+
+     def _extract_suggested_keywords(self, data: dict) -> List[Keyword]:
+         """Extracts the keywords from the DataForSEO response for suggested keywords.
+
+         Args:
+             data: The response data from DataForSEO.
+
+         The DataForSEO results are of the form
+         (c.f. https://docs.dataforseo.com/v3/dataforseo_labs/google/keyword_suggestions/live/?bash):
+             {
+                 "tasks": [
+                     {
+                         "result": [
+                             {
+                                 "items": [
+                                     {
+                                         "keyword": <suggested-keyword>,
+                                         "keyword_info": {
+                                             "search_volume": <volume>
+                                         }
+                                     }
+                                 ]
+                             }
+                         ]
+                     }
+                 ]
+             }
+         """
+         keywords = []
+         for item in self._extract_items_from_data(data=data):
+             try:
+                 keyword = self._parse_suggested_keyword(item)
+                 keywords.append(keyword)
+             except Exception as e:
+                 logger.warning(f"Ignoring keyword due to error: {e}.")
+         return keywords
+
+     async def _get_suggested_keywords(
+         self,
+         search_term: str,
+         language: Language,
+         location: Location,
+         limit: int = ENRICHMENT_DEFAULT_LIMIT,
+     ) -> List[Keyword]:
+         """Get keyword suggestions for a given search_term.
+
+         Args:
+             search_term: The search term to use for the query.
+             language: The language to use for the search.
+             location: The location to use for the search.
+             limit: The upper limit of suggestions to get.
+         """
+
+         # Data must be a list of dictionaries setting a number of search tasks; here we only have one task.
+         data = [
+             {
+                 "keyword": search_term,
+                 "language_name": language.name,
+                 "location_name": location.name,
+                 "limit": limit,
+                 "include_serp_info": True,
+                 "include_seed_keyword": True,
+             }
+         ]
+         logger.debug(
+             f'DataForSEO search for suggested keywords with search_term="{search_term}".'
+         )
+         try:
+             url = f"{self._base_endpoint}{self._suggestions_endpoint}"
+             logger.debug(f'DataForSEO url="{url}" with data="{data}".')
+             sugg_data = await self.post(url=url, headers=self._headers, data=data)
+         except Exception as e:
+             logger.error(f"DataForSEO suggested search failed with error: {e}.")
+             # Bail out early: sugg_data is undefined past this point.
+             return []
+
+         # Extract the keywords from the response
+         try:
+             keywords = self._extract_suggested_keywords(data=sugg_data)
+         except Exception as e:
+             logger.error(
+                 f"Failed to extract suggested keywords from DataForSEO response with error: {e}."
+             )
+             return []
+
+         logger.debug(f"Found {len(keywords)} suggestions from DataForSEO search.")
+         return keywords
+
+     @staticmethod
+     def _parse_related_keyword(item: dict) -> Keyword:
+         """Parses a keyword from an item in the DataForSEO related keyword search response.
+
+         Args:
+             item: An item from the DataForSEO response.
+         """
+         text = item["keyword_data"]["keyword"]
+         volume = item["keyword_data"]["keyword_info"]["search_volume"]
+         return Keyword(text=text, volume=volume)
+
+     def _extract_related_keywords(self, data: dict) -> List[Keyword]:
+         """Extracts the keywords from the DataForSEO response for related keywords.
+
+         Args:
+             data: The response data from DataForSEO.
+
+         The DataForSEO results are of the form
+         (c.f. https://docs.dataforseo.com/v3/dataforseo_labs/google/related_keywords/live/?bash):
+             {
+                 "tasks": [
+                     {
+                         "result": [
+                             {
+                                 "items": [
+                                     {
+                                         "keyword_data": {
+                                             "keyword": <related-keyword>,
+                                             "keyword_info": {
+                                                 "search_volume": <volume>
+                                             }
+                                         }
+                                     }
+                                 ]
+                             }
+                         ]
+                     }
+                 ]
+             }
+         """
+         keywords = []
+         for item in self._extract_items_from_data(data=data):
+             try:
+                 keyword = self._parse_related_keyword(item)
+                 keywords.append(keyword)
+             except Exception as e:
+                 logger.warning(f"Ignoring keyword due to error: {e}.")
+         return keywords
+
+     async def _get_related_keywords(
+         self,
+         search_term: str,
+         language: Language,
+         location: Location,
+         limit: int = ENRICHMENT_DEFAULT_LIMIT,
+     ) -> List[Keyword]:
+         """Get related keywords for a given search_term.
+
+         Args:
+             search_term: The search term to use for the query.
+             language: The language to use for the search.
+             location: The location to use for the search.
+             limit: The upper limit of suggestions to get.
+         """
+
+         # Data must be a list of dictionaries setting a number of search tasks; here we only have one task.
+         data = [
+             {
+                 "keyword": search_term,
+                 "language_name": language.name,
+                 "location_name": location.name,
+                 "limit": limit,
+             }
+         ]
+         logger.debug(
+             f'DataForSEO search for related keywords with search_term="{search_term}".'
+         )
+         try:
+             url = f"{self._base_endpoint}{self._keywords_endpoint}"
+             logger.debug(f'DataForSEO url="{url}" with data="{data}".')
+             rel_data = await self.post(url=url, headers=self._headers, data=data)
+         except Exception as e:
+             logger.error(f"DataForSEO related keyword search failed with error: {e}.")
+             # Bail out early: rel_data is undefined past this point.
+             return []
+
+         # Extract the keywords from the response
+         try:
+             keywords = self._extract_related_keywords(data=rel_data)
+         except Exception as e:
+             logger.error(
+                 f"Failed to extract related keywords from DataForSEO response with error: {e}."
+             )
+             return []
+
+         logger.debug(f"Found {len(keywords)} related keywords from DataForSEO search.")
+         return keywords
+
+     async def apply(
+         self,
+         search_term: str,
+         language: Language,
+         location: Location,
+         n_terms: int,
+     ) -> List[str]:
+         """Applies the enrichment to a search_term.
+
+         Args:
+             search_term: The search term to use for the query.
+             language: The language to use for the search.
+             location: The location to use for the search.
+             n_terms: The number of additional terms to produce.
+         """
+         # Get the additional keywords
+         logger.info(
+             f'Applying enrichment for search_term="{search_term}" and n_terms="{n_terms}".'
+         )
+         suggested = await self._get_suggested_keywords(
+             search_term=search_term,
+             language=language,
+             location=location,
+             limit=n_terms,
+         )
+         related = await self._get_related_keywords(
+             search_term=search_term,
+             language=language,
+             location=location,
+             limit=n_terms,
+         )
+
+         # Remove the original keyword and deduplicate, keeping the maximal volume per keyword
+         keywords = [kw for kw in suggested + related if kw.text != search_term]
+         kw_vol: Dict[str, int] = defaultdict(int)
+         for kw in keywords:
+             kw_vol[kw.text] = max(kw.volume, kw_vol[kw.text])
+         keywords = [Keyword(text=k, volume=v) for k, v in kw_vol.items()]
+         logger.debug(f"Found {len(keywords)} additional unique keywords.")
+
+         # Sort the keywords by volume and keep the top n_terms
+         keywords = sorted(keywords, key=lambda kw: kw.volume, reverse=True)
+         terms = [kw.text for kw in keywords[:n_terms]]
+         logger.info(f"Produced {len(terms)} additional search_terms.")
+         return terms
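Similarly, a minimal sketch of how Enricher.apply might be called, assuming valid DataForSEO credentials; the import path is a placeholder, while Language and Location are the same models used in the sample script:

import asyncio

from fraudcrawler.base.base import Language, Location
from fraudcrawler.enricher import Enricher  # hypothetical import path

async def demo():
    # Placeholder DataForSEO credentials.
    enricher = Enricher(user="dfs-login", pwd="dfs-password")
    terms = await enricher.apply(
        search_term="Kühlschrank",
        language=Language(name="German"),
        location=Location(name="Switzerland"),
        n_terms=10,
    )
    print(terms)  # up to 10 alternative search terms, highest search volume first

asyncio.run(demo())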