fraudcrawler 0.4.0__tar.gz → 0.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of fraudcrawler has been flagged as potentially problematic.

Files changed (21)
  1. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/PKG-INFO +2 -2
  2. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/README.md +0 -1
  3. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/__init__.py +4 -1
  4. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/base/base.py +41 -2
  5. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/base/client.py +10 -2
  6. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/base/orchestrator.py +45 -47
  7. fraudcrawler-0.4.3/fraudcrawler/launch_demo_pipeline.py +101 -0
  8. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/processing/processor.py +11 -19
  9. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/scraping/serp.py +4 -2
  10. fraudcrawler-0.4.3/fraudcrawler/scraping/url.py +57 -0
  11. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/scraping/zyte.py +24 -1
  12. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/settings.py +14 -3
  13. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/pyproject.toml +2 -1
  14. fraudcrawler-0.4.0/fraudcrawler/launch_demo_pipeline.py +0 -100
  15. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/LICENSE +0 -0
  16. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/base/__init__.py +0 -0
  17. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/base/google-languages.json +0 -0
  18. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/base/google-locations.json +0 -0
  19. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/processing/__init__.py +0 -0
  20. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/scraping/__init__.py +0 -0
  21. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/scraping/enrich.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: fraudcrawler
3
- Version: 0.4.0
3
+ Version: 0.4.3
4
4
  Summary: Intelligent Market Monitoring
5
5
  Home-page: https://github.com/open-veanu/fraudcrawler
6
6
  License: MIT
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.11
13
13
  Classifier: Programming Language :: Python :: 3.12
14
14
  Classifier: Programming Language :: Python :: 3.13
15
15
  Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
16
+ Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
16
17
  Requires-Dist: openai (>=1.68.2,<2.0.0)
17
18
  Requires-Dist: pandas (>=2.2.3,<3.0.0)
18
19
  Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
@@ -80,7 +81,6 @@ deepness = Deepness(num_results=50)
80
81
  prompts = [
81
82
  Prompt(
82
83
  name="relevance",
83
- context="This organization is interested in medical products and drugs.",
84
84
  system_prompt=(
85
85
  "You are a helpful and intelligent assistant. Your task is to classify any given product "
86
86
  "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
@@ -58,7 +58,6 @@ deepness = Deepness(num_results=50)
58
58
  prompts = [
59
59
  Prompt(
60
60
  name="relevance",
61
- context="This organization is interested in medical products and drugs.",
62
61
  system_prompt=(
63
62
  "You are a helpful and intelligent assistant. Your task is to classify any given product "
64
63
  "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
@@ -1,8 +1,9 @@
1
1
  from fraudcrawler.scraping.serp import SerpApi, SearchEngine
2
2
  from fraudcrawler.scraping.enrich import Enricher
3
+ from fraudcrawler.scraping.url import URLCollector
3
4
  from fraudcrawler.scraping.zyte import ZyteApi
4
5
  from fraudcrawler.processing.processor import Processor
5
- from fraudcrawler.base.orchestrator import Orchestrator, ProductItem
6
+ from fraudcrawler.base.orchestrator import Orchestrator
6
7
  from fraudcrawler.base.client import FraudCrawlerClient
7
8
  from fraudcrawler.base.base import (
8
9
  Deepness,
@@ -11,12 +12,14 @@ from fraudcrawler.base.base import (
11
12
  Language,
12
13
  Location,
13
14
  Prompt,
15
+ ProductItem,
14
16
  )
15
17
 
16
18
  __all__ = [
17
19
  "SerpApi",
18
20
  "SearchEngine",
19
21
  "Enricher",
22
+ "URLCollector",
20
23
  "ZyteApi",
21
24
  "Processor",
22
25
  "Orchestrator",
@@ -2,12 +2,13 @@ import json
2
2
  import logging
3
3
  from pydantic import (
4
4
  BaseModel,
5
+ Field,
5
6
  field_validator,
6
7
  model_validator,
7
8
  )
8
9
  from pydantic_settings import BaseSettings
9
10
  import re
10
- from typing import List
11
+ from typing import List, Dict
11
12
 
12
13
  import aiohttp
13
14
 
@@ -114,12 +115,39 @@ class Deepness(BaseModel):
114
115
  enrichment: Enrichment | None = None
115
116
 
116
117
 
118
+ class ProductItem(BaseModel):
119
+ """Model representing a product item."""
120
+
121
+ # Serp/Enrich parameters
122
+ search_term: str
123
+ search_term_type: str
124
+ url: str
125
+ marketplace_name: str
126
+ domain: str
127
+
128
+ # Zyte parameters
129
+ product_name: str | None = None
130
+ product_price: str | None = None
131
+ product_description: str | None = None
132
+ product_images: List[str] | None = None
133
+ probability: float | None = None
134
+ html: str | None = None
135
+ html_clean: str | None = None
136
+
137
+ # Processor parameters are set dynamic so we must allow extra fields
138
+ classifications: Dict[str, int] = Field(default_factory=dict)
139
+
140
+ # Filtering parameters
141
+ filtered: bool = False
142
+ filtered_at_stage: str | None = None
143
+
144
+
117
145
  class Prompt(BaseModel):
118
146
  """Model for prompts."""
119
147
 
120
148
  name: str
121
- context: str
122
149
  system_prompt: str
150
+ product_item_fields: List[str]
123
151
  allowed_classes: List[int]
124
152
 
125
153
  @field_validator("allowed_classes", mode="before")
@@ -129,6 +157,17 @@ class Prompt(BaseModel):
129
157
  raise ValueError("all values in allowed_classes must be positive integers.")
130
158
  return val
131
159
 
160
+ @field_validator("product_item_fields", mode="before")
161
+ def validate_product_item_fields(cls, val):
162
+ """Ensure all product_item_fields are valid ProductItem attributes."""
163
+ valid_fields = set(ProductItem.model_fields.keys())
164
+ for field in val:
165
+ if field not in valid_fields:
166
+ raise ValueError(
167
+ f"Invalid product_item_field: '{field}'. Must be one of: {sorted(valid_fields)}"
168
+ )
169
+ return val
170
+
132
171
 
133
172
  class AsyncClient:
134
173
  """Base class for sub-classes using async HTTP requests."""
@@ -9,8 +9,16 @@ from typing import List
9
9
  import pandas as pd
10
10
 
11
11
  from fraudcrawler.settings import ROOT_DIR
12
- from fraudcrawler.base.base import Setup, Language, Location, Deepness, Host, Prompt
13
- from fraudcrawler.base.orchestrator import Orchestrator, ProductItem
12
+ from fraudcrawler.base.base import (
13
+ Setup,
14
+ Language,
15
+ Location,
16
+ Deepness,
17
+ Host,
18
+ Prompt,
19
+ ProductItem,
20
+ )
21
+ from fraudcrawler.base.orchestrator import Orchestrator
14
22
  from fraudcrawler.scraping.serp import SearchEngine
15
23
 
16
24
  logger = logging.getLogger(__name__)
@@ -1,12 +1,14 @@
1
1
  from abc import ABC, abstractmethod
2
2
  import asyncio
3
3
  import logging
4
- from pydantic import BaseModel, Field
5
- from typing import Dict, List, Set, cast
4
+ from typing import Dict, List, cast
5
+
6
+ from bs4 import BeautifulSoup
6
7
 
7
8
  from fraudcrawler.settings import (
8
9
  PROCESSOR_DEFAULT_MODEL,
9
10
  PROCESSOR_DEFAULT_IF_MISSING,
11
+ PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
10
12
  MAX_RETRIES,
11
13
  RETRY_DELAY,
12
14
  )
@@ -15,37 +17,26 @@ from fraudcrawler.settings import (
15
17
  DEFAULT_N_ZYTE_WKRS,
16
18
  DEFAULT_N_PROC_WKRS,
17
19
  )
18
- from fraudcrawler.base.base import Deepness, Host, Language, Location, Prompt
19
- from fraudcrawler import SerpApi, SearchEngine, Enricher, ZyteApi, Processor
20
+ from fraudcrawler.base.base import (
21
+ Deepness,
22
+ Host,
23
+ Language,
24
+ Location,
25
+ Prompt,
26
+ ProductItem,
27
+ )
28
+ from fraudcrawler import (
29
+ SerpApi,
30
+ SearchEngine,
31
+ Enricher,
32
+ URLCollector,
33
+ ZyteApi,
34
+ Processor,
35
+ )
20
36
 
21
37
  logger = logging.getLogger(__name__)
22
38
 
23
39
 
24
- class ProductItem(BaseModel):
25
- """Model representing a product item."""
26
-
27
- # Serp/Enrich parameters
28
- search_term: str
29
- search_term_type: str
30
- url: str
31
- marketplace_name: str
32
- domain: str
33
-
34
- # Zyte parameters
35
- product_name: str | None = None
36
- product_price: str | None = None
37
- product_description: str | None = None
38
- product_images: List[str] | None = None
39
- probability: float | None = None
40
-
41
- # Processor parameters are set dynamic so we must allow extra fields
42
- classifications: Dict[str, int] = Field(default_factory=dict)
43
-
44
- # Filtering parameters
45
- filtered: bool = False
46
- filtered_at_stage: str | None = None
47
-
48
-
49
40
  class Orchestrator(ABC):
50
41
  """Abstract base class for orchestrating the different actors (crawling, processing).
51
42
 
@@ -92,15 +83,12 @@ class Orchestrator(ABC):
92
83
  n_zyte_wkrs: Number of async workers for zyte (optional).
93
84
  n_proc_wkrs: Number of async workers for the processor (optional).
94
85
  """
95
- # Setup the variables
96
- self._collected_urls_current_run: Set[str] = set()
97
- self._collected_urls_previous_runs: Set[str] = set()
98
-
99
86
  # Setup the clients
100
87
  self._serpapi = SerpApi(
101
88
  api_key=serpapi_key, max_retries=max_retries, retry_delay=retry_delay
102
89
  )
103
90
  self._enricher = Enricher(user=dataforseo_user, pwd=dataforseo_pwd)
91
+ self._url_collector = URLCollector()
104
92
  self._zyteapi = ZyteApi(
105
93
  api_key=zyteapi_key, max_retries=max_retries, retry_delay=retry_delay
106
94
  )
@@ -173,16 +161,18 @@ class Orchestrator(ABC):
173
161
  break
174
162
 
175
163
  if not product.filtered:
176
- url = product.url
164
+ # Clean the URL by removing tracking parameters
165
+ url = self._url_collector.remove_tracking_parameters(product.url)
166
+ product.url = url
177
167
 
178
- if url in self._collected_urls_current_run:
168
+ if url in self._url_collector.collected_currently:
179
169
  # deduplicate on current run
180
170
  product.filtered = True
181
171
  product.filtered_at_stage = (
182
172
  "URL collection (current run deduplication)"
183
173
  )
184
174
  logger.debug(f"URL {url} already collected in current run")
185
- elif url in self._collected_urls_previous_runs:
175
+ elif url in self._url_collector.collected_previously:
186
176
  # deduplicate on previous runs coming from a db
187
177
  product.filtered = True
188
178
  product.filtered_at_stage = (
@@ -190,7 +180,7 @@ class Orchestrator(ABC):
190
180
  )
191
181
  logger.debug(f"URL {url} as already collected in previous run")
192
182
  else:
193
- self._collected_urls_current_run.add(url)
183
+ self._url_collector.collected_currently.add(url)
194
184
 
195
185
  await queue_out.put(product)
196
186
  queue_in.task_done()
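
The deduplication sets that previously lived on the orchestrator are now owned by `URLCollector` (added in `fraudcrawler/scraping/url.py`, shown further down). A rough standalone sketch of the per-product decision this worker makes, using invented URLs:

    from fraudcrawler import URLCollector

    collector = URLCollector()
    collector.collected_previously = {"https://shop.example.ch/item/42"}  # e.g. seeded from previously_collected_urls

    def keep(raw_url: str) -> bool:
        """Mirrors the worker's decision: True if the product passes URL collection."""
        url = collector.remove_tracking_parameters(raw_url)
        if url in collector.collected_currently:
            return False  # filtered_at_stage = "URL collection (current run deduplication)"
        if url in collector.collected_previously:
            return False  # filtered at the previous-run deduplication stage
        collector.collected_currently.add(url)
        return True

    print(keep("https://shop.example.ch/item/7?utm_source=serp"))   # True, tracking parameter stripped first
    print(keep("https://shop.example.ch/item/7"))                    # False, already seen in this run
    print(keep("https://shop.example.ch/item/42?utm_medium=email"))  # False, known from a previous run
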
@@ -231,15 +221,16 @@ class Orchestrator(ABC):
231
221
  product.probability = self._zyteapi.extract_probability(
232
222
  details=details
233
223
  )
234
-
224
+ product.html = self._zyteapi.extract_html(details=details)
225
+ if product.html:
226
+ soup = BeautifulSoup(product.html, "html.parser")
227
+ product.html_clean = soup.get_text(separator=" ", strip=True)
235
228
  # Filter the product based on the probability threshold
236
229
  if not self._zyteapi.keep_product(details=details):
237
230
  product.filtered = True
238
231
  product.filtered_at_stage = "Zyte probability threshold"
239
-
240
232
  except Exception as e:
241
233
  logger.warning(f"Error executing Zyte API search: {e}.")
242
-
243
234
  await queue_out.put(product)
244
235
  queue_in.task_done()
245
236
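
The new `html_clean` field is the raw Zyte HTML flattened to plain text with BeautifulSoup's `get_text`. A small self-contained sketch of that step (the HTML snippet is made up):

    from bs4 import BeautifulSoup

    html = "<html><body><h1>Medion Kühlbox MD 37454</h1><p>In stock, ships within 2 days.</p></body></html>"
    soup = BeautifulSoup(html, "html.parser")
    html_clean = soup.get_text(separator=" ", strip=True)
    print(html_clean)  # -> "Medion Kühlbox MD 37454 In stock, ships within 2 days."
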
 
@@ -269,19 +260,26 @@ class Orchestrator(ABC):
269
260
  if not product.filtered:
270
261
  try:
271
262
  url = product.url
272
- name = product.product_name
273
- description = product.product_description
274
-
275
263
  # Run all the configured prompts
276
264
  for prompt in prompts:
265
+ # Dynamically build product_details string
266
+ details = []
267
+ for field in prompt.product_item_fields:
268
+ value = getattr(product, field, None)
269
+ if value is not None:
270
+ details.append(
271
+ PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
272
+ field_name=field, field_value=value
273
+ )
274
+ )
275
+ product_details = "\n\n".join(details)
277
276
  logger.debug(
278
- f"Classify product {name} with prompt {prompt.name}"
277
+ f"Classify product at {url} with prompt {prompt.name} and details: {product_details}"
279
278
  )
280
279
  classification = await self._processor.classify(
281
280
  prompt=prompt,
282
281
  url=url,
283
- name=name,
284
- description=description,
282
+ product_details=product_details,
285
283
  )
286
284
  product.classifications[prompt.name] = classification
287
285
  except Exception as e:
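
With `Prompt.context` removed, the text sent to the classifier is now assembled per prompt from whichever `ProductItem` fields the prompt lists, using `PROCESSOR_PRODUCT_DETAILS_TEMPLATE` from `settings.py`. A minimal reconstruction of that assembly outside the worker; all field values below are invented:

    from fraudcrawler import Prompt, ProductItem
    from fraudcrawler.settings import PROCESSOR_PRODUCT_DETAILS_TEMPLATE

    prompt = Prompt(
        name="availability",
        system_prompt="Respond only with 1 (available) or 0 (not available).",
        product_item_fields=["product_name", "product_price"],
        allowed_classes=[0, 1],
    )
    product = ProductItem(
        search_term="Medion Kühlbox MD 37454",
        search_term_type="initial",
        url="https://shop.example.ch/item/42",
        marketplace_name="Example Shop",
        domain="shop.example.ch",
        product_name="Medion Kühlbox MD 37454",
        product_price="99.95 CHF",
    )

    details = []
    for field in prompt.product_item_fields:
        value = getattr(product, field, None)
        if value is not None:
            details.append(
                PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(field_name=field, field_value=value)
            )
    product_details = "\n\n".join(details)
    print(product_details)
    # product_name:
    # Medion Kühlbox MD 37454
    #
    # product_price:
    # 99.95 CHF
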
@@ -489,7 +487,7 @@ class Orchestrator(ABC):
489
487
  # INITIAL SETUP
490
488
  # ---------------------------
491
489
  if previously_collected_urls:
492
- self._collected_urls_previous_runs = set(self._collected_urls_current_run)
490
+ self._url_collector.collected_previously = set(previously_collected_urls)
493
491
 
494
492
  # Setup the async framework
495
493
  n_terms_max = 1 + (
@@ -0,0 +1,101 @@
1
+ import logging
2
+
3
+ from fraudcrawler import FraudCrawlerClient, Language, Location, Deepness, Prompt
4
+
5
+ LOG_FMT = "%(asctime)s | %(name)s | %(funcName)s | %(levelname)s | %(message)s"
6
+ LOG_LVL = "INFO"
7
+ DATE_FMT = "%Y-%m-%d %H:%M:%S"
8
+ logging.basicConfig(format=LOG_FMT, level=LOG_LVL, datefmt=DATE_FMT)
9
+
10
+
11
+ def main():
12
+ # Setup the client
13
+ client = FraudCrawlerClient()
14
+
15
+ # Setup the search
16
+ search_term = "Medion Kühlbox MD 37454"
17
+ language = Language(name="German")
18
+ location = Location(name="Switzerland")
19
+ deepness = Deepness(num_results=10)
20
+ prompts = [
21
+ Prompt(
22
+ name="availability",
23
+ system_prompt=(
24
+ "You are a helpful and intelligent assistant helping an organization that is interested in checking the availability of certain products."
25
+ "Your task is to classify any given product as either available (1) or not available (0), strictly based on the context and product details provided by the user. "
26
+ "You must consider all aspects of the given context and make a binary decision accordingly. "
27
+ "If the product can be purchased, added to a shopping basket, delivered, or is listed as available in any form, classify it as 1 (available); "
28
+ "if there is any mention of out of stock, not available, no longer shippable, or similar, classify it as 0 (not available). "
29
+ "Respond only with the number 1 or 0."
30
+ ),
31
+ product_item_fields=["product_name", "html_clean"],
32
+ allowed_classes=[0, 1],
33
+ ),
34
+ # Prompt(
35
+ # name="seriousness",
36
+ # system_prompt=(
37
+ # "You are a helpful and intelligent assistant helping an organization that is interested in checking the energy efficiency of certain devices. "
38
+ # "Your task is to classify each item as either a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
39
+ # " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
40
+ # "within an online shop or marketplace.\n"
41
+ # " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
42
+ # " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
43
+ # "exact product itself, classify as 0.\n"
44
+ # " - Advertisements: Promotional content that doesn't directly sell a product.\n"
45
+ # " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
46
+ # " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
47
+ # "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
48
+ # ),
49
+ # product_item_fields=["product_name", "product_description"],
50
+ # allowed_classes=[0, 1],
51
+ # ),
52
+ ]
53
+ # # Optional: Add tern ENRICHEMENT
54
+ # from fraudcrawler import Enrichment
55
+
56
+ # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)
57
+
58
+ # # Optional: Add MARKETPLACES and EXCLUDED_URLS
59
+ # from fraudcrawler import Host
60
+
61
+ # marketplaces = [
62
+ # Host(name="International", domains="zavamed.com,apomeds.com"),
63
+ # Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
64
+ # ]
65
+ # excluded_urls = [
66
+ # Host(name="Digitec", domains="digitec.ch"),
67
+ # Host(name="Brack", domains="brack.ch"),
68
+ # ]
69
+
70
+ # Execute the pipeline
71
+ client.execute(
72
+ search_term=search_term,
73
+ language=language,
74
+ location=location,
75
+ deepness=deepness,
76
+ prompts=prompts,
77
+ # marketplaces=marketplaces,
78
+ # excluded_urls=excluded_urls,
79
+ )
80
+
81
+ # Show results
82
+ print()
83
+ title = "Available results"
84
+ print(title)
85
+ print("=" * len(title))
86
+ client.print_available_results()
87
+ print()
88
+ title = f'Results for "{search_term.upper()}"'
89
+ print(title)
90
+ print("=" * len(title))
91
+ df = client.load_results()
92
+ print(f"Number of products found: {len(df)}")
93
+ print()
94
+ n_head = 10
95
+ print(f"First {n_head} products are:")
96
+ print(df.head(n=n_head))
97
+ print()
98
+
99
+
100
+ if __name__ == "__main__":
101
+ main()
@@ -52,42 +52,34 @@ class Processor:
52
52
  raise ValueError("Empty response from OpenAI API")
53
53
  return content
54
54
 
55
- async def classify(
56
- self, prompt: Prompt, url: str, name: str | None, description: str | None
57
- ) -> int:
58
- """A generic classification method that classified a product based on a prompt object.
55
+ async def classify(self, prompt: Prompt, url: str, product_details: str) -> int:
56
+ """A generic classification method that classifies a product based on a prompt object.
59
57
 
60
58
  Args:
61
- prompt: A dictionary with keys "system_prompt", "user_prompt", etc.
59
+ prompt: A dictionary with keys "system_prompt", etc.
62
60
  url: Product URL (often used in the user_prompt).
63
- name: Product name (often used in the user_prompt).
64
- description: Product description (often used in the user_prompt).
61
+ product_details: String with product details, formatted per prompt.product_item_fields.
65
62
 
66
63
  Note:
67
64
  This method returns `PROCESSOR_DEFAULT_IF_MISSING` if:
68
- - 'name' or 'description' is None
65
+ - product_details is empty
69
66
  - an error occurs during the API call
70
67
  - if the response isn't in allowed_classes.
71
68
  """
72
69
  # If required fields are missing, return the prompt's default fallback if provided.
73
- if name is None or description is None:
74
- logger.warning(
75
- f"Missing required fields for classification: name='{name}', description='{description}'"
76
- )
70
+ if not product_details:
71
+ logger.warning("Missing required product_details for classification.")
77
72
  return self._default_if_missing
78
73
 
79
74
  # Substitute placeholders in user_prompt with the relevant arguments
80
75
  user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
81
- context=prompt.context,
82
- url=url,
83
- name=name,
84
- description=description,
76
+ product_details=product_details,
85
77
  )
86
78
 
87
79
  # Call the OpenAI API
88
80
  try:
89
81
  logger.debug(
90
- f'Calling OpenAI API for classification (name="{name}", prompt="{prompt.name}")'
82
+ f'Calling OpenAI API for classification (url="{url}", prompt="{prompt.name}")'
91
83
  )
92
84
  content = await self._call_openai_api(
93
85
  system_prompt=prompt.system_prompt,
@@ -104,12 +96,12 @@ class Processor:
104
96
  return self._default_if_missing
105
97
 
106
98
  logger.info(
107
- f'Classification for "{name}" (prompt={prompt.name}): {classification}'
99
+ f'Classification for url="{url}" (prompt={prompt.name}): {classification}'
108
100
  )
109
101
  return classification
110
102
 
111
103
  except Exception as e:
112
104
  logger.error(
113
- f'Error classifying product "{name}" with prompt "{prompt.name}": {e}'
105
+ f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
114
106
  )
115
107
  return self._default_if_missing
@@ -4,16 +4,17 @@ import logging
4
4
  from pydantic import BaseModel
5
5
  from typing import List
6
6
  from urllib.parse import urlparse
7
+ import re
7
8
 
8
9
  from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
9
10
  from fraudcrawler.base.base import Host, Language, Location, AsyncClient
10
- import re
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
14
14
 
15
15
  class SerpResult(BaseModel):
16
16
  """Model for a single search result from SerpApi."""
17
+
17
18
  url: str
18
19
  domain: str
19
20
  marketplace_name: str
@@ -23,6 +24,7 @@ class SerpResult(BaseModel):
23
24
 
24
25
  class SearchEngine(Enum):
25
26
  """Enum for the supported search engines."""
27
+
26
28
  GOOGLE = "google"
27
29
  GOOGLE_SHOPPING = "google_shopping"
28
30
 
@@ -33,7 +35,7 @@ class SerpApi(AsyncClient):
33
35
  _endpoint = "https://serpapi.com/search"
34
36
  _engine_marketplace_names = {
35
37
  SearchEngine.GOOGLE.value: "Google",
36
- SearchEngine.GOOGLE_SHOPPING.value: "Google Shopping"
38
+ SearchEngine.GOOGLE_SHOPPING.value: "Google Shopping",
37
39
  }
38
40
  _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
39
41
 
@@ -0,0 +1,57 @@
1
+ import logging
2
+ from typing import List, Set, Tuple
3
+ from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
4
+
5
+ from fraudcrawler.settings import KNOWN_TRACKERS
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ class URLCollector:
11
+ """A class to collect and de-duplicate URLs."""
12
+
13
+ def __init__(self):
14
+ self.collected_currently: Set[str] = set()
15
+ self.collected_previously: Set[str] = set()
16
+
17
+ @staticmethod
18
+ def remove_tracking_parameters(url: str) -> str:
19
+ """Remove tracking parameters from URLs.
20
+
21
+ Args:
22
+ url: The URL to clean.
23
+
24
+ Returns:
25
+ The cleaned URL without tracking parameters.
26
+ """
27
+ logging.debug(f"Removing tracking parameters from URL: {url}")
28
+
29
+ # Parse the url
30
+ parsed_url = urlparse(url)
31
+
32
+ # Parse query parameters
33
+ queries: List[Tuple[str, str]] = parse_qsl(
34
+ parsed_url.query, keep_blank_values=True
35
+ )
36
+ remove_all = url.startswith(
37
+ "https://www.ebay"
38
+ ) # eBay URLs have all query parameters as tracking parameters
39
+ if remove_all:
40
+ filtered_queries = []
41
+ else:
42
+ filtered_queries = [
43
+ q
44
+ for q in queries
45
+ if not any(q[0].startswith(tracker) for tracker in KNOWN_TRACKERS)
46
+ ]
47
+
48
+ # Rebuild the URL without tracking parameters
49
+ clean_url = ParseResult(
50
+ scheme=parsed_url.scheme,
51
+ netloc=parsed_url.netloc,
52
+ path=parsed_url.path,
53
+ params=parsed_url.params,
54
+ query=urlencode(filtered_queries, quote_via=quote),
55
+ fragment=parsed_url.fragment,
56
+ )
57
+ return urlunparse(clean_url)
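
Two behaviours of `remove_tracking_parameters` worth noting: query parameters whose names start with any entry in `KNOWN_TRACKERS` are dropped, and URLs beginning with `https://www.ebay` lose their query string entirely. A quick sketch with made-up URLs:

    from fraudcrawler import URLCollector

    # utm_* and srsltid are stripped, the functional "color" parameter survives
    print(URLCollector.remove_tracking_parameters(
        "https://shop.example.ch/item/42?color=blue&utm_source=google&srsltid=abc123"
    ))
    # -> https://shop.example.ch/item/42?color=blue

    # every query parameter on an eBay URL is treated as tracking
    print(URLCollector.remove_tracking_parameters(
        "https://www.ebay.ch/itm/1234567890?hash=item1&_trkparms=foo"
    ))
    # -> https://www.ebay.ch/itm/1234567890
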
@@ -1,6 +1,7 @@
1
1
  import asyncio
2
2
  import logging
3
3
  from typing import List
4
+ from base64 import b64decode
4
5
 
5
6
  import aiohttp
6
7
 
@@ -68,7 +69,8 @@ class ZyteApi(AsyncClient):
68
69
  "metadata": {
69
70
  "probability": float,
70
71
  },
71
- }
72
+ },
73
+ "httpResponseBody": base64
72
74
  }
73
75
  """
74
76
  logger.info(f"Fetching product details by Zyte for URL {url}.")
@@ -192,3 +194,24 @@ class ZyteApi(AsyncClient):
192
194
  }
193
195
  """
194
196
  return float(details.get("product", {}).get("metadata", {}).get("probability"))
197
+
198
+ @staticmethod
199
+ def extract_html(details: dict) -> str | None:
200
+ """Extracts the HTML from the Zyte API response.
201
+
202
+ The input argument is a dictionary of the following structure:
203
+ {
204
+ "httpResponseBody": base64
205
+ }
206
+ """
207
+
208
+ # Get the Base64-encoded content
209
+ encoded = details.get("httpResponseBody")
210
+
211
+ # Decode it into bytes
212
+ if isinstance(encoded, str):
213
+ decoded_bytes = b64decode(encoded)
214
+
215
+ # Convert bytes to string (assuming UTF-8 encoding)
216
+ decoded_string = decoded_bytes.decode("utf-8")
217
+ return decoded_string
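
`extract_html` base64-decodes `httpResponseBody` and assumes UTF-8; when the key is missing or not a string, the method falls through and implicitly returns None. A self-contained sketch against a fabricated response dictionary:

    from base64 import b64encode
    from fraudcrawler import ZyteApi

    page = "<html><body><h1>Medion Kühlbox MD 37454</h1></body></html>"
    fake_response = {"httpResponseBody": b64encode(page.encode("utf-8")).decode("ascii")}

    print(ZyteApi.extract_html(details=fake_response))  # the original HTML string
    print(ZyteApi.extract_html(details={}))              # None: no httpResponseBody present
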
@@ -13,6 +13,18 @@ SERP_DEFAULT_COUNTRY_CODES: List[str] = [
13
13
  # ".com",
14
14
  ]
15
15
 
16
+ # URL De-duplication settings
17
+ KNOWN_TRACKERS = [
18
+ "srsltid",
19
+ "utm_source",
20
+ "utm_medium",
21
+ "utm_campaign",
22
+ "utm_term",
23
+ "utm_content",
24
+ "ar",
25
+ "ps",
26
+ ]
27
+
16
28
  # Enrichment settings
17
29
  ENRICHMENT_DEFAULT_LIMIT = 10
18
30
 
@@ -22,9 +34,8 @@ ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
22
34
  # Processor settings
23
35
  PROCESSOR_DEFAULT_MODEL = "gpt-4o"
24
36
  PROCESSOR_DEFAULT_IF_MISSING = -1
25
- PROCESSOR_USER_PROMPT_TEMPLATE = (
26
- "Context: {context}\n\nProduct Details: {name}\n{description}\\n\nRelevance:"
27
- )
37
+ PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
38
+ PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
28
39
 
29
40
  # Async settings
30
41
  DEFAULT_N_SERP_WKRS = 10
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "fraudcrawler"
7
- version = "0.4.0"
7
+ version = "0.4.3"
8
8
  description = "Intelligent Market Monitoring"
9
9
  authors = [
10
10
  "Domingo Bertus <hello@veanu.ch>",
@@ -25,6 +25,7 @@ pandas = "^2.2.3"
25
25
  aiohttp = "^3.11.14"
26
26
  pydantic-settings = "^2.8.1"
27
27
  openai = "^1.68.2"
28
+ beautifulsoup4 = "^4.13.4"
28
29
 
29
30
  [tool.poetry.group.dev.dependencies]
30
31
  pytest-cov = "^6.0.0"
@@ -1,100 +0,0 @@
1
- import logging
2
-
3
- from fraudcrawler import FraudCrawlerClient, Language, Location, Deepness, Prompt
4
-
5
- LOG_FMT = "%(asctime)s | %(name)s | %(funcName)s | %(levelname)s | %(message)s"
6
- LOG_LVL = "INFO"
7
- DATE_FMT = "%Y-%m-%d %H:%M:%S"
8
- logging.basicConfig(format=LOG_FMT, level=LOG_LVL, datefmt=DATE_FMT)
9
-
10
-
11
- def main():
12
- # Setup the client
13
- client = FraudCrawlerClient()
14
-
15
- # Setup the search
16
- search_term = "Kühlschrank"
17
- language = Language(name="German")
18
- location = Location(name="Switzerland")
19
- deepness = Deepness(num_results=20)
20
- prompts = [
21
- Prompt(
22
- name="relevance",
23
- context="This organization is interested in checking the energy efficiency of certain devices.",
24
- system_prompt=(
25
- "You are a helpful and intelligent assistant. Your task is to classify any given product "
26
- "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
27
- "You must consider all aspects of the given context and make a binary decision accordingly. "
28
- "If the product aligns with the user's needs, classify it as 1 (relevant); otherwise, classify it as 0 (not relevant). "
29
- "Respond only with the number 1 or 0."
30
- ),
31
- allowed_classes=[0, 1],
32
- ),
33
- Prompt(
34
- name="seriousness",
35
- context="This organization is interested in checking the energy efficiency of certain devices.",
36
- system_prompt=(
37
- "You are an intelligent and discerning assistant. Your task is to classify each item as either "
38
- "a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
39
- " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
40
- "within an online shop or marketplace.\n"
41
- " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
42
- " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
43
- "exact product itself, classify as 0.\n"
44
- " - Advertisements: Promotional content that doesn't directly sell a product.\n"
45
- " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
46
- " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
47
- "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
48
- ),
49
- allowed_classes=[0, 1],
50
- ),
51
- ]
52
- # # Optional: Add tern ENRICHEMENT
53
- # from fraudcrawler import Enrichment
54
-
55
- # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)
56
-
57
- # # Optional: Add MARKETPLACES and EXCLUDED_URLS
58
- # from fraudcrawler import Host
59
-
60
- # marketplaces = [
61
- # Host(name="International", domains="zavamed.com,apomeds.com"),
62
- # Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
63
- # ]
64
- # excluded_urls = [
65
- # Host(name="Digitec", domains="digitec.ch"),
66
- # Host(name="Brack", domains="brack.ch"),
67
- # ]
68
-
69
- # Execute the pipeline
70
- client.execute(
71
- search_term=search_term,
72
- language=language,
73
- location=location,
74
- deepness=deepness,
75
- prompts=prompts,
76
- # marketplaces=marketplaces,
77
- # excluded_urls=excluded_urls,
78
- )
79
-
80
- # Show results
81
- print()
82
- title = "Available results"
83
- print(title)
84
- print("=" * len(title))
85
- client.print_available_results()
86
- print()
87
- title = f'Results for "{search_term.upper()}"'
88
- print(title)
89
- print("=" * len(title))
90
- df = client.load_results()
91
- print(f"Number of products found: {len(df)}")
92
- print()
93
- n_head = 10
94
- print(f"First {n_head} products are:")
95
- print(df.head(n=n_head))
96
- print()
97
-
98
-
99
- if __name__ == "__main__":
100
- main()