fraudcrawler 0.5.0__py3-none-any.whl → 0.7.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fraudcrawler/__init__.py +21 -5
- fraudcrawler/base/base.py +18 -38
- fraudcrawler/base/client.py +57 -60
- fraudcrawler/base/orchestrator.py +277 -276
- fraudcrawler/base/retry.py +25 -11
- fraudcrawler/launch_demo_pipeline.py +103 -41
- fraudcrawler/processing/base.py +151 -0
- fraudcrawler/processing/openai.py +521 -0
- fraudcrawler/scraping/enrich.py +6 -4
- fraudcrawler/scraping/search.py +370 -110
- fraudcrawler/scraping/url.py +42 -3
- fraudcrawler/scraping/zyte.py +146 -80
- fraudcrawler/settings.py +22 -10
- fraudcrawler-0.7.26.dist-info/METADATA +173 -0
- fraudcrawler-0.7.26.dist-info/RECORD +23 -0
- fraudcrawler/processing/processor.py +0 -199
- fraudcrawler-0.5.0.dist-info/METADATA +0 -167
- fraudcrawler-0.5.0.dist-info/RECORD +0 -22
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/entry_points.txt +0 -0
fraudcrawler/processing/processor.py (removed; 0.7.26 instead ships fraudcrawler/processing/base.py and fraudcrawler/processing/openai.py)

@@ -1,199 +0,0 @@
```python
import logging

import httpx
from openai import AsyncOpenAI
from tenacity import RetryCallState

from fraudcrawler.base.base import ProductItem, Prompt, ClassificationResult
from fraudcrawler.base.retry import get_async_retry
from fraudcrawler.settings import (
    PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
    PROCESSOR_USER_PROMPT_TEMPLATE,
    PROCESSOR_DEFAULT_IF_MISSING,
    PROCESSOR_EMPTY_TOKEN_COUNT,
)


logger = logging.getLogger(__name__)


class Processor:
    """Processes product data for classification based on a prompt configuration."""

    def __init__(
        self,
        http_client: httpx.AsyncClient,
        api_key: str,
        model: str,
        default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
        empty_token_count: int = PROCESSOR_EMPTY_TOKEN_COUNT,
    ):
        """Initializes the Processor.

        Args:
            http_client: An httpx.AsyncClient to use for the async requests.
            api_key: The OpenAI API key.
            model: The OpenAI model to use.
            default_if_missing: The default classification to return if an error occurs.
            empty_token_count: The default value to return as tokens if the classification is empty.
        """
        self._client = AsyncOpenAI(http_client=http_client, api_key=api_key)
        self._model = model
        self._error_response = ClassificationResult(
            result=default_if_missing,
            input_tokens=empty_token_count,
            output_tokens=empty_token_count,
        )

    @staticmethod
    def _get_product_details(product: ProductItem, prompt: Prompt) -> str:
        """Extracts product details based on the prompt configuration.

        Args:
            product: The product item to extract details from.
            prompt: The prompt configuration containing field names.
        """
        details = []
        for field in prompt.product_item_fields:
            if value := getattr(product, field, None):
                details.append(
                    PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
                        field_name=field, field_value=value
                    )
                )
            else:
                logger.warning(
                    f'Field "{field}" is missing in ProductItem with url="{product.url}"'
                )
        return "\n\n".join(details)

    @staticmethod
    def _log_before(url: str, prompt: Prompt, retry_state: RetryCallState) -> None:
        """Context-aware logging before the request is made."""
        if retry_state:
            logger.debug(
                f"Classifying product with url={url} using prompt={prompt} (Attempt {retry_state.attempt_number})."
            )
        else:
            logger.debug(f"retry_state is {retry_state}; not logging before.")

    @staticmethod
    def _log_before_sleep(
        url: str, prompt: Prompt, retry_state: RetryCallState
    ) -> None:
        """Context-aware logging before sleeping after a failed request."""
        if retry_state and retry_state.outcome:
            logger.warning(
                f"Attempt {retry_state.attempt_number} of classifying product with url={url} using prompt={prompt} "
                f"failed with error: {retry_state.outcome.exception()}. "
                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
            )

    async def _call_openai_api(
        self,
        system_prompt: str,
        user_prompt: str,
        **kwargs,
    ) -> ClassificationResult:
        """Calls the OpenAI API with the given user prompt."""
        response = await self._client.chat.completions.create(
            model=self._model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            **kwargs,
        )
        if not response or not (content := response.choices[0].message.content):
            raise ValueError(
                f'Error calling OpenAI API or empty response="{response}".'
            )

        # Convert the content to an integer
        try:
            content = int(content.strip())
        except Exception as e:
            msg = f"Failed to convert OpenAI response '{content}' to integer: {e}"
            logger.error(msg)
            raise ValueError(msg)

        # For tracking consumption we also return the tokens used
        classification = ClassificationResult(
            result=content,
            input_tokens=response.usage.prompt_tokens,
            output_tokens=response.usage.completion_tokens,
        )

        return classification

    async def classify(
        self,
        product: ProductItem,
        prompt: Prompt,
    ) -> ClassificationResult:
        """A generic classification method that classifies a product based on a prompt object and returns
        the classification, input tokens, and output tokens.

        Args:
            product: The product item to classify.
            prompt: The prompt to use for classification.

        Note:
            This method returns `PROCESSOR_DEFAULT_IF_MISSING` if:
            - product_details is empty
            - an error occurs during the API call
            - the response isn't in allowed_classes.
        """
        url = product.url

        # Form the product details from the ProductItem
        product_details = self._get_product_details(product=product, prompt=prompt)
        if not product_details:
            logger.warning("Missing required product_details for classification.")
            return self._error_response

        # Prepare the user prompt
        user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
            product_details=product_details,
        )

        # Call the OpenAI API
        try:
            logger.debug(
                f"Classifying product with url={url} using prompt={prompt.name} and user_prompt={user_prompt}."
            )
            # Perform the request and retry if necessary. There is some context-aware logging:
            # - `before`: before the request is made (or before retrying)
            # - `before_sleep`: after a failed request, before sleeping
            retry = get_async_retry()
            retry.before = lambda retry_state: self._log_before(
                url=url, prompt=prompt, retry_state=retry_state
            )
            retry.before_sleep = lambda retry_state: self._log_before_sleep(
                url=url, prompt=prompt, retry_state=retry_state
            )
            async for attempt in retry:
                with attempt:
                    classification = await self._call_openai_api(
                        system_prompt=prompt.system_prompt,
                        user_prompt=user_prompt,
                        max_tokens=1,
                    )

            # Enforce that the classification is in the allowed classes
            if classification.result not in prompt.allowed_classes:
                logger.warning(
                    f"Classification '{classification.result}' not in allowed classes {prompt.allowed_classes}"
                )
                return self._error_response

            logger.info(
                f'Classification for url="{url}" (prompt={prompt.name}): {classification.result} and total tokens used: {classification.input_tokens + classification.output_tokens}'
            )
            return classification

        except Exception as e:
            logger.error(
                f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
            )
            return self._error_response
```
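For orientation, here is a minimal sketch of how this removed `Processor` was driven, matching the `__init__` and `classify` signatures above. The `ProductItem` and `Prompt` constructor arguments are illustrative assumptions; the real models in `fraudcrawler/base/base.py` may require additional fields.

```python
# Sketch only: drives the removed Processor shown above.
# ProductItem/Prompt arguments are assumed for illustration; check
# fraudcrawler/base/base.py for the actual required fields.
import asyncio

import httpx

from fraudcrawler.base.base import ProductItem, Prompt
from fraudcrawler.processing.processor import Processor


async def main() -> None:
    async with httpx.AsyncClient() as http_client:
        processor = Processor(
            http_client=http_client,
            api_key="sk-...",      # your OpenAI API key
            model="gpt-4o-mini",   # any chat-completions model
        )
        product = ProductItem(url="https://example.com/product")  # assumed fields
        prompt = Prompt(
            name="relevance",
            system_prompt="Respond only with the number 1 or 0.",
            allowed_classes=[0, 1],
        )
        result = await processor.classify(product=product, prompt=prompt)
        print(result.result, result.input_tokens, result.output_tokens)


asyncio.run(main())
```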
fraudcrawler-0.5.0.dist-info/METADATA (removed)

@@ -1,167 +0,0 @@
```text
Metadata-Version: 2.1
Name: fraudcrawler
Version: 0.5.0
Summary: Intelligent Market Monitoring
Home-page: https://github.com/open-veanu/fraudcrawler
License: MIT
Author: Domingo Bertus
Author-email: hello@veanu.ch
Requires-Python: >=3.11,<4.0
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
Requires-Dist: httpx (>=0.28.1,<0.29.0)
Requires-Dist: openai (>=1.68.2,<2.0.0)
Requires-Dist: pandas (>=2.2.3,<3.0.0)
Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
Requires-Dist: requests (>=2.32.3,<3.0.0)
Requires-Dist: tenacity (>=9.1.2,<10.0.0)
Project-URL: Repository, https://github.com/open-veanu/fraudcrawler
Description-Content-Type: text/markdown
```

The remainder of the file is the long description (the project README):

# open-veanu/fraudcrawler
Intelligent Market Monitoring

The pipeline for monitoring the market has the following main steps:
1. search for a given term using SerpAPI
2. get product information using ZyteAPI
3. assess relevance of the found products using the OpenAI API

## Installation
```bash
python3.11 -m venv .venv
source .venv/bin/activate
pip install fraudcrawler
```

## Usage
### `.env` file
Make sure to create a `.env` file with the necessary API keys and credentials (cf. the `.env.example` file).
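As an illustration (the variable names below are assumptions; `.env.example` is the authoritative list), the pipeline needs one credential per external service:

```python
# Illustration: the three external services the pipeline authenticates against.
# Variable names are assumptions; .env.example defines the real ones.
import os

serpapi_key = os.environ["SERPAPI_API_KEY"]  # step 1: search (SerpAPI)
zyte_key = os.environ["ZYTE_API_KEY"]        # step 2: product data (Zyte API)
openai_key = os.environ["OPENAI_API_KEY"]    # step 3: relevance (OpenAI)
```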
### Run demo pipeline
```bash
python -m fraudcrawler.launch_demo_pipeline
```

### Customize the pipeline
Start by initializing the client
```python
from fraudcrawler import FraudCrawlerClient

# Initialize the client
client = FraudCrawlerClient()
```

For setting up the search we need 5 main objects.

#### `search_term: str`
The search term for the query (similar to search terms used within major search providers).

#### `language: Language`
The language used in SerpAPI (the 'hl' parameter), as well as for the optional search term enrichment (e.g. finding similar and related search terms). `language=Language('German')` creates an object with a language name and a language code: `Language(name='German', code='de')`.

#### `location: Location`
The location used in SerpAPI (the 'gl' parameter). `location=Location('Switzerland')` creates an object with a location name and a location code: `Location(name='Switzerland', code='ch')`.

#### `deepness: Deepness`
Defines the search depth with the number of results to retrieve and optional enrichment parameters.

#### `prompts: List[Prompt]`
The list of prompts to classify a given product with (multiple) LLM calls. Each prompt object has a `name`, a `context` (used for defining the user prompt), a `system_prompt` (for defining the classification task), and `allowed_classes` (a list of possible classes).

```python
from fraudcrawler import Language, Location, Deepness, Prompt

# Setup the search
search_term = "sildenafil"
language = Language(name="German")
location = Location(name="Switzerland")
deepness = Deepness(num_results=50)
prompts = [
    Prompt(
        name="relevance",
        system_prompt=(
            "You are a helpful and intelligent assistant. Your task is to classify any given product "
            "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
            "You must consider all aspects of the given context and make a binary decision accordingly. "
            "If the product aligns with the user's needs, classify it as 1 (relevant); otherwise, classify it as 0 (not relevant). "
            "Respond only with the number 1 or 0."
        ),
        allowed_classes=[0, 1],
    )
]
```

(Optional) Add search term enrichment. This will find related search terms (in a given language) and search for these as well.
```python
from fraudcrawler import Enrichment

deepness.enrichment = Enrichment(
    additional_terms=5,
    additional_urls_per_term=10
)
```

(Optional) Add marketplaces where we explicitly want to look (this focuses your search, much like the `site:` parameter of a Google search)
```python
from fraudcrawler import Host

marketplaces = [
    Host(name="International", domains="zavamed.com,apomeds.com"),
    Host(name="National", domains="netdoktor.ch, nobelpharma.ch"),
]
```

(Optional) Exclude URLs (where you don't want to find products)
```python
excluded_urls = [
    Host(name="Compendium", domains="compendium.ch"),
]
```

(Optional) Exclude previously collected URLs (this is intended to save credits)
```python
previously_collected_urls = [
    "https://pharmaciedelabateliere.ch/shop/sante/douleurs-inflammations/dafalgan-cpr-eff-500-mg-16-pce/",
    "https://eiche.ch/product/schmerzmittel-52cd81d5d206a/dafalgan-brausetabletten-1336653",
]
```

And finally, run the pipeline
```python
# Execute the pipeline
client.execute(
    search_term=search_term,
    language=language,
    location=location,
    deepness=deepness,
    prompts=prompts,
    # marketplaces=marketplaces,  # Uncomment this to use marketplaces
    # excluded_urls=excluded_urls,  # Uncomment this to use excluded_urls
    # previously_collected_urls=previously_collected_urls,  # Uncomment this to use previously_collected_urls
)
```
This creates a file with the name pattern `<search_term>_<language.code>_<location.code>_<datetime[%Y%m%d%H%M%S]>.csv` (e.g. `sildenafil_de_ch_20250101120000.csv` for the setup above) inside the folder `data/results/`.

Once the pipeline has terminated, the results can be loaded and examined as follows:
```python
df = client.load_results()
print(df.head(n=10))
```

If the client has been used to run multiple pipelines, an overview of the available results (for a given instance of `FraudCrawlerClient`) can be obtained with
```python
client.print_available_results()
```

## Contributing
See `CONTRIBUTING.md`.

### Async Setup
The `Orchestrator` class in `fraudcrawler/base/orchestrator.py` is designed to coordinate multiple services that may have interdependencies, allowing them to run in a semi-iterative manner. This means, for example, that product A can be at stage III of the pipeline while product B is still at stage I.

This behavior is enabled through an asynchronous pipeline setup. The three main steps, `SerpAPI`, `ZyteAPI`, and `Processor`, all utilize `httpx.AsyncClient`. It is both possible and highly recommended to manage a single AsyncClient instance per application for efficiency. We provide a `HttpxAsyncClient` class that you can pass in. For more details, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
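A minimal sketch of that recommendation, with hypothetical stand-ins for the search and scraping clients (only `Processor`'s constructor is visible in this diff); the point is that every stage receives the same `httpx.AsyncClient` instead of opening its own:

```python
# Sketch: one shared httpx.AsyncClient per application.
# SerpApi and ZyteApi are hypothetical stand-ins for illustration.
import asyncio

import httpx


class SerpApi:
    def __init__(self, http_client: httpx.AsyncClient) -> None:
        self._client = http_client  # reuses the shared connection pool


class ZyteApi:
    def __init__(self, http_client: httpx.AsyncClient) -> None:
        self._client = http_client  # reuses the shared connection pool


async def main() -> None:
    async with httpx.AsyncClient(timeout=30.0) as http_client:
        # All stages share one client (and thus one connection pool),
        # so products can move through the stages concurrently.
        search = SerpApi(http_client=http_client)
        scrape = ZyteApi(http_client=http_client)
        print(search, scrape)


asyncio.run(main())
```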
The following image provides a schematic representation of the package's async setup.

fraudcrawler-0.5.0.dist-info/RECORD (removed)

@@ -1,22 +0,0 @@
```text
fraudcrawler/__init__.py,sha256=Kr19jWhtbC1shVoB9fHvBSeoG1IyQB9re1kCZ4YIAi0,842
fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
fraudcrawler/base/base.py,sha256=suQMnvLIsZO_R0eHZKDWS4u9qnd1ryzPhjGlwcaMD5A,7295
fraudcrawler/base/client.py,sha256=yhkNrhL2SuJXTknLf-8P81fv01FnFMahREZgem-Z-f0,5832
fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
fraudcrawler/base/orchestrator.py,sha256=AKEETrYwKbMy_6YgTdgc6L-VA1iHYOtj3wIqEN3ngO4,26990
fraudcrawler/base/retry.py,sha256=9VyVrbYR_0YnfxFhUrvcM3aWCYR6oR4iZE4A3zzVZUs,1015
fraudcrawler/launch_demo_pipeline.py,sha256=j5lu8lLl8QrkVU1MJH25uKtyYk_6lBSeoouCo30aRXg,4634
fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
fraudcrawler/processing/processor.py,sha256=Qq8QcTlqfnzFi1t-1KkncXxaIszUO7pGK3LXTdHkDnM,7638
fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
fraudcrawler/scraping/search.py,sha256=nHMYaSkq9o6Hr4yUDEPguj8IHVcOpws3_XWiAbCVgLg,24062
fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
fraudcrawler/scraping/zyte.py,sha256=GqvVWA1AWVoClAwd-hQ9iynsT0dOb7R0ntaLK5XVivM,8340
fraudcrawler/settings.py,sha256=uwXMOQpuwyWkuMU0asYGtBlL_qJj8F-Xkg4dUaCmDxE,3670
fraudcrawler-0.5.0.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
fraudcrawler-0.5.0.dist-info/METADATA,sha256=H9aq_euzQMD8Ag3gbo3GIrfC4eVl-gGahD_DieQ1oow,6642
fraudcrawler-0.5.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
fraudcrawler-0.5.0.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
fraudcrawler-0.5.0.dist-info/RECORD,,
```
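Each RECORD row is `path,algorithm=digest,size`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing `=` padding stripped (per the wheel spec). A quick way to recompute one entry from an unpacked wheel:

```python
# Recompute a RECORD entry's digest: sha256, urlsafe base64, '=' stripped.
import base64
import hashlib
from pathlib import Path


def record_digest(path: str) -> str:
    data = Path(path).read_bytes()
    b64 = base64.urlsafe_b64encode(hashlib.sha256(data).digest())
    return "sha256=" + b64.rstrip(b"=").decode("ascii")


print(record_digest("fraudcrawler/settings.py"))  # compare with the row above
```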
{fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/LICENSE: file without changes
{fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/WHEEL: file without changes
{fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/entry_points.txt: file without changes