fraudcrawler 0.5.9__py3-none-any.whl → 0.6.0__py3-none-any.whl
- fraudcrawler/__init__.py +2 -2
- fraudcrawler/base/base.py +3 -32
- fraudcrawler/base/client.py +1 -1
- fraudcrawler/base/orchestrator.py +135 -135
- fraudcrawler/base/retry.py +12 -6
- fraudcrawler/processing/processor.py +3 -3
- fraudcrawler/scraping/search.py +274 -69
- fraudcrawler/scraping/url.py +42 -3
- fraudcrawler/scraping/zyte.py +15 -1
- fraudcrawler/settings.py +13 -3
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.0.dist-info}/METADATA +4 -3
- fraudcrawler-0.6.0.dist-info/RECORD +22 -0
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.0.dist-info}/WHEEL +1 -1
- fraudcrawler-0.5.9.dist-info/RECORD +0 -22
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.0.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.0.dist-info}/entry_points.txt +0 -0
fraudcrawler/processing/processor.py
CHANGED

@@ -72,7 +72,7 @@ class Processor:
         """Context aware logging before the request is made."""
         if retry_state:
             logger.debug(
-                f"Classifying product with url={url} using prompt={prompt} (Attempt {retry_state.attempt_number})."
+                f"Classifying product with url={url} using prompt={prompt.name} (Attempt {retry_state.attempt_number})."
             )
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before.")
@@ -84,7 +84,7 @@ class Processor:
         """Context aware logging before sleeping after a failed request."""
         if retry_state and retry_state.outcome:
             logger.warning(
-                f"Attempt {retry_state.attempt_number} of classifying product with url={url} using prompt={prompt} "
+                f"Attempt {retry_state.attempt_number} of classifying product with url={url} using prompt={prompt.name} "
                 f"failed with error: {retry_state.outcome.exception()}. "
                 f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
             )
@@ -160,7 +160,7 @@ class Processor:
         # Call the OpenAI API
         try:
             logger.debug(
-                f"Classifying product with url={url} using prompt={prompt.name}."
+                f"Classifying product with url={url}, using prompt={prompt.name}."
             )
             # Perform the request and retry if necessary. There is some context aware logging
             # - `before`: before the request is made (or before retrying)
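Aside: the `prompt.name` fixes above land inside tenacity's context-aware logging hooks. A minimal sketch of that hook pattern, assuming `get_async_retry` builds an `AsyncRetrying` roughly like this (its actual defaults live in fraudcrawler/base/retry.py and are not part of these hunks):

    import httpx
    from tenacity import AsyncRetrying, RetryCallState, stop_after_attempt, wait_exponential

    def log_before(url: str, retry_state: RetryCallState | None) -> None:
        # Context-aware logging per attempt, as in the hunks above.
        if retry_state:
            print(f"Requesting {url} (attempt {retry_state.attempt_number}).")

    async def fetch(client: httpx.AsyncClient, url: str) -> bytes:
        retry = AsyncRetrying(stop=stop_after_attempt(3), wait=wait_exponential(exp_base=4))
        retry.before = lambda rs: log_before(url, rs)  # late-bind the url into the hook
        async for attempt in retry:
            with attempt:
                response = await client.get(url)
                response.raise_for_status()
        return response.content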
fraudcrawler/scraping/search.py
CHANGED
@@ -6,12 +6,18 @@ from typing import Dict, List
 from urllib.parse import quote_plus
 
 from bs4 import BeautifulSoup
+from bs4.element import Tag
 import httpx
-from tenacity import RetryCallState
+from tenacity import RetryCallState, AsyncRetrying
 
-from fraudcrawler.settings import
+from fraudcrawler.settings import (
+    SEARCH_DEFAULT_COUNTRY_CODES,
+    TOPPREISE_SEARCH_PATHS,
+    TOPPREISE_COMPARISON_PATHS,
+)
 from fraudcrawler.base.base import Host, Language, Location, DomainUtils
 from fraudcrawler.base.retry import get_async_retry
+from fraudcrawler.scraping.zyte import ZyteAPI
 
 logger = logging.getLogger(__name__)
 
@@ -380,7 +386,7 @@ class SerpAPIGoogleShopping(SerpAPI):
 class Toppreise(SearchEngine):
     """Search engine for toppreise.ch."""
 
-    _endpoint = "https://www.toppreise.ch/
+    _endpoint = "https://www.toppreise.ch/"
     _headers = {
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
@@ -390,28 +396,42 @@
         "Upgrade-Insecure-Requests": "1",
     }
 
-    def __init__(self, http_client: httpx.AsyncClient,
+    def __init__(self, http_client: httpx.AsyncClient, zyteapi_key: str):
         """Initializes the Toppreise client.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
-
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
         self._http_client = http_client
-        self.
+        self._zyteapi = ZyteAPI(http_client=http_client, api_key=zyteapi_key)
 
-    @
-    def
-    """
-
+    @classmethod
+    def _get_search_endpoint(cls, language: Language) -> str:
+        """Get the search endpoint based on the language."""
+        search_path = TOPPREISE_SEARCH_PATHS.get(
+            language.code, TOPPREISE_SEARCH_PATHS["default"]
+        )
+        return f"{cls._endpoint}{search_path}"
 
     @staticmethod
-    def
-
+    def _extract_links(
+        element: Tag, ext_products: bool = True, comp_products: bool = True
+    ) -> List[str]:
+        """Extracts all relevant product URLs from a BeautifulSoup object of a Toppreise page.
 
-
-
-
+        Note:
+            Depending on the arguments, it extracts:
+            - product comparison URLs (i.e. https://www.toppreise.ch/preisvergleich/...)
+            - external product URLs (i.e. https://www.example.com/ext_...).
+
+        Args:
+            tag: BeautifulSoup Tag object containing the HTML to parse.
+            ext_products: Whether to extract external product URLs.
+            comp_products: Whether to extract product comparison URLs.
+        """
+        # Find all links in the page
+        links = element.find_all("a", href=True)
 
         # Filter links to only include external product links
         hrefs = [
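For illustration, the language-to-endpoint resolution above is a plain dict lookup with a fallback; a self-contained sketch using the values added to settings.py further down:

    TOPPREISE_SEARCH_PATHS = {"de": "produktsuche", "fr": "chercher", "default": "browse"}

    def resolve(language_code: str) -> str:
        # Same lookup as _get_search_endpoint above.
        path = TOPPREISE_SEARCH_PATHS.get(language_code, TOPPREISE_SEARCH_PATHS["default"])
        return f"https://www.toppreise.ch/{path}"

    assert resolve("de") == "https://www.toppreise.ch/produktsuche"
    assert resolve("it") == "https://www.toppreise.ch/browse"  # unknown code -> default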
@@ -422,7 +442,15 @@
             and (href := link.get("href"))  # Ensure href is not None
             and not href.startswith("javascript:")  # Skip javascript links
             and isinstance(href, str)  # Ensure href is a string
-
+            # Make sure the link is either an external product link (href contains 'ext_')
+            # or is a search result link (href contains 'preisvergleich', 'comparison-prix', or 'price-comparison')
+            and (
+                ("ext_" in href and ext_products)
+                or (
+                    any(pth in href for pth in TOPPREISE_COMPARISON_PATHS)
+                    and comp_products
+                )
+            )
         )
     ]
@@ -437,21 +465,100 @@
 
         # Return deduplicated urls
         urls = list(set(urls))
+        return urls
+
+    def _extract_product_urls_from_search_page(self, content: bytes) -> List[str]:
+        """Extracts product urls from a Toppreise search page (i.e. https://www.toppreise.ch/produktsuche)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+        main = soup.find("div", id="Page_Browsing")
+        if not isinstance(main, Tag):
+            logger.warning("No main content found in Toppreise search page.")
+            return []
+
+        # Extract links (external product links and comparison links)
+        urls = self._extract_links(element=main)
+
+        logger.debug(f"Found {len(urls)} product URLs from Toppreise search results.")
+        return urls
+
+    def _extract_product_urls_from_comparison_page(self, content: bytes) -> List[str]:
+        """Extracts product urls from a Toppreise product comparison page (i.e. https://www.toppreise.ch/preisvergleich/...)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+
+        # Extract links (external product links only)
+        urls = self._extract_links(element=soup, comp_products=False)
+
         logger.debug(
-            f"Found {len(urls)} external product URLs from Toppreise
+            f"Found {len(urls)} external product URLs from Toppreise comparison page."
         )
         return urls
 
-
+    @property
+    def _search_engine_name(self) -> str:
+        """The name of the search engine."""
+        return SearchEngineName.TOPPREISE.value
+
+    async def http_client_get_with_fallback(
+        self, url: str, retry: AsyncRetrying
+    ) -> bytes:
+        """Performs a GET request with retries.
+
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
+        Args:
+            url: The URL to request.
+            retry: The retry strategy to use.
+        """
+        # Try to access the URL directly
+        try:
+            async for attempt in retry:
+                with attempt:
+                    response = await self._http_client.get(
+                        url=url,
+                        headers=self._headers,
+                    )
+                    response.raise_for_status()
+                    content = response.content
+
+        # If we get a 403 Error (can happen depending on IP/location of deployment),
+        # we try to unblock the URL using Zyte proxy mode
+        except httpx.HTTPStatusError as err_direct:
+            if err_direct.response.status_code == 403:
+                logger.warning(
+                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
+                )
+                try:
+                    content = await self._zyteapi.unblock_url_content(url)
+                except Exception as err_resolve:
+                    msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
+                    logger.error(msg)
+                    raise httpx.HTTPError(msg) from err_resolve
+            else:
+                raise err_direct
+        return content
+
+    async def _search(
+        self, search_string: str, language: Language, num_results: int
+    ) -> List[str]:
         """Performs a search on Toppreise and returns the URLs of the results.
 
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
         Args:
             search_string: The search string to use for the query.
+            language: The language to use for the query.
             num_results: Max number of results to return.
         """
         # Build the search URL for Toppreise
+        endpoint = self._get_search_endpoint(language=language)
         encoded_search = quote_plus(search_string)
-        url = f"{
+        url = f"{endpoint}?q={encoded_search}"
         logger.debug(f"Toppreise search URL: {url}")
 
         # Perform the request and retry if necessary. There is some context aware logging:
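The essence of the new fallback is "direct GET first, Zyte proxy only on 403". A stripped-down sketch (not the shipped code; `unblock` stands in for `ZyteAPI.unblock_url_content`, and retries are omitted):

    import httpx

    async def get_with_fallback(client: httpx.AsyncClient, url: str, unblock) -> bytes:
        try:
            response = await client.get(url)
            response.raise_for_status()
            return response.content
        except httpx.HTTPStatusError as err:
            if err.response.status_code == 403:
                # Blocked by IP/location of the deployment: fetch via the proxy instead.
                return await unblock(url)
            raise  # any other status propagates unchanged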
@@ -464,33 +571,10 @@
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
             search_string=search_string, retry_state=retry_state
         )
-
-        content = None
-        try:
-            async for attempt in retry:
-                with attempt:
-                    response = await self._http_client.get(
-                        url=url,
-                        headers=self._headers,
-                    )
-                    response.raise_for_status()
-                    content = response.content
-        except httpx.HTTPStatusError as e:
-            if e.response.status_code == 403 and self._zyte_api:
-                logger.warning(
-                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
-                )
-                content = await self._unblock_url(url, self._zyte_api)
-                if content is None:
-                    raise e  # Re-raise if zyte fallback also failed
-            else:
-                raise e
-
-        if content is None:
-            raise httpx.HTTPError("Failed to fetch content")
+        content = await self.http_client_get_with_fallback(url=url, retry=retry)
 
         # Get external product urls from the content
-        urls = self.
+        urls = self._extract_product_urls_from_search_page(content=content)
         urls = urls[:num_results]  # Limit to num_results if needed
 
         return urls
@@ -498,17 +582,20 @@
     async def search(
         self,
         search_term: str,
+        language: Language,
         num_results: int,
     ) -> List[SearchResult]:
         """Performs a Toppreise search and returns SearchResults.
 
         Args:
             search_term: The search term to use for the query.
+            language: The language to use for the search.
             num_results: Max number of results to return.
         """
         # Perform the search
         urls = await self._search(
             search_string=search_term,
+            language=language,
             num_results=num_results,
         )
 
@@ -520,22 +607,124 @@
         return results
 
 
-class
+class Searcher(DomainUtils):
     """Class to perform searches using different search engines."""
 
-
+    _post_search_retry_stop_after = 3
+
+    def __init__(
+        self, http_client: httpx.AsyncClient, serpapi_key: str, zyteapi_key: str
+    ):
         """Initializes the Search class with the given SerpAPI key.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
             serpapi_key: The API key for SERP API.
-
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
+        self._http_client = http_client
         self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
         self._google_shopping = SerpAPIGoogleShopping(
-            http_client=http_client,
+            http_client=http_client,
+            api_key=serpapi_key,
+        )
+        self._toppreise = Toppreise(
+            http_client=http_client,
+            zyteapi_key=zyteapi_key,
         )
-
+
+    @staticmethod
+    def _post_search_log_before(url: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f'Performing post search for url="{url}" '
+                f"(attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+    @staticmethod
+    def _post_search_log_before_sleep(
+        url: str, retry_state: RetryCallState | None
+    ) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f'Attempt {retry_state.attempt_number} of post search for url="{url}" '
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
+
+    async def _post_search_toppreise_comparison(self, url: str) -> List[str]:
+        """Post-search for product URLs from a Toppreise product comparison page.
+
+        Note:
+            In comparison to the function Toppreise._search, here we extract the urls from
+            product comparison pages (f.e. https://www.toppreise.ch/preisvergleich/). They can
+            also be found in the results of a google search.
+
+        Args:
+            url: The URL of the Toppreise product listing page.
+        """
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry(stop_after=self._post_search_retry_stop_after)
+        retry.before = lambda retry_state: self._post_search_log_before(
+            url=url, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._post_search_log_before_sleep(
+            url=url, retry_state=retry_state
+        )
+        content = await self._toppreise.http_client_get_with_fallback(
+            url=url, retry=retry
+        )
+
+        # Get external product urls from the content
+        urls = self._toppreise._extract_product_urls_from_comparison_page(
+            content=content
+        )
+
+        return urls
+
+    async def _post_search(self, results: List[SearchResult]) -> List[SearchResult]:
+        """Post-search for additional embedded product URLs from the obtained results.
+
+        Note:
+            This function is used to extract embedded product URLs from
+            product listing pages (e.g. Toppreise, Google Shopping) if needed.
+
+        Args:
+            results: The list of SearchResult objects obtained from the search.
+        """
+        post_search_results: List[SearchResult] = []
+        for res in results:
+            url = res.url
+
+            # Extract embedded product URLs from the Toppreise product listing page
+            if any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
+                logger.debug(
+                    f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                )
+                post_search_urls = await self._post_search_toppreise_comparison(url=url)
+                logger.debug(
+                    f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                )
+
+                psr = [
+                    SearchResult(
+                        url=psu,
+                        domain=self._get_domain(url=psu),
+                        search_engine_name=res.search_engine_name,
+                    )
+                    for psu in post_search_urls
+                ]
+                post_search_results.extend(psr)
+
+        return post_search_results
 
     @staticmethod
     def _domain_in_host(domain: str, host: Host) -> bool:
@@ -625,63 +814,77 @@ class Search(DomainUtils):
     async def apply(
         self,
         search_term: str,
+        search_engine: SearchEngineName | str,
         language: Language,
         location: Location,
         num_results: int,
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
-        search_engines: List[SearchEngineName | str] | None = None,
     ) -> List[SearchResult]:
         """Performs a search and returns SearchResults.
 
         Args:
             search_term: The search term to use for the query.
+            search_engine: The search engine to use for the search.
             language: The language to use for the query ('hl' parameter).
             location: The location to use for the query ('gl' parameter).
             num_results: Max number of results per search engine.
             marketplaces: The marketplaces to include in the search.
             excluded_urls: The URLs to exclude from the search.
-            search_engines: The list of search engines to use for the search.
         """
-
-
-
-
-
-
-
-
+        logger.info(
+            f'Performing search for term="{search_term}" using engine="{search_engine}".'
+        )
+
+        # -------------------------------
+        # SEARCH
+        # -------------------------------
+        # Map string to SearchEngineName if needed
+        if isinstance(search_engine, str):
+            search_engine = SearchEngineName(search_engine)
 
         # Make SerpAPI google search
-        if SearchEngineName.GOOGLE
-
+        if search_engine == SearchEngineName.GOOGLE:
+            results = await self._google.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)
 
         # Make SerpAPI google shopping search
-
-
+        elif search_engine == SearchEngineName.GOOGLE_SHOPPING:
+            results = await self._google_shopping.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)
 
         # Make Toppreise search
-
-
+        elif search_engine == SearchEngineName.TOPPREISE:
+            results = await self._toppreise.search(
                 search_term=search_term,
+                language=language,
                 num_results=num_results,
             )
-            results.extend(res)
 
+        # Other search engines can be added here (raise unknown engine error otherwise)
+        else:
+            raise ValueError(f"Unknown search engine: {search_engine}")
+
+        # -------------------------------
+        # POST-SEARCH URL EXTRACTION
+        # -------------------------------
+        post_search_results = await self._post_search(results=results)
+        post_search_results = post_search_results[:num_results]
+        results.extend(post_search_results)
+
+        # -------------------------------
+        # FILTERS
+        # -------------------------------
         # Apply filters
         results = [
             self._apply_filters(
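A hypothetical end-to-end call of the reworked `apply` (only the keyword names come from the diff; the `Language`/`Location` constructors and the key placeholders are assumptions):

    import asyncio
    import httpx

    async def main() -> None:
        async with httpx.AsyncClient() as client:
            searcher = Searcher(http_client=client, serpapi_key="...", zyteapi_key="...")
            results = await searcher.apply(
                search_term="nicotine pouches",
                search_engine=SearchEngineName.TOPPREISE,
                language=Language("de"),   # assumed constructor
                location=Location("ch"),   # assumed constructor
                num_results=10,
            )
            print(f"{len(results)} results")  # includes post-search comparison hits

    asyncio.run(main())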
@@ -693,5 +896,7 @@ class Search(DomainUtils):
             for res in results
         ]
 
-        logger.
+        logger.info(
+            f'Search for term="{search_term}" using engine="{search_engine}" produced {len(results)} results.'
+        )
         return results
fraudcrawler/scraping/url.py
CHANGED
@@ -3,6 +3,7 @@ from typing import List, Set, Tuple
 from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
 
 from fraudcrawler.settings import KNOWN_TRACKERS
+from fraudcrawler.base.base import ProductItem
 
 logger = logging.getLogger(__name__)
 
@@ -11,11 +12,19 @@ class URLCollector:
     """A class to collect and de-duplicate URLs."""
 
     def __init__(self):
-        self.
-        self.
+        self._collected_currently: Set[str] = set()
+        self._collected_previously: Set[str] = set()
+
+    def add_previously_collected_urls(self, urls: List[str]) -> None:
+        """Add a set of previously collected URLs to the internal state.
+
+        Args:
+            urls: A set of URLs that have been collected in previous runs.
+        """
+        self._collected_previously.update(urls)
 
     @staticmethod
-    def
+    def _remove_tracking_parameters(url: str) -> str:
         """Remove tracking parameters from URLs.
 
         Args:
@@ -55,3 +64,33 @@ class URLCollector:
             fragment=parsed_url.fragment,
         )
         return urlunparse(clean_url)
+
+    async def apply(self, product: ProductItem) -> ProductItem:
+        """Manages the collection and deduplication of ProductItems.
+
+        Args:
+            product: The product item to process.
+        """
+        logger.debug(f'Processing product with url="{product.url}"')
+
+        # Remove tracking parameters from the URL
+        url = self._remove_tracking_parameters(product.url)
+        product.url = url
+
+        # deduplicate on current run
+        if url in self._collected_currently:
+            product.filtered = True
+            product.filtered_at_stage = "URL collection (current run deduplication)"
+            logger.debug(f"URL {url} already collected in current run")
+
+        # deduplicate on previous runs coming from a db
+        elif url in self._collected_previously:
+            product.filtered = True
+            product.filtered_at_stage = "URL collection (previous run deduplication)"
+            logger.debug(f"URL {url} as already collected in previous run")
+
+        # Add to currently collected URLs
+        else:
+            self._collected_currently.add(url)
+
+        return product
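A short usage sketch of the collector (assuming `ProductItem` can be constructed from a url and that `utm_source` is listed in KNOWN_TRACKERS):

    import asyncio

    async def main() -> None:
        collector = URLCollector()
        collector.add_previously_collected_urls(["https://shop.example/p/1"])

        # Tracking parameters are stripped before deduplication, so this URL
        # collides with the one collected in a previous run.
        item = ProductItem(url="https://shop.example/p/1?utm_source=news")  # assumed ctor
        item = await collector.apply(item)
        print(item.filtered, item.filtered_at_stage)
        # expected: True "URL collection (previous run deduplication)"

    asyncio.run(main())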
fraudcrawler/scraping/zyte.py
CHANGED
@@ -1,6 +1,6 @@
+from base64 import b64decode
 import logging
 from typing import List
-from base64 import b64decode
 
 import httpx
 from tenacity import RetryCallState
@@ -242,3 +242,17 @@ class ZyteAPI(DomainUtils):
             decoded_string = decoded_bytes.decode("utf-8")
             return decoded_string
         return None
+
+    async def unblock_url_content(self, url: str) -> bytes:
+        """Unblock the content of an URL using Zyte proxy mode.
+
+        Args:
+            url: The URL to fetch using Zyte proxy mode.
+        """
+        logger.debug(f'Unblock URL content using Zyte proxy for url="{url}"')
+        details = await self.details(url)
+
+        if not details or "httpResponseBody" not in details:
+            raise httpx.HTTPError("No httpResponseBody in Zyte response")
+
+        return b64decode(details["httpResponseBody"])
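Zyte delivers `httpResponseBody` base64-encoded, hence the `b64decode`; a minimal round trip showing just the decode step:

    from base64 import b64decode, b64encode

    details = {"httpResponseBody": b64encode(b"<html>...</html>").decode("ascii")}
    assert b64decode(details["httpResponseBody"]) == b"<html>...</html>"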
fraudcrawler/settings.py
CHANGED
@@ -14,12 +14,22 @@ RETRY_EXP_BASE = 4
 RETRY_JITTER = 1
 RETRY_SKIP_IF_CODE = [400, 401, 403]  # Skip retrying on these HTTP status codes
 
-#
+# Search settings
 GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
 GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
 SEARCH_DEFAULT_COUNTRY_CODES: List[str] = [
     # ".com",
 ]
+TOPPREISE_SEARCH_PATHS = {
+    "de": "produktsuche",
+    "fr": "chercher",
+    "default": "browse",
+}
+TOPPREISE_COMPARISON_PATHS = [
+    "preisvergleich",
+    "comparison-prix",
+    "price-comparison",
+]
 
 # URL De-duplication settings
 KNOWN_TRACKERS = [
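These markers are matched by plain substring tests in search.py (both in `_extract_links` and `_post_search`), e.g.:

    TOPPREISE_COMPARISON_PATHS = ["preisvergleich", "comparison-prix", "price-comparison"]

    url = "https://www.toppreise.ch/preisvergleich/Beispiel-Produkt"
    assert any(pth in url for pth in TOPPREISE_COMPARISON_PATHS)  # triggers post-search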
@@ -76,8 +86,8 @@ PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevan
 PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
 
 # Async workers settings
-
-
+DEFAULT_N_SRCH_WKRS = 5
+DEFAULT_N_CNTX_WKRS = 15
 DEFAULT_N_PROC_WKRS = 10
 
 # HTTPX client settings
{fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.5.9
+Version: 0.6.0
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
@@ -11,6 +11,7 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
 Requires-Dist: httpx (>=0.28.1,<0.29.0)
 Requires-Dist: openai (>=1.68.2,<2.0.0)
@@ -160,7 +161,7 @@ see `CONTRIBUTING.md`
 ### Async Setup
 The `Orchestrator` class in `src/base/orchestrator.py` is designed to coordinate multiple services that may have interdependencies, allowing them to run in a semi-iterative manner. This means, for example, that product A can be at stage III of the pipeline while product B is still at stage I.
 
-This behavior is enabled through an asynchronous pipeline setup. The three main steps, `
+This behavior is enabled through an asynchronous pipeline setup. The three main steps, `Search`, `Context Extraction`, and `Processing`, all utilize `httpx.AsyncClient`. It is both possible and highly recommended to manage a single AsyncClient instance per application for efficiency. We provide a `HttpxAsyncClient` class that you can pass in. For more details, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
 
 The following image provides a schematic representation of the package's async setup.
 
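A sketch of the recommended single-client setup (plain `httpx.AsyncClient` is shown because the `HttpxAsyncClient` wrapper's interface is not part of this diff):

    import asyncio
    import httpx

    async def main() -> None:
        # One shared AsyncClient: all three pipeline stages reuse its connection pool.
        async with httpx.AsyncClient() as client:
            searcher = Searcher(http_client=client, serpapi_key="...", zyteapi_key="...")
            # ...hand the same `client` to the context-extraction and processing stages

    asyncio.run(main())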