PyPI - arcade-brightdata - Versions diffs - 0.2.0__py3-none-any.whl - Mend

arcade-brightdata 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

arcade_brightdata/__init__.py +3 -0
arcade_brightdata/bright_data_client.py +63 -0
arcade_brightdata/tools/__init__.py +7 -0
arcade_brightdata/tools/bright_data_tools.py +312 -0
arcade_brightdata-0.2.0.dist-info/METADATA +21 -0
arcade_brightdata-0.2.0.dist-info/RECORD +9 -0
arcade_brightdata-0.2.0.dist-info/WHEEL +4 -0
arcade_brightdata-0.2.0.dist-info/entry_points.txt +2 -0
arcade_brightdata-0.2.0.dist-info/licenses/LICENSE +21 -0

arcade_brightdata/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from arcade_brightdata.tools import scrape_as_markdown, search_engine, web_data_feed
+# Public names of the package: the three tool functions imported from arcade_brightdata.tools.
+__all__ = ["scrape_as_markdown", "search_engine", "web_data_feed"]

arcade_brightdata/bright_data_client.py ADDED Viewed

@@ -0,0 +1,63 @@
+import json
+from typing import ClassVar
+from urllib.parse import quote
+import requests
+class BrightDataClient:
+    """
+    Client for the Bright Data request API with per-API-key instance caching.
+    Each instance stores the bearer-token headers, a zone name and the request
+    endpoint. Requests are sent with ``requests.post`` directly; no HTTP
+    session is kept between calls.
+    """
+    # Cache of client instances keyed by API key; filled by create_client().
+    _clients: ClassVar[dict[str, "BrightDataClient"]] = {}
+    def __init__(self, api_key: str, zone: str = "web_unlocker1") -> None:
+        """
+        Initialize with API token and default zone.
+        Args:
+            api_key (str): Your Bright Data API token
+            zone (str): Bright Data zone name
+        """
+        self.api_key = api_key
+        # Headers for API calls: JSON content type plus bearer-token authorization.
+        self.headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}",
+        }
+        self.zone = zone
+        self.endpoint = "https://api.brightdata.com/request"
+    @classmethod
+    def create_client(cls, api_key: str, zone: str = "web_unlocker1") -> "BrightDataClient":
+        """
+        Create or get the cached client instance for an API key.
+        The cache is keyed by ``api_key`` only, so every call overwrites the
+        cached instance's ``zone`` with the value passed here (including the
+        default when ``zone`` is omitted).
+        Args:
+            api_key (str): Bright Data API token; also the cache key
+            zone (str): Zone name to store on the returned client
+        Returns:
+            BrightDataClient: The shared instance for ``api_key``
+        """
+        if api_key not in cls._clients:
+            cls._clients[api_key] = cls(api_key, zone)
+        # Update zone for this request (user controls zone per request)
+        # NOTE(review): the instance is shared, so this assignment is visible to every
+        # holder of the same client; make_request() itself does not read self.zone.
+        client = cls._clients[api_key]
+        client.zone = zone
+        return client
+    @classmethod
+    def clear_cache(cls) -> None:
+        """Clear the client cache so later create_client() calls build new instances."""
+        cls._clients.clear()
+    def make_request(self, payload: dict) -> str:
+        """
+        Make a request to Bright Data API.
+        POSTs the JSON-encoded payload to ``self.endpoint`` using ``self.headers``
+        and ``timeout=30``.
+        Args:
+            payload (dict): Request payload
+        Returns:
+            str: Response text
+        Raises:
+            requests.HTTPError: If ``raise_for_status`` rejects the response status
+        """
+        response = requests.post(
+            self.endpoint, headers=self.headers, data=json.dumps(payload), timeout=30
+        )
+        response.raise_for_status()
+        result: str = response.text
+        return result
+    @staticmethod
+    def encode_query(query: str) -> str:
+        """URL encode a search query using ``urllib.parse.quote`` with its default settings."""
+        return quote(query)

arcade_brightdata/tools/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+from arcade_brightdata.tools.bright_data_tools import (
+    scrape_as_markdown,
+    search_engine,
+    web_data_feed,
+)
+# Public names of the tools subpackage: the three tool functions imported from bright_data_tools.
+__all__ = ["scrape_as_markdown", "search_engine", "web_data_feed"]

arcade_brightdata/tools/bright_data_tools.py ADDED Viewed

@@ -0,0 +1,312 @@
+import json
+import time
+from enum import Enum
+from typing import Annotated, Any, cast
+import requests
+from arcade_core.errors import RetryableToolError
+from arcade_tdk import ToolContext, tool
+from arcade_brightdata.bright_data_client import BrightDataClient
+class DeviceType(str, Enum):
+    """Device types accepted by the ``device`` parameter of ``search_engine``."""
+    MOBILE = "mobile"
+    IOS = "ios"
+    IPHONE = "iphone"
+    IPAD = "ipad"
+    ANDROID = "android"
+    ANDROID_TABLET = "android_tablet"
+class SearchEngine(str, Enum):
+    """Search engines selectable through the ``engine`` parameter of ``search_engine``."""
+    GOOGLE = "google"
+    BING = "bing"
+    YANDEX = "yandex"
+class SearchType(str, Enum):
+    """
+    Search types accepted by the ``search_type`` parameter of ``search_engine``.
+    ``search_engine`` translates these into Google URL parameters
+    (``tbm=...`` for images/shopping/news, ``ibp=htl;jobs`` for jobs).
+    """
+    IMAGES = "images"
+    SHOPPING = "shopping"
+    NEWS = "news"
+    JOBS = "jobs"
+class SourceType(str, Enum):
+    """
+    Data sources accepted by ``web_data_feed``.
+    Each member is mapped to a Bright Data dataset ID inside
+    ``_extract_structured_data``.
+    """
+    AMAZON_PRODUCT = "amazon_product"
+    AMAZON_PRODUCT_REVIEWS = "amazon_product_reviews"
+    LINKEDIN_PERSON_PROFILE = "linkedin_person_profile"
+    LINKEDIN_COMPANY_PROFILE = "linkedin_company_profile"
+    ZOOMINFO_COMPANY_PROFILE = "zoominfo_company_profile"
+    INSTAGRAM_PROFILES = "instagram_profiles"
+    INSTAGRAM_POSTS = "instagram_posts"
+    INSTAGRAM_REELS = "instagram_reels"
+    INSTAGRAM_COMMENTS = "instagram_comments"
+    FACEBOOK_POSTS = "facebook_posts"
+    FACEBOOK_MARKETPLACE_LISTINGS = "facebook_marketplace_listings"
+    FACEBOOK_COMPANY_REVIEWS = "facebook_company_reviews"
+    X_POSTS = "x_posts"
+    ZILLOW_PROPERTIES_LISTING = "zillow_properties_listing"
+    BOOKING_HOTEL_LISTINGS = "booking_hotel_listings"
+    YOUTUBE_VIDEOS = "youtube_videos"
+@tool(requires_secrets=["BRIGHTDATA_API_KEY", "BRIGHTDATA_ZONE"])
+def scrape_as_markdown(
+    context: ToolContext,
+    url: Annotated[str, "URL to scrape"],
+) -> Annotated[str, "Scraped webpage content as Markdown"]:
+    """
+    Scrape a webpage and return content in Markdown format using Bright Data.
+    Reads the BRIGHTDATA_API_KEY and BRIGHTDATA_ZONE secrets from the tool
+    context and sends one request asking for Markdown output; the response
+    text is returned unchanged.
+    Raises:
+        requests.HTTPError: Propagated from ``BrightDataClient.make_request``
+            when the API responds with an error status.
+    Examples:
+        scrape_as_markdown("https://example.com") -> "# Example Page\n\nContent..."
+        scrape_as_markdown("https://news.ycombinator.com") -> "# Hacker News\n..."
+    """
+    api_key = context.get_secret("BRIGHTDATA_API_KEY")
+    zone = context.get_secret("BRIGHTDATA_ZONE")
+    client = BrightDataClient.create_client(api_key=api_key, zone=zone)
+    # The zone is sent explicitly in the payload rather than read from the client.
+    payload = {"url": url, "zone": zone, "format": "raw", "data_format": "markdown"}
+    return client.make_request(payload)
+@tool(requires_secrets=["BRIGHTDATA_API_KEY", "BRIGHTDATA_ZONE"])
+def search_engine(  # noqa: C901
+    context: ToolContext,
+    query: Annotated[str, "Search query"],
+    engine: Annotated[SearchEngine, "Search engine to use"] = SearchEngine.GOOGLE,
+    language: Annotated[str | None, "Two-letter language code"] = None,
+    country_code: Annotated[str | None, "Two-letter country code"] = None,
+    search_type: Annotated[SearchType | None, "Type of search"] = None,
+    start: Annotated[int | None, "Results pagination offset"] = None,
+    num_results: Annotated[int, "Number of results to return. The default is 10"] = 10,
+    location: Annotated[str | None, "Location for search results"] = None,
+    device: Annotated[DeviceType | None, "Device type"] = None,
+    return_json: Annotated[bool, "Return JSON instead of Markdown"] = False,
+) -> Annotated[str, "Search results as Markdown or JSON"]:
+    """
+    Search using Google, Bing, or Yandex with advanced parameters using Bright Data.
+    Builds the search URL for the chosen engine and fetches it through the
+    Bright Data request API, returning the response text unchanged.
+    Note:
+        language, country_code, search_type, start, num_results, location and
+        device, as well as the brd_json flag, are only appended to the URL when
+        engine is Google. For Bing and Yandex only the encoded query is sent.
+    Raises:
+        requests.HTTPError: Propagated from ``BrightDataClient.make_request``
+            when the API responds with an error status.
+    Examples:
+        search_engine("climate change") -> "# Search Results\n\n## Climate Change - Wikipedia\n..."
+        search_engine("Python tutorials", engine="bing", num_results=5) -> "# Bing Results\n..."
+        search_engine("cats", search_type="images", country_code="us") -> "# Image Results\n..."
+    """
+    api_key = context.get_secret("BRIGHTDATA_API_KEY")
+    zone = context.get_secret("BRIGHTDATA_ZONE")
+    client = BrightDataClient.create_client(api_key=api_key, zone=zone)
+    encoded_query = BrightDataClient.encode_query(query)
+    # Base search URL per engine, with the encoded query already filled in.
+    base_urls = {
+        SearchEngine.GOOGLE: f"https://www.google.com/search?q={encoded_query}",
+        SearchEngine.BING: f"https://www.bing.com/search?q={encoded_query}",
+        SearchEngine.YANDEX: f"https://yandex.com/search/?text={encoded_query}",
+    }
+    search_url = base_urls[engine]
+    # Everything below this check only affects Google searches.
+    if engine == SearchEngine.GOOGLE:
+        params = []
+        # language and country_code are inserted as given, without URL encoding.
+        if language:
+            params.append(f"hl={language}")
+        if country_code:
+            params.append(f"gl={country_code}")
+        if search_type:
+            if search_type == SearchType.JOBS:
+                params.append("ibp=htl;jobs")
+            else:
+                search_types = {
+                    SearchType.IMAGES: "isch",
+                    SearchType.SHOPPING: "shop",
+                    SearchType.NEWS: "nws",
+                }
+                # JOBS is handled above, so every remaining SearchType member has an
+                # entry here and the .get() fallback is not reached.
+                tbm_value = search_types.get(search_type, search_type)
+                params.append(f"tbm={tbm_value}")
+        if start is not None:
+            params.append(f"start={start}")
+        # A num_results of 0 is falsy and therefore omits the num parameter.
+        if num_results:
+            params.append(f"num={num_results}")
+        if location:
+            # NOTE(review): the location is only URL-encoded here; confirm this is the
+            # format the uule parameter expects.
+            params.append(f"uule={BrightDataClient.encode_query(location)}")
+        if device:
+            # DeviceType.MOBILE matches none of the branches below and keeps "1".
+            device_value = "1"
+            if device.value in ["ios", "iphone"]:
+                device_value = "ios"
+            elif device.value == "ipad":
+                device_value = "ios_tablet"
+            elif device.value == "android":
+                device_value = "android"
+            elif device.value == "android_tablet":
+                device_value = "android_tablet"
+            params.append(f"brd_mobile={device_value}")
+        if return_json:
+            params.append("brd_json=1")
+        if params:
+            search_url += "&" + "&".join(params)
+    # NOTE(review): with return_json=True on Bing/Yandex the URL carries no brd_json
+    # flag but data_format is still "raw" — confirm the response is JSON in that case.
+    payload = {
+        "url": search_url,
+        "zone": zone,
+        "format": "raw",
+        "data_format": "markdown" if not return_json else "raw",
+    }
+    return client.make_request(payload)
+@tool(requires_secrets=["BRIGHTDATA_API_KEY"])
+def web_data_feed(
+    context: ToolContext,
+    source_type: Annotated[SourceType, "Type of data source"],
+    url: Annotated[str, "URL of the web resource to extract data from"],
+    num_of_reviews: Annotated[
+        int | None,
+        (
+            "Number of reviews to retrieve. Only applicable for "
+            "facebook_company_reviews. Default is None"
+        ),
+    ] = None,
+    timeout: Annotated[int, "Maximum time in seconds to wait for data retrieval"] = 600,
+    polling_interval: Annotated[int, "Time in seconds between polling attempts"] = 1,
+) -> Annotated[str, "Structured data from the requested source as JSON"]:
+    """
+    Extract structured data from various websites like LinkedIn, Amazon, Instagram, etc.
+    NEVER MAKE UP LINKS - IF LINKS ARE NEEDED, EXECUTE search_engine FIRST.
+    Supported source types:
+    - amazon_product, amazon_product_reviews
+    - linkedin_person_profile, linkedin_company_profile
+    - zoominfo_company_profile
+    - instagram_profiles, instagram_posts, instagram_reels, instagram_comments
+    - facebook_posts, facebook_marketplace_listings, facebook_company_reviews
+    - x_posts
+    - zillow_properties_listing
+    - booking_hotel_listings
+    - youtube_videos
+    Returns:
+        The data returned by ``_extract_structured_data``, serialized with
+        ``json.dumps(..., indent=2)``.
+    Raises:
+        RetryableToolError: If num_of_reviews is given for a source type other
+            than facebook_company_reviews, or if the data request yields no
+            snapshot ID.
+        TimeoutError: If the data is not ready before polling gives up.
+    Examples:
+        web_data_feed("amazon_product", "https://amazon.com/dp/B08N5WRWNW")
+            -> "{\"title\": \"Product Name\", ...}"
+        web_data_feed("linkedin_person_profile", "https://linkedin.com/in/johndoe")
+            -> "{\"name\": \"John Doe\", ...}"
+        web_data_feed(
+            "facebook_company_reviews", "https://facebook.com/company", num_of_reviews=50
+        ) -> "[{\"review\": \"...\", ...}]"
+    """
+    api_key = context.get_secret("BRIGHTDATA_API_KEY")
+    # No zone is passed, so the client gets the default zone; only its headers are
+    # used by _extract_structured_data.
+    client = BrightDataClient.create_client(api_key=api_key)
+    # Reject num_of_reviews up front for sources that do not accept it.
+    if num_of_reviews is not None and source_type != SourceType.FACEBOOK_COMPANY_REVIEWS:
+        msg = (
+            f"num_of_reviews parameter is only applicable for facebook_company_reviews, "
+            f"not for {source_type.value}"
+        )
+        prompt = (
+            "The num_of_reviews parameter should only be used with "
+            "facebook_company_reviews source type."
+        )
+        raise RetryableToolError(msg, additional_prompt_content=prompt)
+    data = _extract_structured_data(
+        client=client,
+        source_type=source_type,
+        url=url,
+        num_of_reviews=num_of_reviews,
+        timeout=timeout,
+        polling_interval=polling_interval,
+    )
+    return json.dumps(data, indent=2)
+def _extract_structured_data(
+    client: BrightDataClient,
+    source_type: SourceType,
+    url: str,
+    num_of_reviews: int | None = None,
+    timeout: int = 600,
+    polling_interval: int = 1,
+) -> dict[str, Any]:
+    """
+    Trigger a Bright Data dataset collection for ``url`` and poll until its snapshot is ready.
+    Args:
+        client: Client whose ``headers`` authenticate the dataset API calls.
+        source_type: Data source; selects the dataset ID to trigger.
+        url: URL of the resource to collect.
+        num_of_reviews: Only sent for ``facebook_company_reviews``.
+        timeout: Maximum wall-clock time in seconds to wait for the snapshot.
+        polling_interval: Seconds to sleep between polling attempts.
+    Returns:
+        The decoded JSON body of the first snapshot response whose status is
+        neither "running" nor "building".
+    Raises:
+        RetryableToolError: If the trigger response carries no snapshot ID.
+        TimeoutError: If the snapshot is not ready within ``timeout`` seconds.
+    """
+    datasets = {
+        SourceType.AMAZON_PRODUCT: "gd_l7q7dkf244hwjntr0",
+        SourceType.AMAZON_PRODUCT_REVIEWS: "gd_le8e811kzy4ggddlq",
+        SourceType.LINKEDIN_PERSON_PROFILE: "gd_l1viktl72bvl7bjuj0",
+        SourceType.LINKEDIN_COMPANY_PROFILE: "gd_l1vikfnt1wgvvqz95w",
+        SourceType.ZOOMINFO_COMPANY_PROFILE: "gd_m0ci4a4ivx3j5l6nx",
+        SourceType.INSTAGRAM_PROFILES: "gd_l1vikfch901nx3by4",
+        SourceType.INSTAGRAM_POSTS: "gd_lk5ns7kz21pck8jpis",
+        SourceType.INSTAGRAM_REELS: "gd_lyclm20il4r5helnj",
+        SourceType.INSTAGRAM_COMMENTS: "gd_ltppn085pokosxh13",
+        SourceType.FACEBOOK_POSTS: "gd_lyclm1571iy3mv57zw",
+        SourceType.FACEBOOK_MARKETPLACE_LISTINGS: "gd_lvt9iwuh6fbcwmx1a",
+        SourceType.FACEBOOK_COMPANY_REVIEWS: "gd_m0dtqpiu1mbcyc2g86",
+        SourceType.X_POSTS: "gd_lwxkxvnf1cynvib9co",
+        SourceType.ZILLOW_PROPERTIES_LISTING: "gd_lfqkr8wm13ixtbd8f5",
+        SourceType.BOOKING_HOTEL_LISTINGS: "gd_m5mbdl081229ln6t4a",
+        # NOTE(review): identical to the BOOKING_HOTEL_LISTINGS ID above — looks like a
+        # copy-paste slip; confirm the YouTube videos dataset ID with Bright Data.
+        SourceType.YOUTUBE_VIDEOS: "gd_m5mbdl081229ln6t4a",
+    }
+    dataset_id = datasets[source_type]
+    request_data = {"url": url}
+    if source_type == SourceType.FACEBOOK_COMPANY_REVIEWS and num_of_reviews is not None:
+        request_data["num_of_reviews"] = str(num_of_reviews)
+    trigger_response = requests.post(
+        "https://api.brightdata.com/datasets/v3/trigger",
+        params={"dataset_id": dataset_id, "include_errors": "true"},
+        headers=client.headers,
+        json=[request_data],
+        timeout=30,
+    )
+    trigger_data = trigger_response.json()
+    # A non-object body (e.g. a JSON list) has no .get(); treat it like a missing ID.
+    snapshot_id = trigger_data.get("snapshot_id") if isinstance(trigger_data, dict) else None
+    if not snapshot_id:
+        msg = "No snapshot ID returned from trigger request"
+        prompt = "Invalid input provided, use search_engine to get the relevant data first"
+        raise RetryableToolError(msg, additional_prompt_content=prompt)
+    # Measure elapsed wall-clock time: each attempt costs the request latency plus the
+    # sleep, so counting attempts only equals seconds when both add up to one second.
+    deadline = time.monotonic() + timeout
+    # time.sleep() rejects negative values, so clamp the interval at zero.
+    pause = max(polling_interval, 0)
+    last_error: Exception | None = None
+    while time.monotonic() < deadline:
+        try:
+            snapshot_response = requests.get(
+                f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}",
+                params={"format": "json"},
+                headers=client.headers,
+                timeout=30,
+            )
+            snapshot_data = cast(dict[str, Any], snapshot_response.json())
+        except (requests.RequestException, ValueError) as exc:
+            # Network failure or non-JSON body: remember it and poll again.
+            last_error = exc
+        else:
+            still_pending = isinstance(snapshot_data, dict) and snapshot_data.get("status") in (
+                "running",
+                "building",
+            )
+            if not still_pending:
+                return snapshot_data
+        # Never sleep past the deadline.
+        time.sleep(min(pause, max(deadline - time.monotonic(), 0)))
+    msg = f"Timeout after {timeout} seconds waiting for {source_type.value} data"
+    # Chain the last polling failure (if any) so the cause is not lost.
+    raise TimeoutError(msg) from last_error

arcade_brightdata-0.2.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,21 @@
+Metadata-Version: 2.4
+Name: arcade_brightdata
+Version: 0.2.0
+Summary: Search, Crawl and Scrape any site, at scale, without getting blocked
+Author-email: meirk-brd <meirk@brightdata.com>
+License-File: LICENSE
+Requires-Python: >=3.10
+Requires-Dist: arcade-tdk<4.0.0,>=3.0.0
+Requires-Dist: requests>=2.32.5
+Provides-Extra: dev
+Requires-Dist: arcade-mcp[all]<2.0.0,>=1.2.0; extra == 'dev'
+Requires-Dist: arcade-serve<4.0.0,>=3.0.0; extra == 'dev'
+Requires-Dist: mypy<1.6.0,>=1.5.1; extra == 'dev'
+Requires-Dist: pre-commit<3.5.0,>=3.4.0; extra == 'dev'
+Requires-Dist: pytest-asyncio<0.25.0,>=0.24.0; extra == 'dev'
+Requires-Dist: pytest-cov<4.1.0,>=4.0.0; extra == 'dev'
+Requires-Dist: pytest-mock<3.12.0,>=3.11.1; extra == 'dev'
+Requires-Dist: pytest<8.4.0,>=8.3.0; extra == 'dev'
+Requires-Dist: ruff<0.8.0,>=0.7.4; extra == 'dev'
+Requires-Dist: tox<4.12.0,>=4.11.1; extra == 'dev'
+Requires-Dist: types-requests>=2.32.0; extra == 'dev'

arcade_brightdata-0.2.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+arcade_brightdata/__init__.py,sha256=XMXRA26TDMuWNbQ8_xx1rBGC2sELrU21ABuuXItR5bo,153
+arcade_brightdata/bright_data_client.py,sha256=VJK16o5YDq-R_TOrZ3vVOnzuYZaqfAwCIK-NVb36aYU,1912
+arcade_brightdata/tools/__init__.py,sha256=Bt7FFs6TWixCgmTLpha0lz49ae4Kt_W0ghy6SS2qMUA,188
+arcade_brightdata/tools/bright_data_tools.py,sha256=Yd_KGnV3pFUmHqaawh4e0lzI56eVe6WhSiWqgnMWSE4,11478
+arcade_brightdata-0.2.0.dist-info/METADATA,sha256=_tgwRXpvUoo03IKCpZQAurFK3EhgowUSatVEoSUauMw,933
+arcade_brightdata-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+arcade_brightdata-0.2.0.dist-info/entry_points.txt,sha256=FQs787TL0d9kb6M4ECPu0j6bNxrvupINwuZag0gvW7g,51
+arcade_brightdata-0.2.0.dist-info/licenses/LICENSE,sha256=f4Q0XUZJ2MqZBO1XsqqHhuZfSs2ar1cZEJ45150zERo,1067
+arcade_brightdata-0.2.0.dist-info/RECORD,,

arcade_brightdata-0.2.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.27.0
+Root-Is-Purelib: true
+Tag: py3-none-any

arcade_brightdata-0.2.0.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,2 @@
+[arcade_toolkits]
+toolkit_name = arcade_brightdata

arcade_brightdata-0.2.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025, Arcade AI
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.