arcade-brightdata 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ from arcade_brightdata.tools import scrape_as_markdown, search_engine, web_data_feed
2
+
3
+ __all__ = ["scrape_as_markdown", "search_engine", "web_data_feed"]
@@ -0,0 +1,63 @@
1
+ import json
2
+ from typing import ClassVar
3
+ from urllib.parse import quote
4
+
5
+ import requests
6
+
7
+
8
class BrightDataClient:
    """Small HTTP wrapper around the Bright Data request endpoint.

    Instances are cached per API key on the class, so repeated calls to
    ``create_client`` with the same key hand back the same object.
    """

    # Instance cache keyed by API key only; the zone is not part of the key.
    _clients: ClassVar[dict[str, "BrightDataClient"]] = {}

    def __init__(self, api_key: str, zone: str = "web_unlocker1") -> None:
        """
        Store the credentials and prepare the headers sent with every request.

        Args:
            api_key (str): Bright Data API token, sent as a Bearer token
            zone (str): Bright Data zone name
        """
        self.api_key = api_key
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        }
        self.zone = zone
        self.endpoint = "https://api.brightdata.com/request"

    @classmethod
    def create_client(cls, api_key: str, zone: str = "web_unlocker1") -> "BrightDataClient":
        """Return the cached client for ``api_key``, creating it on first use.

        The zone of the returned client is always overwritten with ``zone``,
        including when the client came from the cache.
        """
        cached = cls._clients.get(api_key)
        if cached is None:
            cached = cls(api_key, zone)
            cls._clients[api_key] = cached

        # The caller chooses the zone on every call, so refresh it even on a cache hit.
        cached.zone = zone
        return cached

    @classmethod
    def clear_cache(cls) -> None:
        """Drop every cached client instance."""
        cls._clients.clear()

    def make_request(self, payload: dict) -> str:
        """
        POST ``payload`` as JSON to the Bright Data endpoint.

        Args:
            payload (dict): Request payload, serialized with ``json.dumps``
        Returns:
            str: Body of the response as text
        Raises:
            requests.HTTPError: via ``raise_for_status`` on an error status code
        """
        body = json.dumps(payload)
        reply = requests.post(self.endpoint, headers=self.headers, data=body, timeout=30)
        reply.raise_for_status()

        text: str = reply.text
        return text

    @staticmethod
    def encode_query(query: str) -> str:
        """Percent-encode a search query (``/`` is left unescaped)."""
        # safe="/" is quote()'s default, spelled out here for clarity.
        return quote(query, safe="/")
@@ -0,0 +1,7 @@
1
+ from arcade_brightdata.tools.bright_data_tools import (
2
+ scrape_as_markdown,
3
+ search_engine,
4
+ web_data_feed,
5
+ )
6
+
7
+ __all__ = ["scrape_as_markdown", "search_engine", "web_data_feed"]
@@ -0,0 +1,312 @@
1
+ import json
2
+ import time
3
+ from enum import Enum
4
+ from typing import Annotated, Any, cast
5
+
6
+ import requests
7
+ from arcade_core.errors import RetryableToolError
8
+ from arcade_tdk import ToolContext, tool
9
+
10
+ from arcade_brightdata.bright_data_client import BrightDataClient
11
+
12
+
13
class DeviceType(str, Enum):
    """Device profiles accepted by the ``device`` parameter of ``search_engine``."""

    # Generic mobile profile.
    MOBILE = "mobile"

    # Apple devices.
    IOS = "ios"
    IPHONE = "iphone"
    IPAD = "ipad"

    # Android devices.
    ANDROID = "android"
    ANDROID_TABLET = "android_tablet"
20
+
21
+
22
class SearchEngine(str, Enum):
    """Search engines that ``search_engine`` can build a results URL for."""

    GOOGLE = "google"
    BING = "bing"
    YANDEX = "yandex"
26
+
27
+
28
class SearchType(str, Enum):
    """Specialised result verticals selectable through ``search_type``."""

    IMAGES = "images"
    SHOPPING = "shopping"
    NEWS = "news"
    JOBS = "jobs"
33
+
34
+
35
class SourceType(str, Enum):
    """Structured-data sources that ``web_data_feed`` can extract from."""

    # Amazon
    AMAZON_PRODUCT = "amazon_product"
    AMAZON_PRODUCT_REVIEWS = "amazon_product_reviews"

    # LinkedIn
    LINKEDIN_PERSON_PROFILE = "linkedin_person_profile"
    LINKEDIN_COMPANY_PROFILE = "linkedin_company_profile"

    # ZoomInfo
    ZOOMINFO_COMPANY_PROFILE = "zoominfo_company_profile"

    # Instagram
    INSTAGRAM_PROFILES = "instagram_profiles"
    INSTAGRAM_POSTS = "instagram_posts"
    INSTAGRAM_REELS = "instagram_reels"
    INSTAGRAM_COMMENTS = "instagram_comments"

    # Facebook
    FACEBOOK_POSTS = "facebook_posts"
    FACEBOOK_MARKETPLACE_LISTINGS = "facebook_marketplace_listings"
    FACEBOOK_COMPANY_REVIEWS = "facebook_company_reviews"

    # Other platforms
    X_POSTS = "x_posts"
    ZILLOW_PROPERTIES_LISTING = "zillow_properties_listing"
    BOOKING_HOTEL_LISTINGS = "booking_hotel_listings"
    YOUTUBE_VIDEOS = "youtube_videos"
52
+
53
+
54
@tool(requires_secrets=["BRIGHTDATA_API_KEY", "BRIGHTDATA_ZONE"])
def scrape_as_markdown(
    context: ToolContext,
    url: Annotated[str, "URL to scrape"],
) -> Annotated[str, "Scraped webpage content as Markdown"]:
    """
    Scrape a webpage and return content in Markdown format using Bright Data.

    Examples:
        scrape_as_markdown("https://example.com") -> "# Example Page\n\nContent..."
        scrape_as_markdown("https://news.ycombinator.com") -> "# Hacker News\n..."
    """
    # NOTE(review): docstring text kept verbatim; @tool may surface it as the
    # tool description at runtime - confirm before rewording.

    # Secrets are read in the same order as before: API key first, then zone.
    api_key, zone = (
        context.get_secret(secret_name)
        for secret_name in ("BRIGHTDATA_API_KEY", "BRIGHTDATA_ZONE")
    )
    brightdata = BrightDataClient.create_client(api_key=api_key, zone=zone)

    # "raw" format with "markdown" data_format: the response body is the page as Markdown text.
    request_body = dict(url=url, zone=zone, format="raw", data_format="markdown")
    return brightdata.make_request(request_body)
72
+
73
+
74
def _google_search_params(
    language: str | None,
    country_code: str | None,
    search_type: SearchType | None,
    start: int | None,
    num_results: int,
    location: str | None,
    device: DeviceType | None,
    return_json: bool,
) -> list[str]:
    """Build the Google-specific ``key=value`` query parameters, in a fixed order."""
    # Google "tbm" values for the verticals that use that parameter (jobs does not).
    tbm_values = {
        SearchType.IMAGES: "isch",
        SearchType.SHOPPING: "shop",
        SearchType.NEWS: "nws",
    }
    # Values for the "brd_mobile" parameter; any device not listed here maps to "1".
    mobile_values = {
        "ios": "ios",
        "iphone": "ios",
        "ipad": "ios_tablet",
        "android": "android",
        "android_tablet": "android_tablet",
    }

    params: list[str] = []

    # Free-text values are percent-encoded so that characters such as "&" or "="
    # cannot inject additional query parameters. Plain codes like "en" are unchanged.
    if language:
        params.append(f"hl={BrightDataClient.encode_query(language)}")

    if country_code:
        params.append(f"gl={BrightDataClient.encode_query(country_code)}")

    if search_type == SearchType.JOBS:
        # Jobs search is selected through "ibp" rather than "tbm".
        params.append("ibp=htl;jobs")
    elif search_type:
        params.append(f"tbm={tbm_values.get(search_type, search_type)}")

    # "start" may legitimately be 0, so compare against None rather than truthiness.
    if start is not None:
        params.append(f"start={start}")

    if num_results:
        params.append(f"num={num_results}")

    if location:
        params.append(f"uule={BrightDataClient.encode_query(location)}")

    if device:
        params.append(f"brd_mobile={mobile_values.get(device.value, '1')}")

    if return_json:
        params.append("brd_json=1")

    return params


@tool(requires_secrets=["BRIGHTDATA_API_KEY", "BRIGHTDATA_ZONE"])
def search_engine(
    context: ToolContext,
    query: Annotated[str, "Search query"],
    engine: Annotated[SearchEngine, "Search engine to use"] = SearchEngine.GOOGLE,
    language: Annotated[str | None, "Two-letter language code"] = None,
    country_code: Annotated[str | None, "Two-letter country code"] = None,
    search_type: Annotated[SearchType | None, "Type of search"] = None,
    start: Annotated[int | None, "Results pagination offset"] = None,
    num_results: Annotated[int, "Number of results to return. The default is 10"] = 10,
    location: Annotated[str | None, "Location for search results"] = None,
    device: Annotated[DeviceType | None, "Device type"] = None,
    return_json: Annotated[bool, "Return JSON instead of Markdown"] = False,
) -> Annotated[str, "Search results as Markdown or JSON"]:
    """
    Search using Google, Bing, or Yandex with advanced parameters using Bright Data.

    Note: language, country_code, search_type, start, num_results, location and
    device are only added to the search URL when the engine is Google.

    Examples:
        search_engine("climate change") -> "# Search Results\n\n## Climate Change - Wikipedia\n..."
        search_engine("Python tutorials", engine="bing", num_results=5) -> "# Bing Results\n..."
        search_engine("cats", search_type="images", country_code="us") -> "# Image Results\n..."
    """
    api_key = context.get_secret("BRIGHTDATA_API_KEY")
    zone = context.get_secret("BRIGHTDATA_ZONE")
    client = BrightDataClient.create_client(api_key=api_key, zone=zone)

    encoded_query = BrightDataClient.encode_query(query)

    base_urls = {
        SearchEngine.GOOGLE: f"https://www.google.com/search?q={encoded_query}",
        SearchEngine.BING: f"https://www.bing.com/search?q={encoded_query}",
        SearchEngine.YANDEX: f"https://yandex.com/search/?text={encoded_query}",
    }

    search_url = base_urls[engine]

    # Only the Google URL is extended with the optional filters.
    if engine == SearchEngine.GOOGLE:
        params = _google_search_params(
            language=language,
            country_code=country_code,
            search_type=search_type,
            start=start,
            num_results=num_results,
            location=location,
            device=device,
            return_json=return_json,
        )
        if params:
            search_url += "&" + "&".join(params)

    payload = {
        "url": search_url,
        "zone": zone,
        "format": "raw",
        # JSON output is requested as the raw body; otherwise ask for Markdown.
        "data_format": "raw" if return_json else "markdown",
    }

    return client.make_request(payload)
168
+
169
+
170
@tool(requires_secrets=["BRIGHTDATA_API_KEY"])
def web_data_feed(
    context: ToolContext,
    source_type: Annotated[SourceType, "Type of data source"],
    url: Annotated[str, "URL of the web resource to extract data from"],
    num_of_reviews: Annotated[
        int | None,
        (
            "Number of reviews to retrieve. Only applicable for "
            "facebook_company_reviews. Default is None"
        ),
    ] = None,
    timeout: Annotated[int, "Maximum time in seconds to wait for data retrieval"] = 600,
    polling_interval: Annotated[int, "Time in seconds between polling attempts"] = 1,
) -> Annotated[str, "Structured data from the requested source as JSON"]:
    """
    Extract structured data from various websites like LinkedIn, Amazon, Instagram, etc.
    NEVER MADE UP LINKS - IF LINKS ARE NEEDED, EXECUTE search_engine FIRST.
    Supported source types:
    - amazon_product, amazon_product_reviews
    - linkedin_person_profile, linkedin_company_profile
    - zoominfo_company_profile
    - instagram_profiles, instagram_posts, instagram_reels, instagram_comments
    - facebook_posts, facebook_marketplace_listings, facebook_company_reviews
    - x_posts
    - zillow_properties_listing
    - booking_hotel_listings
    - youtube_videos

    Examples:
        web_data_feed("amazon_product", "https://amazon.com/dp/B08N5WRWNW")
        -> "{\"title\": \"Product Name\", ...}"
        web_data_feed("linkedin_person_profile", "https://linkedin.com/in/johndoe")
        -> "{\"name\": \"John Doe\", ...}"
        web_data_feed(
            "facebook_company_reviews", "https://facebook.com/company", num_of_reviews=50
        ) -> "[{\"review\": \"...\", ...}]"
    """
    # NOTE(review): docstring text kept verbatim; @tool may surface it as the
    # tool description at runtime - confirm before rewording.
    client = BrightDataClient.create_client(api_key=context.get_secret("BRIGHTDATA_API_KEY"))

    # Guard clause: a review count only makes sense for Facebook company reviews.
    reviews_requested = num_of_reviews is not None
    if reviews_requested and source_type != SourceType.FACEBOOK_COMPANY_REVIEWS:
        raise RetryableToolError(
            "num_of_reviews parameter is only applicable for facebook_company_reviews, "
            f"not for {source_type.value}",
            additional_prompt_content=(
                "The num_of_reviews parameter should only be used with "
                "facebook_company_reviews source type."
            ),
        )

    extracted = _extract_structured_data(
        client=client,
        source_type=source_type,
        url=url,
        num_of_reviews=num_of_reviews,
        timeout=timeout,
        polling_interval=polling_interval,
    )
    # Pretty-print so the returned JSON text is readable.
    return json.dumps(extracted, indent=2)
229
+
230
+
231
def _extract_structured_data(
    client: BrightDataClient,
    source_type: SourceType,
    url: str,
    num_of_reviews: int | None = None,
    timeout: int = 600,
    polling_interval: int = 1,
) -> dict[str, Any]:
    """
    Trigger a Bright Data dataset collection for ``url`` and poll until it is ready.

    Args:
        client: Client whose ``headers`` are used to authenticate both requests.
        source_type: Selects which dataset id is triggered.
        url: Page to collect data from.
        num_of_reviews: Sent only for ``facebook_company_reviews``; ignored otherwise.
        timeout: Maximum number of seconds to keep polling for the snapshot.
        polling_interval: Seconds to sleep between polling attempts.

    Returns:
        The decoded JSON body of the snapshot response once its status is no
        longer "running" or "building".

    Raises:
        RetryableToolError: If the trigger response carries no snapshot id.
        TimeoutError: If the snapshot is not ready before ``timeout`` seconds elapse.
            The last polling error, if any, is attached as the cause.
    """
    datasets = {
        SourceType.AMAZON_PRODUCT: "gd_l7q7dkf244hwjntr0",
        SourceType.AMAZON_PRODUCT_REVIEWS: "gd_le8e811kzy4ggddlq",
        SourceType.LINKEDIN_PERSON_PROFILE: "gd_l1viktl72bvl7bjuj0",
        SourceType.LINKEDIN_COMPANY_PROFILE: "gd_l1vikfnt1wgvvqz95w",
        SourceType.ZOOMINFO_COMPANY_PROFILE: "gd_m0ci4a4ivx3j5l6nx",
        SourceType.INSTAGRAM_PROFILES: "gd_l1vikfch901nx3by4",
        SourceType.INSTAGRAM_POSTS: "gd_lk5ns7kz21pck8jpis",
        SourceType.INSTAGRAM_REELS: "gd_lyclm20il4r5helnj",
        SourceType.INSTAGRAM_COMMENTS: "gd_ltppn085pokosxh13",
        SourceType.FACEBOOK_POSTS: "gd_lyclm1571iy3mv57zw",
        SourceType.FACEBOOK_MARKETPLACE_LISTINGS: "gd_lvt9iwuh6fbcwmx1a",
        SourceType.FACEBOOK_COMPANY_REVIEWS: "gd_m0dtqpiu1mbcyc2g86",
        SourceType.X_POSTS: "gd_lwxkxvnf1cynvib9co",
        SourceType.ZILLOW_PROPERTIES_LISTING: "gd_lfqkr8wm13ixtbd8f5",
        SourceType.BOOKING_HOTEL_LISTINGS: "gd_m5mbdl081229ln6t4a",
        # NOTE(review): identical to the BOOKING_HOTEL_LISTINGS id above, which looks
        # like a copy-paste slip. Confirm the YouTube videos dataset id with Bright Data.
        SourceType.YOUTUBE_VIDEOS: "gd_m5mbdl081229ln6t4a",
    }

    dataset_id = datasets[source_type]

    request_data = {"url": url}
    if source_type == SourceType.FACEBOOK_COMPANY_REVIEWS and num_of_reviews is not None:
        request_data["num_of_reviews"] = str(num_of_reviews)

    trigger_response = requests.post(
        "https://api.brightdata.com/datasets/v3/trigger",
        params={"dataset_id": dataset_id, "include_errors": "true"},
        headers=client.headers,
        json=[request_data],
        timeout=30,
    )

    trigger_data = trigger_response.json()
    # A non-dict body (e.g. a JSON list) cannot carry a snapshot id either; treat it
    # like a missing id instead of failing with AttributeError.
    snapshot_id = trigger_data.get("snapshot_id") if isinstance(trigger_data, dict) else None
    if not snapshot_id:
        msg = "No snapshot ID returned from trigger request"
        prompt = "Invalid input provided, use search_engine to get the relevant data first"
        raise RetryableToolError(msg, additional_prompt_content=prompt)

    # Enforce ``timeout`` as real elapsed seconds. The attempt cap is kept so that a
    # zero polling interval cannot issue an unbounded number of requests.
    deadline = time.monotonic() + timeout
    max_attempts = timeout
    attempts = 0
    last_error: Exception | None = None

    while attempts < max_attempts and time.monotonic() < deadline:
        attempts += 1
        try:
            snapshot_response = requests.get(
                f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}",
                params={"format": "json"},
                headers=client.headers,
                timeout=30,
            )
            snapshot_data = cast(dict[str, Any], snapshot_response.json())
        except Exception as exc:
            # Best-effort polling: keep retrying, but remember the failure so it can
            # be reported if the deadline is reached.
            last_error = exc
        else:
            still_pending = isinstance(snapshot_data, dict) and snapshot_data.get("status") in (
                "running",
                "building",
            )
            if not still_pending:
                return snapshot_data

        time.sleep(polling_interval)

    msg = f"Timeout after {timeout} seconds waiting for {source_type.value} data"
    raise TimeoutError(msg) from last_error
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.4
2
+ Name: arcade_brightdata
3
+ Version: 0.2.0
4
+ Summary: Search, Crawl and Scrape any site, at scale, without getting blocked
5
+ Author-email: meirk-brd <meirk@brightdata.com>
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.10
8
+ Requires-Dist: arcade-tdk<4.0.0,>=3.0.0
9
+ Requires-Dist: requests>=2.32.5
10
+ Provides-Extra: dev
11
+ Requires-Dist: arcade-mcp[all]<2.0.0,>=1.2.0; extra == 'dev'
12
+ Requires-Dist: arcade-serve<4.0.0,>=3.0.0; extra == 'dev'
13
+ Requires-Dist: mypy<1.6.0,>=1.5.1; extra == 'dev'
14
+ Requires-Dist: pre-commit<3.5.0,>=3.4.0; extra == 'dev'
15
+ Requires-Dist: pytest-asyncio<0.25.0,>=0.24.0; extra == 'dev'
16
+ Requires-Dist: pytest-cov<4.1.0,>=4.0.0; extra == 'dev'
17
+ Requires-Dist: pytest-mock<3.12.0,>=3.11.1; extra == 'dev'
18
+ Requires-Dist: pytest<8.4.0,>=8.3.0; extra == 'dev'
19
+ Requires-Dist: ruff<0.8.0,>=0.7.4; extra == 'dev'
20
+ Requires-Dist: tox<4.12.0,>=4.11.1; extra == 'dev'
21
+ Requires-Dist: types-requests>=2.32.0; extra == 'dev'
@@ -0,0 +1,9 @@
1
+ arcade_brightdata/__init__.py,sha256=XMXRA26TDMuWNbQ8_xx1rBGC2sELrU21ABuuXItR5bo,153
2
+ arcade_brightdata/bright_data_client.py,sha256=VJK16o5YDq-R_TOrZ3vVOnzuYZaqfAwCIK-NVb36aYU,1912
3
+ arcade_brightdata/tools/__init__.py,sha256=Bt7FFs6TWixCgmTLpha0lz49ae4Kt_W0ghy6SS2qMUA,188
4
+ arcade_brightdata/tools/bright_data_tools.py,sha256=Yd_KGnV3pFUmHqaawh4e0lzI56eVe6WhSiWqgnMWSE4,11478
5
+ arcade_brightdata-0.2.0.dist-info/METADATA,sha256=_tgwRXpvUoo03IKCpZQAurFK3EhgowUSatVEoSUauMw,933
6
+ arcade_brightdata-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
7
+ arcade_brightdata-0.2.0.dist-info/entry_points.txt,sha256=FQs787TL0d9kb6M4ECPu0j6bNxrvupINwuZag0gvW7g,51
8
+ arcade_brightdata-0.2.0.dist-info/licenses/LICENSE,sha256=f4Q0XUZJ2MqZBO1XsqqHhuZfSs2ar1cZEJ45150zERo,1067
9
+ arcade_brightdata-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [arcade_toolkits]
2
+ toolkit_name = arcade_brightdata
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025, Arcade AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.