arcade-brightdata 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arcade_brightdata-0.2.0/.gitignore +181 -0
- arcade_brightdata-0.2.0/.pre-commit-config.yaml +18 -0
- arcade_brightdata-0.2.0/.ruff.toml +44 -0
- arcade_brightdata-0.2.0/LICENSE +21 -0
- arcade_brightdata-0.2.0/Makefile +55 -0
- arcade_brightdata-0.2.0/PKG-INFO +21 -0
- arcade_brightdata-0.2.0/arcade_brightdata/__init__.py +3 -0
- arcade_brightdata-0.2.0/arcade_brightdata/bright_data_client.py +63 -0
- arcade_brightdata-0.2.0/arcade_brightdata/tools/__init__.py +7 -0
- arcade_brightdata-0.2.0/arcade_brightdata/tools/bright_data_tools.py +312 -0
- arcade_brightdata-0.2.0/pyproject.toml +60 -0
- arcade_brightdata-0.2.0/tests/__init__.py +0 -0
- arcade_brightdata-0.2.0/tests/test_brightdata.py +414 -0
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
.DS_Store
|
|
2
|
+
credentials.yaml
|
|
3
|
+
docker/credentials.yaml
|
|
4
|
+
|
|
5
|
+
*.lock
|
|
6
|
+
|
|
7
|
+
# example data
|
|
8
|
+
examples/data
|
|
9
|
+
scratch
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
docs/source
|
|
13
|
+
|
|
14
|
+
# From https://raw.githubusercontent.com/github/gitignore/main/Python.gitignore
|
|
15
|
+
|
|
16
|
+
# Byte-compiled / optimized / DLL files
|
|
17
|
+
__pycache__/
|
|
18
|
+
*.py[cod]
|
|
19
|
+
*$py.class
|
|
20
|
+
|
|
21
|
+
# C extensions
|
|
22
|
+
*.so
|
|
23
|
+
|
|
24
|
+
# Distribution / packaging
|
|
25
|
+
.Python
|
|
26
|
+
build/
|
|
27
|
+
develop-eggs/
|
|
28
|
+
dist/
|
|
29
|
+
downloads/
|
|
30
|
+
eggs/
|
|
31
|
+
.eggs/
|
|
32
|
+
lib/
|
|
33
|
+
lib64/
|
|
34
|
+
parts/
|
|
35
|
+
sdist/
|
|
36
|
+
var/
|
|
37
|
+
wheels/
|
|
38
|
+
share/python-wheels/
|
|
39
|
+
*.egg-info/
|
|
40
|
+
.installed.cfg
|
|
41
|
+
*.egg
|
|
42
|
+
MANIFEST
|
|
43
|
+
|
|
44
|
+
# PyInstaller
|
|
45
|
+
# Usually these files are written by a python script from a template
|
|
46
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
47
|
+
*.manifest
|
|
48
|
+
*.spec
|
|
49
|
+
|
|
50
|
+
# Installer logs
|
|
51
|
+
pip-log.txt
|
|
52
|
+
pip-delete-this-directory.txt
|
|
53
|
+
|
|
54
|
+
# Unit test / coverage reports
|
|
55
|
+
htmlcov/
|
|
56
|
+
.tox/
|
|
57
|
+
.nox/
|
|
58
|
+
.coverage
|
|
59
|
+
.coverage.*
|
|
60
|
+
.cache
|
|
61
|
+
nosetests.xml
|
|
62
|
+
coverage.xml
|
|
63
|
+
*.cover
|
|
64
|
+
*.py,cover
|
|
65
|
+
.hypothesis/
|
|
66
|
+
.pytest_cache/
|
|
67
|
+
cover/
|
|
68
|
+
|
|
69
|
+
# Translations
|
|
70
|
+
*.mo
|
|
71
|
+
*.pot
|
|
72
|
+
|
|
73
|
+
# Django stuff:
|
|
74
|
+
*.log
|
|
75
|
+
local_settings.py
|
|
76
|
+
db.sqlite3
|
|
77
|
+
db.sqlite3-journal
|
|
78
|
+
|
|
79
|
+
# Flask stuff:
|
|
80
|
+
instance/
|
|
81
|
+
.webassets-cache
|
|
82
|
+
|
|
83
|
+
# Scrapy stuff:
|
|
84
|
+
.scrapy
|
|
85
|
+
|
|
86
|
+
# Sphinx documentation
|
|
87
|
+
docs/_build/
|
|
88
|
+
|
|
89
|
+
# PyBuilder
|
|
90
|
+
.pybuilder/
|
|
91
|
+
target/
|
|
92
|
+
|
|
93
|
+
# Jupyter Notebook
|
|
94
|
+
.ipynb_checkpoints
|
|
95
|
+
|
|
96
|
+
# IPython
|
|
97
|
+
profile_default/
|
|
98
|
+
ipython_config.py
|
|
99
|
+
|
|
100
|
+
# IDE
|
|
101
|
+
*.code-workspace
|
|
102
|
+
|
|
103
|
+
# pyenv
|
|
104
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
105
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
106
|
+
# .python-version
|
|
107
|
+
|
|
108
|
+
# pipenv
|
|
109
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
110
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
111
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
112
|
+
# install all needed dependencies.
|
|
113
|
+
#Pipfile.lock
|
|
114
|
+
|
|
115
|
+
# poetry
|
|
116
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
117
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
118
|
+
# commonly ignored for libraries.
|
|
119
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
120
|
+
poetry.lock
|
|
121
|
+
|
|
122
|
+
# pdm
|
|
123
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
124
|
+
#pdm.lock
|
|
125
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
126
|
+
# in version control.
|
|
127
|
+
# https://pdm.fming.dev/#use-with-ide
|
|
128
|
+
.pdm.toml
|
|
129
|
+
|
|
130
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
131
|
+
__pypackages__/
|
|
132
|
+
|
|
133
|
+
# Celery stuff
|
|
134
|
+
celerybeat-schedule
|
|
135
|
+
celerybeat.pid
|
|
136
|
+
|
|
137
|
+
# SageMath parsed files
|
|
138
|
+
*.sage.py
|
|
139
|
+
|
|
140
|
+
# Environments
|
|
141
|
+
.env
|
|
142
|
+
.venv
|
|
143
|
+
env/
|
|
144
|
+
venv/
|
|
145
|
+
ENV/
|
|
146
|
+
env.bak/
|
|
147
|
+
venv.bak/
|
|
148
|
+
|
|
149
|
+
# Spyder project settings
|
|
150
|
+
.spyderproject
|
|
151
|
+
.spyproject
|
|
152
|
+
|
|
153
|
+
# Rope project settings
|
|
154
|
+
.ropeproject
|
|
155
|
+
|
|
156
|
+
# mkdocs documentation
|
|
157
|
+
/site
|
|
158
|
+
|
|
159
|
+
# mypy
|
|
160
|
+
.mypy_cache/
|
|
161
|
+
.dmypy.json
|
|
162
|
+
dmypy.json
|
|
163
|
+
|
|
164
|
+
# Pyre type checker
|
|
165
|
+
.pyre/
|
|
166
|
+
|
|
167
|
+
# pytype static type analyzer
|
|
168
|
+
.pytype/
|
|
169
|
+
|
|
170
|
+
# Cython debug symbols
|
|
171
|
+
cython_debug/
|
|
172
|
+
|
|
173
|
+
# PyCharm
|
|
174
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
175
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
176
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
177
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
178
|
+
#.idea/
|
|
179
|
+
|
|
180
|
+
# Docs
|
|
181
|
+
libs/arcade-mcp-server/site/*
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
files: ^arcade_brightdata/.*
|
|
2
|
+
repos:
|
|
3
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
4
|
+
rev: "v4.4.0"
|
|
5
|
+
hooks:
|
|
6
|
+
- id: check-case-conflict
|
|
7
|
+
- id: check-merge-conflict
|
|
8
|
+
- id: check-toml
|
|
9
|
+
- id: check-yaml
|
|
10
|
+
- id: end-of-file-fixer
|
|
11
|
+
- id: trailing-whitespace
|
|
12
|
+
|
|
13
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
14
|
+
rev: v0.6.7
|
|
15
|
+
hooks:
|
|
16
|
+
- id: ruff
|
|
17
|
+
args: [--fix]
|
|
18
|
+
- id: ruff-format
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
target-version = "py310"
|
|
2
|
+
line-length = 100
|
|
3
|
+
fix = true
|
|
4
|
+
|
|
5
|
+
[lint]
|
|
6
|
+
select = [
|
|
7
|
+
# flake8-2020
|
|
8
|
+
"YTT",
|
|
9
|
+
# flake8-bandit
|
|
10
|
+
"S",
|
|
11
|
+
# flake8-bugbear
|
|
12
|
+
"B",
|
|
13
|
+
# flake8-builtins
|
|
14
|
+
"A",
|
|
15
|
+
# flake8-comprehensions
|
|
16
|
+
"C4",
|
|
17
|
+
# flake8-debugger
|
|
18
|
+
"T10",
|
|
19
|
+
# flake8-simplify
|
|
20
|
+
"SIM",
|
|
21
|
+
# isort
|
|
22
|
+
"I",
|
|
23
|
+
# mccabe
|
|
24
|
+
"C90",
|
|
25
|
+
# pycodestyle
|
|
26
|
+
"E", "W",
|
|
27
|
+
# pyflakes
|
|
28
|
+
"F",
|
|
29
|
+
# pygrep-hooks
|
|
30
|
+
"PGH",
|
|
31
|
+
# pyupgrade
|
|
32
|
+
"UP",
|
|
33
|
+
# ruff
|
|
34
|
+
"RUF",
|
|
35
|
+
# tryceratops
|
|
36
|
+
"TRY",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[lint.per-file-ignores]
|
|
40
|
+
"**/tests/*" = ["S101"]
|
|
41
|
+
|
|
42
|
+
[format]
|
|
43
|
+
preview = true
|
|
44
|
+
skip-magic-trailing-comma = false
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025, Arcade AI
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
.PHONY: help
|
|
2
|
+
|
|
3
|
+
help:
|
|
4
|
+
@echo "🛠️ github Commands:\n"
|
|
5
|
+
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
|
|
6
|
+
|
|
7
|
+
.PHONY: install
|
|
8
|
+
install: ## Install the uv environment and install all packages with dependencies
|
|
9
|
+
@echo "🚀 Creating virtual environment and installing all packages using uv"
|
|
10
|
+
@uv sync --active --all-extras --no-sources
|
|
11
|
+
@if [ -f .pre-commit-config.yaml ]; then uv run --no-sources pre-commit install; fi
|
|
12
|
+
@echo "✅ All packages and dependencies installed via uv"
|
|
13
|
+
|
|
14
|
+
.PHONY: install-local
|
|
15
|
+
install-local: ## Install the uv environment and install all packages with dependencies with local Arcade sources
|
|
16
|
+
@echo "🚀 Creating virtual environment and installing all packages using uv"
|
|
17
|
+
@uv sync --active --all-extras
|
|
18
|
+
@if [ -f .pre-commit-config.yaml ]; then uv run pre-commit install; fi
|
|
19
|
+
@echo "✅ All packages and dependencies installed via uv"
|
|
20
|
+
|
|
21
|
+
.PHONY: build
|
|
22
|
+
build: clean-build ## Build wheel file using poetry
|
|
23
|
+
@echo "🚀 Creating wheel file"
|
|
24
|
+
uv build
|
|
25
|
+
|
|
26
|
+
.PHONY: clean-build
|
|
27
|
+
clean-build: ## clean build artifacts
|
|
28
|
+
@echo "🗑️ Cleaning dist directory"
|
|
29
|
+
rm -rf dist
|
|
30
|
+
|
|
31
|
+
.PHONY: test
|
|
32
|
+
test: ## Test the code with pytest
|
|
33
|
+
@echo "🚀 Testing code: Running pytest"
|
|
34
|
+
@uv run --no-sources pytest -W ignore -v --cov --cov-config=pyproject.toml --cov-report=xml
|
|
35
|
+
|
|
36
|
+
.PHONY: coverage
|
|
37
|
+
coverage: ## Generate coverage report
|
|
38
|
+
@echo "coverage report"
|
|
39
|
+
@uv run --no-sources coverage report
|
|
40
|
+
@echo "Generating coverage report"
|
|
41
|
+
@uv run --no-sources coverage html
|
|
42
|
+
|
|
43
|
+
.PHONY: bump-version
|
|
44
|
+
bump-version: ## Bump the version in the pyproject.toml file by a patch version
|
|
45
|
+
@echo "🚀 Bumping version in pyproject.toml"
|
|
46
|
+
uv version --no-sources --bump patch
|
|
47
|
+
|
|
48
|
+
.PHONY: check
|
|
49
|
+
check: ## Run code quality tools.
|
|
50
|
+
@if [ -f .pre-commit-config.yaml ]; then\
|
|
51
|
+
echo "🚀 Linting code: Running pre-commit";\
|
|
52
|
+
uv run --no-sources pre-commit run -a;\
|
|
53
|
+
fi
|
|
54
|
+
@echo "🚀 Static type checking: Running mypy"
|
|
55
|
+
@uv run --no-sources mypy --config-file=pyproject.toml
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: arcade_brightdata
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Search, Crawl and Scrape any site, at scale, without getting blocked
|
|
5
|
+
Author-email: meirk-brd <meirk@brightdata.com>
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Requires-Dist: arcade-tdk<4.0.0,>=3.0.0
|
|
9
|
+
Requires-Dist: requests>=2.32.5
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: arcade-mcp[all]<2.0.0,>=1.2.0; extra == 'dev'
|
|
12
|
+
Requires-Dist: arcade-serve<4.0.0,>=3.0.0; extra == 'dev'
|
|
13
|
+
Requires-Dist: mypy<1.6.0,>=1.5.1; extra == 'dev'
|
|
14
|
+
Requires-Dist: pre-commit<3.5.0,>=3.4.0; extra == 'dev'
|
|
15
|
+
Requires-Dist: pytest-asyncio<0.25.0,>=0.24.0; extra == 'dev'
|
|
16
|
+
Requires-Dist: pytest-cov<4.1.0,>=4.0.0; extra == 'dev'
|
|
17
|
+
Requires-Dist: pytest-mock<3.12.0,>=3.11.1; extra == 'dev'
|
|
18
|
+
Requires-Dist: pytest<8.4.0,>=8.3.0; extra == 'dev'
|
|
19
|
+
Requires-Dist: ruff<0.8.0,>=0.7.4; extra == 'dev'
|
|
20
|
+
Requires-Dist: tox<4.12.0,>=4.11.1; extra == 'dev'
|
|
21
|
+
Requires-Dist: types-requests>=2.32.0; extra == 'dev'
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import ClassVar
|
|
3
|
+
from urllib.parse import quote
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class BrightDataClient:
    """Engine for interacting with Bright Data API with connection management."""

    # Process-wide cache of clients, keyed by API token.
    _clients: ClassVar[dict[str, "BrightDataClient"]] = {}

    def __init__(self, api_key: str, zone: str = "web_unlocker1") -> None:
        """
        Initialize with API token and default zone.

        Args:
            api_key (str): Your Bright Data API token
            zone (str): Bright Data zone name
        """
        self.api_key = api_key
        # Bearer auth plus a JSON content type, reused for every request.
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        }
        self.zone = zone
        self.endpoint = "https://api.brightdata.com/request"

    @classmethod
    def create_client(cls, api_key: str, zone: str = "web_unlocker1") -> "BrightDataClient":
        """Create or get cached client instance using API key only."""
        cached = cls._clients.get(api_key)
        if cached is None:
            cached = cls(api_key, zone)
            cls._clients[api_key] = cached

        # The zone is refreshed on every call so the caller controls it
        # per request even when the client itself was cached.
        cached.zone = zone
        return cached

    @classmethod
    def clear_cache(cls) -> None:
        """Clear the client cache."""
        cls._clients.clear()

    def make_request(self, payload: dict) -> str:
        """
        Make a request to Bright Data API.

        Args:
            payload (Dict): Request payload

        Returns:
            str: Response text

        Raises:
            requests.HTTPError: when the API answers with a non-2xx status.
        """
        body = json.dumps(payload)
        resp = requests.post(self.endpoint, headers=self.headers, data=body, timeout=30)
        resp.raise_for_status()
        return resp.text

    @staticmethod
    def encode_query(query: str) -> str:
        """URL encode a search query."""
        return quote(query)
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import time
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from typing import Annotated, Any, cast
|
|
5
|
+
|
|
6
|
+
import requests
|
|
7
|
+
from arcade_core.errors import RetryableToolError
|
|
8
|
+
from arcade_tdk import ToolContext, tool
|
|
9
|
+
|
|
10
|
+
from arcade_brightdata.bright_data_client import BrightDataClient
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DeviceType(str, Enum):
    """Device hints accepted by search_engine and mapped to Bright Data's brd_mobile parameter."""

    MOBILE = "mobile"
    IOS = "ios"
    IPHONE = "iphone"
    IPAD = "ipad"
    ANDROID = "android"
    ANDROID_TABLET = "android_tablet"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class SearchEngine(str, Enum):
    """Search engines supported by the search_engine tool."""

    GOOGLE = "google"
    BING = "bing"
    YANDEX = "yandex"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class SearchType(str, Enum):
    """Google search verticals (mapped to the tbm parameter, or ibp for jobs)."""

    IMAGES = "images"
    SHOPPING = "shopping"
    NEWS = "news"
    JOBS = "jobs"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class SourceType(str, Enum):
    """Structured-data sources supported by web_data_feed.

    Each member maps to a Bright Data dataset ID in _extract_structured_data.
    """

    AMAZON_PRODUCT = "amazon_product"
    AMAZON_PRODUCT_REVIEWS = "amazon_product_reviews"
    LINKEDIN_PERSON_PROFILE = "linkedin_person_profile"
    LINKEDIN_COMPANY_PROFILE = "linkedin_company_profile"
    ZOOMINFO_COMPANY_PROFILE = "zoominfo_company_profile"
    INSTAGRAM_PROFILES = "instagram_profiles"
    INSTAGRAM_POSTS = "instagram_posts"
    INSTAGRAM_REELS = "instagram_reels"
    INSTAGRAM_COMMENTS = "instagram_comments"
    FACEBOOK_POSTS = "facebook_posts"
    FACEBOOK_MARKETPLACE_LISTINGS = "facebook_marketplace_listings"
    FACEBOOK_COMPANY_REVIEWS = "facebook_company_reviews"
    X_POSTS = "x_posts"
    ZILLOW_PROPERTIES_LISTING = "zillow_properties_listing"
    BOOKING_HOTEL_LISTINGS = "booking_hotel_listings"
    YOUTUBE_VIDEOS = "youtube_videos"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@tool(requires_secrets=["BRIGHTDATA_API_KEY", "BRIGHTDATA_ZONE"])
def scrape_as_markdown(
    context: ToolContext,
    url: Annotated[str, "URL to scrape"],
) -> Annotated[str, "Scraped webpage content as Markdown"]:
    """
    Scrape a webpage and return content in Markdown format using Bright Data.

    Examples:
        scrape_as_markdown("https://example.com") -> "# Example Page\n\nContent..."
        scrape_as_markdown("https://news.ycombinator.com") -> "# Hacker News\n..."
    """
    zone = context.get_secret("BRIGHTDATA_ZONE")
    client = BrightDataClient.create_client(
        api_key=context.get_secret("BRIGHTDATA_API_KEY"),
        zone=zone,
    )

    # "raw" format combined with the markdown data_format makes the API
    # return the rendered page converted to Markdown text.
    return client.make_request({
        "url": url,
        "zone": zone,
        "format": "raw",
        "data_format": "markdown",
    })
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@tool(requires_secrets=["BRIGHTDATA_API_KEY", "BRIGHTDATA_ZONE"])
def search_engine(  # noqa: C901
    context: ToolContext,
    query: Annotated[str, "Search query"],
    engine: Annotated[SearchEngine, "Search engine to use"] = SearchEngine.GOOGLE,
    language: Annotated[str | None, "Two-letter language code"] = None,
    country_code: Annotated[str | None, "Two-letter country code"] = None,
    search_type: Annotated[SearchType | None, "Type of search"] = None,
    start: Annotated[int | None, "Results pagination offset"] = None,
    num_results: Annotated[int, "Number of results to return. The default is 10"] = 10,
    location: Annotated[str | None, "Location for search results"] = None,
    device: Annotated[DeviceType | None, "Device type"] = None,
    return_json: Annotated[bool, "Return JSON instead of Markdown"] = False,
) -> Annotated[str, "Search results as Markdown or JSON"]:
    """
    Search using Google, Bing, or Yandex with advanced parameters using Bright Data.

    Examples:
        search_engine("climate change") -> "# Search Results\n\n## Climate Change - Wikipedia\n..."
        search_engine("Python tutorials", engine="bing", num_results=5) -> "# Bing Results\n..."
        search_engine("cats", search_type="images", country_code="us") -> "# Image Results\n..."
    """
    api_key = context.get_secret("BRIGHTDATA_API_KEY")
    zone = context.get_secret("BRIGHTDATA_ZONE")
    client = BrightDataClient.create_client(api_key=api_key, zone=zone)

    encoded_query = BrightDataClient.encode_query(query)

    # Bare SERP URL per engine; engine-specific query params are appended below.
    base_urls = {
        SearchEngine.GOOGLE: f"https://www.google.com/search?q={encoded_query}",
        SearchEngine.BING: f"https://www.bing.com/search?q={encoded_query}",
        SearchEngine.YANDEX: f"https://yandex.com/search/?text={encoded_query}",
    }

    search_url = base_urls[engine]

    # NOTE(review): all the advanced parameters below are only applied for
    # Google; language/country/device/etc. are silently ignored for Bing and
    # Yandex — confirm whether that is intended.
    if engine == SearchEngine.GOOGLE:
        params = []

        if language:
            # hl = interface language.
            params.append(f"hl={language}")

        if country_code:
            # gl = country bias for the results.
            params.append(f"gl={country_code}")

        if search_type:
            if search_type == SearchType.JOBS:
                # Jobs has its own vertical parameter instead of tbm.
                params.append("ibp=htl;jobs")
            else:
                # Map the remaining verticals to their tbm codes.
                search_types = {
                    SearchType.IMAGES: "isch",
                    SearchType.SHOPPING: "shop",
                    SearchType.NEWS: "nws",
                }
                tbm_value = search_types.get(search_type, search_type)
                params.append(f"tbm={tbm_value}")

        if start is not None:
            params.append(f"start={start}")

        if num_results:
            params.append(f"num={num_results}")

        if location:
            # uule expects a URL-encoded location string.
            params.append(f"uule={BrightDataClient.encode_query(location)}")

        if device:
            # brd_mobile: "1" means a generic mobile user agent.
            device_value = "1"

            if device.value in ["ios", "iphone"]:
                device_value = "ios"
            elif device.value == "ipad":
                device_value = "ios_tablet"
            elif device.value == "android":
                device_value = "android"
            elif device.value == "android_tablet":
                device_value = "android_tablet"

            params.append(f"brd_mobile={device_value}")

        if return_json:
            # brd_json=1 asks Bright Data to parse the SERP into JSON.
            params.append("brd_json=1")

        if params:
            search_url += "&" + "&".join(params)

    payload = {
        "url": search_url,
        "zone": zone,
        "format": "raw",
        # Skip the Markdown conversion when the caller asked for JSON output.
        "data_format": "markdown" if not return_json else "raw",
    }

    return client.make_request(payload)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
@tool(requires_secrets=["BRIGHTDATA_API_KEY"])
def web_data_feed(
    context: ToolContext,
    source_type: Annotated[SourceType, "Type of data source"],
    url: Annotated[str, "URL of the web resource to extract data from"],
    num_of_reviews: Annotated[
        int | None,
        (
            "Number of reviews to retrieve. Only applicable for "
            "facebook_company_reviews. Default is None"
        ),
    ] = None,
    timeout: Annotated[int, "Maximum time in seconds to wait for data retrieval"] = 600,
    polling_interval: Annotated[int, "Time in seconds between polling attempts"] = 1,
) -> Annotated[str, "Structured data from the requested source as JSON"]:
    """
    Extract structured data from various websites like LinkedIn, Amazon, Instagram, etc.

    NEVER MAKE UP LINKS - IF LINKS ARE NEEDED, EXECUTE search_engine FIRST.

    Supported source types:
    - amazon_product, amazon_product_reviews
    - linkedin_person_profile, linkedin_company_profile
    - zoominfo_company_profile
    - instagram_profiles, instagram_posts, instagram_reels, instagram_comments
    - facebook_posts, facebook_marketplace_listings, facebook_company_reviews
    - x_posts
    - zillow_properties_listing
    - booking_hotel_listings
    - youtube_videos

    Examples:
        web_data_feed("amazon_product", "https://amazon.com/dp/B08N5WRWNW")
            -> "{\"title\": \"Product Name\", ...}"
        web_data_feed("linkedin_person_profile", "https://linkedin.com/in/johndoe")
            -> "{\"name\": \"John Doe\", ...}"
        web_data_feed(
            "facebook_company_reviews", "https://facebook.com/company", num_of_reviews=50
        ) -> "[{\"review\": \"...\", ...}]"
    """
    api_key = context.get_secret("BRIGHTDATA_API_KEY")
    client = BrightDataClient.create_client(api_key=api_key)
    # Guard: num_of_reviews only applies to the Facebook reviews dataset; fail
    # early with a retryable error so the caller can fix the arguments.
    if num_of_reviews is not None and source_type != SourceType.FACEBOOK_COMPANY_REVIEWS:
        msg = (
            f"num_of_reviews parameter is only applicable for facebook_company_reviews, "
            f"not for {source_type.value}"
        )
        prompt = (
            "The num_of_reviews parameter should only be used with "
            "facebook_company_reviews source type."
        )
        raise RetryableToolError(msg, additional_prompt_content=prompt)
    # Trigger the dataset snapshot and poll until it is ready (or timeout).
    data = _extract_structured_data(
        client=client,
        source_type=source_type,
        url=url,
        num_of_reviews=num_of_reviews,
        timeout=timeout,
        polling_interval=polling_interval,
    )
    return json.dumps(data, indent=2)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _extract_structured_data(
    client: BrightDataClient,
    source_type: SourceType,
    url: str,
    num_of_reviews: int | None = None,
    timeout: int = 600,
    polling_interval: int = 1,
) -> dict[str, Any]:
    """
    Extract structured data from various sources.

    Triggers a Bright Data dataset snapshot for *url*, then polls the snapshot
    endpoint until the data is ready or *timeout* seconds have elapsed.

    Args:
        client: Client whose auth headers are reused for the dataset API.
        source_type: Which dataset to trigger (see the mapping below).
        url: Web resource to extract data from.
        num_of_reviews: Review count; only sent for facebook_company_reviews.
        timeout: Maximum wall-clock time in seconds to wait for the snapshot.
        polling_interval: Seconds to sleep between polling attempts.

    Returns:
        The parsed snapshot payload.

    Raises:
        RetryableToolError: If the trigger call returns no snapshot ID.
        TimeoutError: If the snapshot is not ready within *timeout* seconds.
    """
    # SourceType -> Bright Data dataset ID.
    datasets = {
        SourceType.AMAZON_PRODUCT: "gd_l7q7dkf244hwjntr0",
        SourceType.AMAZON_PRODUCT_REVIEWS: "gd_le8e811kzy4ggddlq",
        SourceType.LINKEDIN_PERSON_PROFILE: "gd_l1viktl72bvl7bjuj0",
        SourceType.LINKEDIN_COMPANY_PROFILE: "gd_l1vikfnt1wgvvqz95w",
        SourceType.ZOOMINFO_COMPANY_PROFILE: "gd_m0ci4a4ivx3j5l6nx",
        SourceType.INSTAGRAM_PROFILES: "gd_l1vikfch901nx3by4",
        SourceType.INSTAGRAM_POSTS: "gd_lk5ns7kz21pck8jpis",
        SourceType.INSTAGRAM_REELS: "gd_lyclm20il4r5helnj",
        SourceType.INSTAGRAM_COMMENTS: "gd_ltppn085pokosxh13",
        SourceType.FACEBOOK_POSTS: "gd_lyclm1571iy3mv57zw",
        SourceType.FACEBOOK_MARKETPLACE_LISTINGS: "gd_lvt9iwuh6fbcwmx1a",
        SourceType.FACEBOOK_COMPANY_REVIEWS: "gd_m0dtqpiu1mbcyc2g86",
        SourceType.X_POSTS: "gd_lwxkxvnf1cynvib9co",
        SourceType.ZILLOW_PROPERTIES_LISTING: "gd_lfqkr8wm13ixtbd8f5",
        SourceType.BOOKING_HOTEL_LISTINGS: "gd_m5mbdl081229ln6t4a",
        # NOTE(review): identical to the booking_hotel_listings dataset ID —
        # looks like a copy/paste slip; confirm the real YouTube dataset ID.
        SourceType.YOUTUBE_VIDEOS: "gd_m5mbdl081229ln6t4a",
    }

    dataset_id = datasets[source_type]

    request_data: dict[str, str] = {"url": url}
    if source_type == SourceType.FACEBOOK_COMPANY_REVIEWS and num_of_reviews is not None:
        request_data["num_of_reviews"] = str(num_of_reviews)

    trigger_response = requests.post(
        "https://api.brightdata.com/datasets/v3/trigger",
        params={"dataset_id": dataset_id, "include_errors": "true"},
        headers=client.headers,
        json=[request_data],
        timeout=30,
    )

    trigger_data = trigger_response.json()
    if not trigger_data.get("snapshot_id"):
        msg = "No snapshot ID returned from trigger request"
        prompt = "Invalid input provided, use search_engine to get the relevant data first"
        raise RetryableToolError(msg, additional_prompt_content=prompt)

    snapshot_id = trigger_data["snapshot_id"]

    # Fix: the previous loop counted *attempts*, so the effective timeout was
    # ``timeout * polling_interval`` seconds whenever polling_interval != 1.
    # Track wall-clock time instead so ``timeout`` really is in seconds.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            snapshot_response = requests.get(
                f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}",
                params={"format": "json"},
                headers=client.headers,
                timeout=30,
            )
            snapshot_data = cast(dict[str, Any], snapshot_response.json())
        except (requests.RequestException, ValueError):
            # Transient network or JSON-decode failure: retry until deadline.
            # (requests' JSONDecodeError subclasses ValueError.)
            time.sleep(polling_interval)
            continue

        if isinstance(snapshot_data, dict) and snapshot_data.get("status") in (
            "running",
            "building",
        ):
            # Snapshot still being produced; wait and poll again.
            time.sleep(polling_interval)
            continue
        return snapshot_data

    msg = f"Timeout after {timeout} seconds waiting for {source_type.value} data"
    raise TimeoutError(msg)
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = [ "hatchling",]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "arcade_brightdata"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Search, Crawl and Scrape any site, at scale, without getting blocked"
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"arcade-tdk>=3.0.0,<4.0.0",
|
|
12
|
+
"requests>=2.32.5",
|
|
13
|
+
]
|
|
14
|
+
[[project.authors]]
|
|
15
|
+
name = "meirk-brd"
|
|
16
|
+
email = "meirk@brightdata.com"
|
|
17
|
+
|
|
18
|
+
[project.optional-dependencies]
|
|
19
|
+
dev = [
|
|
20
|
+
"arcade-mcp[all]>=1.2.0,<2.0.0",
|
|
21
|
+
"arcade-serve>=3.0.0,<4.0.0",
|
|
22
|
+
"pytest>=8.3.0,<8.4.0",
|
|
23
|
+
"pytest-cov>=4.0.0,<4.1.0",
|
|
24
|
+
"pytest-mock>=3.11.1,<3.12.0",
|
|
25
|
+
"pytest-asyncio>=0.24.0,<0.25.0",
|
|
26
|
+
"mypy>=1.5.1,<1.6.0",
|
|
27
|
+
"pre-commit>=3.4.0,<3.5.0",
|
|
28
|
+
"tox>=4.11.1,<4.12.0",
|
|
29
|
+
"ruff>=0.7.4,<0.8.0",
|
|
30
|
+
"types-requests>=2.32.0",
|
|
31
|
+
]
|
|
32
|
+
# Tell Arcade.dev that this package is a toolkit
|
|
33
|
+
[project.entry-points.arcade_toolkits]
|
|
34
|
+
toolkit_name = "arcade_brightdata"
|
|
35
|
+
|
|
36
|
+
[tool.mypy]
|
|
37
|
+
files = [ "arcade_brightdata/**/*.py",]
|
|
38
|
+
python_version = "3.10"
|
|
39
|
+
disallow_untyped_defs = "True"
|
|
40
|
+
disallow_any_unimported = "True"
|
|
41
|
+
no_implicit_optional = "True"
|
|
42
|
+
check_untyped_defs = "True"
|
|
43
|
+
warn_return_any = "True"
|
|
44
|
+
warn_unused_ignores = "True"
|
|
45
|
+
show_error_codes = "True"
|
|
46
|
+
ignore_missing_imports = "True"
|
|
47
|
+
|
|
48
|
+
[tool.uv.sources]
|
|
49
|
+
arcade-mcp = { path = "../../", editable = true }
|
|
50
|
+
arcade-serve = { path = "../../libs/arcade-serve/", editable = true }
|
|
51
|
+
arcade-tdk = { path = "../../libs/arcade-tdk/", editable = true }
|
|
52
|
+
|
|
53
|
+
[tool.pytest.ini_options]
|
|
54
|
+
testpaths = [ "tests",]
|
|
55
|
+
|
|
56
|
+
[tool.coverage.report]
|
|
57
|
+
skip_empty = true
|
|
58
|
+
|
|
59
|
+
[tool.hatch.build.targets.wheel]
|
|
60
|
+
packages = [ "arcade_brightdata",]
|
|
File without changes
|
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
from os import environ
|
|
2
|
+
from unittest.mock import Mock, patch
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
import requests
|
|
6
|
+
from arcade_tdk import ToolContext, ToolSecretItem
|
|
7
|
+
from arcade_tdk.errors import ToolExecutionError
|
|
8
|
+
|
|
9
|
+
from arcade_brightdata.bright_data_client import BrightDataClient
|
|
10
|
+
from arcade_brightdata.tools.bright_data_tools import (
|
|
11
|
+
DeviceType,
|
|
12
|
+
SourceType,
|
|
13
|
+
scrape_as_markdown,
|
|
14
|
+
search_engine,
|
|
15
|
+
web_data_feed,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
# Secrets fed into the mocked ToolContext; real values can be supplied via the
# environment for live integration runs, otherwise harmless placeholders are used.
BRIGHTDATA_API_KEY = environ.get("TEST_BRIGHTDATA_API_KEY") or "api-key"
BRIGHTDATA_ZONE = environ.get("TEST_BRIGHTDATA_ZONE") or "unblocker"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@pytest.fixture
def mock_context():
    """Build a ToolContext carrying the Bright Data API key and zone secrets."""
    context = ToolContext()
    context.secrets = []
    context.secrets.append(ToolSecretItem(key="BRIGHTDATA_API_KEY", value=BRIGHTDATA_API_KEY))
    context.secrets.append(ToolSecretItem(key="BRIGHTDATA_ZONE", value=BRIGHTDATA_ZONE))
    return context
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@pytest.fixture(autouse=True)
def cleanup_engines():
    """Clean up bright data clients after each test to prevent connection leaks."""
    # Run the test body first; everything after the yield is teardown.
    yield
    # Drop every cached BrightDataClient so cached instances never bleed
    # between tests (the client class memoizes by (api_key, zone)).
    BrightDataClient.clear_cache()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class TestBrightDataClient:
    """Unit tests for BrightDataClient caching, query encoding, and request handling."""

    def test_get_instance_creates_new_client(self):
        first = BrightDataClient.create_client("test_key_1", "zone1")
        second = BrightDataClient.create_client("test_key_2", "zone2")

        # Different credentials must yield distinct clients with their own config.
        assert first != second
        assert (first.api_key, first.zone) == ("test_key_1", "zone1")
        assert (second.api_key, second.zone) == ("test_key_2", "zone2")

    def test_get_instance_returns_cached_client(self):
        # Identical credentials return the very same cached object.
        cached = BrightDataClient.create_client("test_key", "zone1")
        assert BrightDataClient.create_client("test_key", "zone1") is cached

    def test_clear_cache(self):
        before = BrightDataClient.create_client("test_key", "zone1")
        BrightDataClient.clear_cache()
        after = BrightDataClient.create_client("test_key", "zone1")

        # After clearing, the same credentials produce a fresh instance.
        assert before is not after

    def test_encode_query(self):
        assert BrightDataClient.encode_query("hello world test") == "hello%20world%20test"

    @patch("requests.post")
    def test_make_request_success(self, mock_post):
        ok_response = Mock(status_code=200, text="Success response")
        mock_post.return_value = ok_response

        client = BrightDataClient("test_key", "test_zone")
        assert client.make_request({"url": "https://example.com"}) == "Success response"
        mock_post.assert_called_once()

    @patch("requests.post")
    def test_make_request_failure(self, mock_post):
        bad_response = Mock(status_code=400, text="Bad Request")
        bad_response.raise_for_status.side_effect = requests.exceptions.HTTPError(
            "400 Client Error"
        )
        mock_post.return_value = bad_response

        client = BrightDataClient("test_key", "test_zone")

        # HTTP errors from the API must propagate to the caller.
        with pytest.raises(requests.exceptions.HTTPError):
            client.make_request({"url": "https://example.com"})
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class TestScrapeAsMarkdown:
    """Tests for the scrape_as_markdown tool with the Bright Data client mocked out."""

    @patch("arcade_brightdata.tools.bright_data_tools.BrightDataClient")
    def test_scrape_as_markdown_success(self, mock_client_cls, mock_context):
        fake_client = Mock()
        fake_client.make_request.return_value = "# Test Page\n\nContent here"
        mock_client_cls.create_client.return_value = fake_client

        markdown = scrape_as_markdown(mock_context, "https://example.com")

        assert markdown == "# Test Page\n\nContent here"
        # The client is built from the secrets carried by the tool context.
        mock_client_cls.create_client.assert_called_once_with(
            api_key=BRIGHTDATA_API_KEY, zone=BRIGHTDATA_ZONE
        )
        expected_payload = {
            "url": "https://example.com",
            "zone": BRIGHTDATA_ZONE,
            "format": "raw",
            "data_format": "markdown",
        }
        fake_client.make_request.assert_called_once_with(expected_payload)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class TestSearchEngine:
    """Tests for the search_engine tool across engines, parameters, and error paths."""

    @patch("arcade_brightdata.tools.bright_data_tools.BrightDataClient")
    def test_search_engine_google_basic(self, mock_client_cls, mock_context):
        fake_client = Mock()
        fake_client.make_request.return_value = "# Search Results\n\nResult 1\nResult 2"
        mock_client_cls.create_client.return_value = fake_client
        mock_client_cls.encode_query.return_value = "test%20query"

        markdown = search_engine(mock_context, "test query")

        assert markdown == "# Search Results\n\nResult 1\nResult 2"
        mock_client_cls.create_client.assert_called_once_with(
            api_key=BRIGHTDATA_API_KEY, zone=BRIGHTDATA_ZONE
        )

    @patch("arcade_brightdata.tools.bright_data_tools.BrightDataClient")
    def test_search_engine_bing(self, mock_client_cls, mock_context):
        fake_client = Mock()
        fake_client.make_request.return_value = "# Bing Results"
        mock_client_cls.create_client.return_value = fake_client
        mock_client_cls.encode_query.return_value = "test%20query"

        markdown = search_engine(mock_context, "test query", engine="bing")

        assert markdown == "# Bing Results"
        # Non-Google engines build a plain search URL with no extra parameters.
        fake_client.make_request.assert_called_once_with({
            "url": "https://www.bing.com/search?q=test%20query",
            "zone": BRIGHTDATA_ZONE,
            "format": "raw",
            "data_format": "markdown",
        })

    @patch("arcade_brightdata.tools.bright_data_tools.BrightDataClient")
    def test_search_engine_google_with_parameters(self, mock_client_cls, mock_context):
        fake_client = Mock()
        fake_client.make_request.return_value = "# Google Results with params"
        mock_client_cls.create_client.return_value = fake_client
        mock_client_cls.encode_query.side_effect = lambda q: q.replace(" ", "%20")

        markdown = search_engine(
            mock_context,
            "test query",
            language="en",
            country_code="us",
            search_type="images",
            start=10,
            num_results=20,
            location="New York",
            device=DeviceType.MOBILE,
            return_json=True,
        )

        assert markdown == "# Google Results with params"
        payload = fake_client.make_request.call_args[0][0]

        # Every optional argument must surface as the matching URL query parameter.
        for fragment in (
            "hl=en",
            "gl=us",
            "tbm=isch",
            "start=10",
            "num=20",
            "brd_mobile=1",
            "brd_json=1",
        ):
            assert fragment in payload["url"]
        # JSON output is requested via the URL, so the data_format stays raw.
        assert payload["data_format"] == "raw"

    def test_search_engine_invalid_engine(self, mock_context):
        with pytest.raises(ToolExecutionError):
            search_engine(mock_context, "test query", engine="invalid_engine")

    @patch("arcade_brightdata.tools.bright_data_tools.BrightDataClient")
    def test_search_engine_google_jobs(self, mock_client_cls, mock_context):
        fake_client = Mock()
        fake_client.make_request.return_value = "# Job Results"
        mock_client_cls.create_client.return_value = fake_client
        mock_client_cls.encode_query.return_value = "python%20developer"

        markdown = search_engine(mock_context, "python developer", search_type="jobs")

        assert markdown == "# Job Results"
        payload = fake_client.make_request.call_args[0][0]
        # Google Jobs uses the special ibp parameter rather than tbm.
        assert "ibp=htl;jobs" in payload["url"]
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class TestWebDataFeed:
    """Tests for the web_data_feed tool with the extraction helper mocked out."""

    @patch("arcade_brightdata.tools.bright_data_tools._extract_structured_data")
    @patch("arcade_brightdata.tools.bright_data_tools.BrightDataClient")
    def test_web_data_feed_success(self, mock_client_cls, mock_extract, mock_context):
        fake_client = Mock()
        mock_client_cls.create_client.return_value = fake_client
        mock_extract.return_value = {"title": "Test Product", "price": "$19.99"}

        result = web_data_feed(mock_context, "amazon_product", "https://amazon.com/dp/B08N5WRWNW")

        # The structured payload is serialized to pretty-printed JSON.
        assert result == '{\n "title": "Test Product",\n "price": "$19.99"\n}'

        # web_data_feed does not need a zone, only the API key.
        mock_client_cls.create_client.assert_called_once_with(api_key=BRIGHTDATA_API_KEY)
        # Defaults: no review limit, 600s timeout, 1s polling interval.
        mock_extract.assert_called_once_with(
            client=fake_client,
            source_type=SourceType.AMAZON_PRODUCT,
            url="https://amazon.com/dp/B08N5WRWNW",
            num_of_reviews=None,
            timeout=600,
            polling_interval=1,
        )

    @patch("arcade_brightdata.tools.bright_data_tools._extract_structured_data")
    @patch("arcade_brightdata.tools.bright_data_tools.BrightDataClient")
    def test_web_data_feed_with_reviews(self, mock_client_cls, mock_extract, mock_context):
        fake_client = Mock()
        mock_client_cls.create_client.return_value = fake_client
        mock_extract.return_value = [{"review": "Great product!", "rating": 5}]

        result = web_data_feed(
            mock_context,
            "facebook_company_reviews",
            "https://facebook.com/company",
            num_of_reviews=50,
            timeout=300,
            polling_interval=2,
        )

        assert result == '[\n {\n "review": "Great product!",\n "rating": 5\n }\n]'

        # Explicit arguments are forwarded verbatim to the extraction helper.
        mock_extract.assert_called_once_with(
            client=fake_client,
            source_type=SourceType.FACEBOOK_COMPANY_REVIEWS,
            url="https://facebook.com/company",
            num_of_reviews=50,
            timeout=300,
            polling_interval=2,
        )
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
class TestExtractStructuredData:
    """Tests for the _extract_structured_data trigger-and-poll helper over mocked HTTP."""

    @patch("requests.get")
    @patch("requests.post")
    def test_extract_structured_data_success(self, mock_post, mock_get):
        from arcade_brightdata.tools.bright_data_tools import _extract_structured_data

        client = BrightDataClient("test_key", "test_zone")

        trigger = Mock()
        trigger.json.return_value = {"snapshot_id": "snap_123"}
        mock_post.return_value = trigger

        snapshot = Mock()
        snapshot.json.return_value = {"data": "extracted_data"}
        mock_get.return_value = snapshot

        result = _extract_structured_data(
            client=client,
            source_type=SourceType.AMAZON_PRODUCT,
            url="https://amazon.com/dp/TEST",
            timeout=10,
            polling_interval=0.1,
        )

        assert result == {"data": "extracted_data"}

        # The trigger request targets the Amazon product dataset.
        mock_post.assert_called_once()
        assert "gd_l7q7dkf244hwjntr0" in str(mock_post.call_args)  # Amazon product dataset ID

        # One poll suffices because the snapshot is ready immediately.
        mock_get.assert_called_once()
        assert "snap_123" in str(mock_get.call_args)

    @patch("requests.get")
    @patch("requests.post")
    def test_extract_structured_data_with_polling(self, mock_post, mock_get):
        from arcade_brightdata.tools.bright_data_tools import _extract_structured_data

        client = BrightDataClient("test_key", "test_zone")

        trigger = Mock()
        trigger.json.return_value = {"snapshot_id": "snap_123"}
        mock_post.return_value = trigger

        still_running = Mock()
        still_running.json.return_value = {"status": "running"}
        finished = Mock()
        finished.json.return_value = {"data": "final_data"}
        # First poll reports "running"; the second delivers the payload.
        mock_get.side_effect = [still_running, finished]

        result = _extract_structured_data(
            client=client,
            source_type=SourceType.LINKEDIN_PERSON_PROFILE,
            url="https://linkedin.com/in/test",
            timeout=10,
            polling_interval=0.1,
        )

        assert result == {"data": "final_data"}
        assert mock_get.call_count == 2

    @patch("requests.post")
    def test_extract_structured_data_invalid_source_type(self, mock_post):
        from arcade_brightdata.tools.bright_data_tools import _extract_structured_data

        client = BrightDataClient("test_key", "test_zone")

        # Stand-in source type whose value is absent from the datasets mapping.
        class InvalidSourceType:
            value = "invalid_source"

        with pytest.raises(KeyError):
            _extract_structured_data(
                client=client, source_type=InvalidSourceType(), url="https://example.com"
            )

    @patch("requests.get")
    @patch("requests.post")
    def test_extract_structured_data_no_snapshot_id(self, mock_post, mock_get):
        from arcade_brightdata.tools.bright_data_tools import _extract_structured_data

        client = BrightDataClient("test_key", "test_zone")

        # Trigger response that lacks a snapshot_id entirely.
        trigger = Mock()
        trigger.json.return_value = {}
        mock_post.return_value = trigger

        with pytest.raises(Exception) as exc_info:
            _extract_structured_data(
                client=client,
                source_type=SourceType.AMAZON_PRODUCT,
                url="https://amazon.com/dp/TEST",
            )

        assert "No snapshot ID returned from trigger request" in str(exc_info.value)

    @patch("requests.get")
    @patch("requests.post")
    @patch("time.sleep")
    def test_extract_structured_data_timeout(self, mock_sleep, mock_post, mock_get):
        from arcade_brightdata.tools.bright_data_tools import _extract_structured_data

        client = BrightDataClient("test_key", "test_zone")

        trigger = Mock()
        trigger.json.return_value = {"snapshot_id": "snap_123"}
        mock_post.return_value = trigger

        # Snapshot never leaves the "running" state, forcing the timeout path.
        # time.sleep is patched so the test itself does not wait.
        stuck = Mock()
        stuck.json.return_value = {"status": "running"}
        mock_get.return_value = stuck

        with pytest.raises(TimeoutError) as exc_info:
            _extract_structured_data(
                client=client,
                source_type=SourceType.AMAZON_PRODUCT,
                url="https://amazon.com/dp/TEST",
                timeout=2,
                polling_interval=0.1,
            )

        assert "Timeout after 2 seconds waiting for amazon_product data" in str(exc_info.value)
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
class TestIntegration:
    """Integration tests that test the full flow without mocking internal components."""

    @patch("requests.post")
    def test_scrape_as_markdown_integration(self, mock_post, mock_context):
        ok_response = Mock(status_code=200, text="# Integration Test\n\nThis is a test page")
        mock_post.return_value = ok_response

        markdown = scrape_as_markdown(mock_context, "https://example.com")

        assert markdown == "# Integration Test\n\nThis is a test page"

        # The real client must hit the Bright Data endpoint with the bearer token.
        call_args = mock_post.call_args
        assert call_args[1]["headers"]["Authorization"] == f"Bearer {BRIGHTDATA_API_KEY}"
        assert "https://api.brightdata.com/request" in str(call_args)

    @patch("requests.post")
    def test_search_engine_integration(self, mock_post, mock_context):
        ok_response = Mock(
            status_code=200, text="# Search Results\n\n1. First result\n2. Second result"
        )
        mock_post.return_value = ok_response

        markdown = search_engine(mock_context, "test query", engine="google")

        assert markdown == "# Search Results\n\n1. First result\n2. Second result"

        # The serialized request body carries the Google search URL and format.
        body = mock_post.call_args[1]["data"]
        assert '"url": "https://www.google.com/search?q=test%20query' in body
        assert '"data_format": "markdown"' in body
|