langchain-mrscraper 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *.pyo
+ *.pyd
+ .Python
+ *.egg
+ *.egg-info/
+ dist/
+ build/
+ eggs/
+ parts/
+ var/
+ sdist/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.manifest
+ *.spec
+ debug_dir/
+ debug_results/
+ .ruff_cache/
+
+ # Virtual environments
+ .venv/
+ venv/
+ env/
+ ENV/
+
+ # Distribution / packaging
+ .eggs/
+ *.tar.gz
+
+ # Unit test / coverage
+ .coverage
+ .coverage.*
+ htmlcov/
+ .cache
+ .pytest_cache/
+ pytest.log
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+
+ # Type checking
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+ .pytype/
+ .pyre/
+
+ # IDEs
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+ *~
+
+ # macOS
+ .DS_Store
+
+ # Environment files
+ .env
+ .env.*
+ !.env.example
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 MrScraper
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,140 @@
+ Metadata-Version: 2.4
+ Name: langchain-mrscraper
+ Version: 0.1.0
+ Summary: LangChain tools for the MrScraper web-scraping API
+ Project-URL: Homepage, https://mrscraper.com
+ Project-URL: Documentation, https://docs.mrscraper.com
+ Project-URL: Repository, https://github.com/mrscraper/langchain-mrscraper
+ Author: Riandra Diva Auzan, R&D Team MrScraper
+ License: MIT
+ License-File: LICENSE
+ Keywords: langchain,llm,mrscraper,scraping,tools,web scraping
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Topic :: Internet :: WWW/HTTP
+ Classifier: Typing :: Typed
+ Requires-Python: >=3.9
+ Requires-Dist: langchain-core>=0.3.0
+ Requires-Dist: mrscraper-sdk>=0.1.2
+ Provides-Extra: dev
+ Requires-Dist: langchain-tests>=0.3.0; extra == 'dev'
+ Requires-Dist: mypy>=1.10; extra == 'dev'
+ Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
+ Requires-Dist: pytest>=8; extra == 'dev'
+ Requires-Dist: ruff>=0.4; extra == 'dev'
+ Provides-Extra: test
+ Requires-Dist: langchain-tests>=0.3.0; extra == 'test'
+ Requires-Dist: pytest-asyncio>=0.24; extra == 'test'
+ Requires-Dist: pytest>=8; extra == 'test'
+ Description-Content-Type: text/markdown
+
+ # langchain-mrscraper
+
+ LangChain integration package for the [MrScraper SDK](https://pypi.org/project/mrscraper-sdk/).
+
+ This package exposes MrScraper capabilities as LangChain tools so agents can:
+
+ - Fetch rendered HTML from protected websites
+ - Create AI scrapers from natural-language prompts
+ - Rerun AI/manual scrapers (single and bulk)
+ - List and fetch scraping results
+
+ ## Installation
+
+ ```bash
+ pip install -U langchain-mrscraper
+ ```
+
+ or:
+
+ ```bash
+ uv add langchain-mrscraper
+ ```
+
+ `mrscraper-sdk` is installed automatically as a dependency, so users do not need to install it separately.
+
+ ## Quick start
+
+ ```python
+ import os
+ from langchain_mrscraper import MrScraperToolkit
+
+ os.environ["MRSCRAPER_API_KEY"] = "your-token"
+
+ tools = MrScraperToolkit().get_tools()
+ ```
+
+ ## Use with an agent
+
+ ```python
+ from langgraph.prebuilt import create_react_agent
+ from langchain_openai import ChatOpenAI
+ from langchain_mrscraper import MrScraperToolkit
+
+ tools = MrScraperToolkit(token="your-token").get_tools()
+ agent = create_react_agent(ChatOpenAI(model="gpt-4o-mini"), tools)
+ ```
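+
+ `langgraph` and `langchain-openai` are separate packages; install them before running this example.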
+
+ ## Available tools
+
+ - `mrscraper_fetch_html`
+ - `mrscraper_create_scraper`
+ - `mrscraper_rerun_scraper`
+ - `mrscraper_bulk_rerun_ai_scraper`
+ - `mrscraper_rerun_manual_scraper`
+ - `mrscraper_bulk_rerun_manual_scraper`
+ - `mrscraper_get_all_results`
+ - `mrscraper_get_result_by_id`
+
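+ Each tool is a standard LangChain tool and can also be invoked on its own. A minimal sketch (the URL and API key are placeholders, and `MRSCRAPER_API_KEY` is assumed to be set):
+
+ ```python
+ from langchain_mrscraper import MrScraperFetchHTML
+
+ # Tools are Runnables, so `invoke` accepts the schema fields as a dict.
+ fetch = MrScraperFetchHTML()
+ html_json = fetch.invoke({"url": "https://example.com", "geo_code": "US"})
+ print(html_json)  # JSON-serialized API response
+ ```
+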
+ ## API styles
+
+ You can initialize the tools in any of the following ways:
+
+ - `MrScraperToolkit(...).get_tools()` (recommended)
+ - `load_mrscraper_tools(...)` convenience function
+ - per-tool constructors with `token="..."` or `mrscraper_api_key="..."`
+ - environment variables `MRSCRAPER_API_KEY` (preferred) or `MRSCRAPER_API_TOKEN`
+
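+ For example, a short sketch of the convenience function and a per-tool constructor (the token values are placeholders):
+
+ ```python
+ from langchain_mrscraper import MrScraperGetResultById, load_mrscraper_tools
+
+ # Build a subset of the tools against one shared client.
+ tools = load_mrscraper_tools(
+     token="your-token",
+     tool_names=["mrscraper_fetch_html", "mrscraper_get_all_results"],
+ )
+
+ # Or construct a single tool directly.
+ get_result = MrScraperGetResultById(mrscraper_api_key="your-token")
+ ```
+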
+ ## Tools vs. loaders
+
+ This integration is intentionally tools-first. MrScraper endpoints are action-oriented
+ (fetch, create, rerun, list, retrieve) and are best represented as `BaseTool`
+ implementations that agents can call explicitly.
+
+ A document loader abstraction is usually better when the primary job is deterministic
+ "URL -> documents" ingestion into vector stores. MrScraper can support that in a
+ separate package later, but this package should remain focused on agent tools.
+
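+ Until then, a loader-style flow can be approximated by wrapping the fetch tool. A hypothetical sketch (`fetch_as_documents` is not part of this package):
+
+ ```python
+ import json
+
+ from langchain_core.documents import Document
+ from langchain_mrscraper import MrScraperFetchHTML
+
+
+ def fetch_as_documents(urls: list[str]) -> list[Document]:
+     """Hypothetical helper: fetch rendered HTML and wrap it as Documents."""
+     fetch = MrScraperFetchHTML()
+     docs = []
+     for url in urls:
+         # The tool returns a JSON string; keep the raw payload as page content.
+         payload = json.loads(fetch.invoke({"url": url}))
+         docs.append(Document(page_content=str(payload), metadata={"source": url}))
+     return docs
+ ```
+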
+ ## Testing
+
+ ```bash
+ pytest tests/unit_tests -v
+ ```
+
+ Integration smoke tests (real API):
+
+ ```bash
+ MRSCRAPER_API_KEY=your-token pytest tests/integration_tests -m integration -v
+ ```
+
+ ## Local release workflow
+
+ 1. Update `version` in `pyproject.toml`
+ 2. Build: `python -m build`
+ 3. Upload to TestPyPI: `twine upload --repository testpypi dist/*`
+ 4. Verify the install from TestPyPI
+ 5. Upload to PyPI: `twine upload dist/*`
+
+ ## Docs files for LangChain PR
+
+ - Provider page: `docs/providers/mrscraper.mdx`
+ - Tool pages: `docs/tools/*.mdx` (one page per tool)
+
+ These are prepared for submission to the `langchain-ai/docs` repository.
@@ -0,0 +1,103 @@
+ # langchain-mrscraper
+
+ LangChain integration package for the [MrScraper SDK](https://pypi.org/project/mrscraper-sdk/).
+
+ This package exposes MrScraper capabilities as LangChain tools so agents can:
+
+ - Fetch rendered HTML from protected websites
+ - Create AI scrapers from natural-language prompts
+ - Rerun AI/manual scrapers (single and bulk)
+ - List and fetch scraping results
+
+ ## Installation
+
+ ```bash
+ pip install -U langchain-mrscraper
+ ```
+
+ or:
+
+ ```bash
+ uv add langchain-mrscraper
+ ```
+
+ `mrscraper-sdk` is installed automatically as a dependency, so users do not need to install it separately.
+
+ ## Quick start
+
+ ```python
+ import os
+ from langchain_mrscraper import MrScraperToolkit
+
+ os.environ["MRSCRAPER_API_KEY"] = "your-token"
+
+ tools = MrScraperToolkit().get_tools()
+ ```
+
+ ## Use with an agent
+
+ ```python
+ from langgraph.prebuilt import create_react_agent
+ from langchain_openai import ChatOpenAI
+ from langchain_mrscraper import MrScraperToolkit
+
+ tools = MrScraperToolkit(token="your-token").get_tools()
+ agent = create_react_agent(ChatOpenAI(model="gpt-4o-mini"), tools)
+ ```
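+
+ `langgraph` and `langchain-openai` are separate packages; install them before running this example.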
+
+ ## Available tools
+
+ - `mrscraper_fetch_html`
+ - `mrscraper_create_scraper`
+ - `mrscraper_rerun_scraper`
+ - `mrscraper_bulk_rerun_ai_scraper`
+ - `mrscraper_rerun_manual_scraper`
+ - `mrscraper_bulk_rerun_manual_scraper`
+ - `mrscraper_get_all_results`
+ - `mrscraper_get_result_by_id`
+
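+ Each tool is a standard LangChain tool and can also be invoked on its own. A minimal sketch (the URL and API key are placeholders, and `MRSCRAPER_API_KEY` is assumed to be set):
+
+ ```python
+ from langchain_mrscraper import MrScraperFetchHTML
+
+ # Tools are Runnables, so `invoke` accepts the schema fields as a dict.
+ fetch = MrScraperFetchHTML()
+ html_json = fetch.invoke({"url": "https://example.com", "geo_code": "US"})
+ print(html_json)  # JSON-serialized API response
+ ```
+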
+ ## API styles
+
+ You can initialize the tools in any of the following ways:
+
+ - `MrScraperToolkit(...).get_tools()` (recommended)
+ - `load_mrscraper_tools(...)` convenience function
+ - per-tool constructors with `token="..."` or `mrscraper_api_key="..."`
+ - environment variables `MRSCRAPER_API_KEY` (preferred) or `MRSCRAPER_API_TOKEN`
+
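+ For example, a short sketch of the convenience function and a per-tool constructor (the token values are placeholders):
+
+ ```python
+ from langchain_mrscraper import MrScraperGetResultById, load_mrscraper_tools
+
+ # Build a subset of the tools against one shared client.
+ tools = load_mrscraper_tools(
+     token="your-token",
+     tool_names=["mrscraper_fetch_html", "mrscraper_get_all_results"],
+ )
+
+ # Or construct a single tool directly.
+ get_result = MrScraperGetResultById(mrscraper_api_key="your-token")
+ ```
+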
+ ## Tools vs. loaders
+
+ This integration is intentionally tools-first. MrScraper endpoints are action-oriented
+ (fetch, create, rerun, list, retrieve) and are best represented as `BaseTool`
+ implementations that agents can call explicitly.
+
+ A document loader abstraction is usually better when the primary job is deterministic
+ "URL -> documents" ingestion into vector stores. MrScraper can support that in a
+ separate package later, but this package should remain focused on agent tools.
+
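+ Until then, a loader-style flow can be approximated by wrapping the fetch tool. A hypothetical sketch (`fetch_as_documents` is not part of this package):
+
+ ```python
+ import json
+
+ from langchain_core.documents import Document
+ from langchain_mrscraper import MrScraperFetchHTML
+
+
+ def fetch_as_documents(urls: list[str]) -> list[Document]:
+     """Hypothetical helper: fetch rendered HTML and wrap it as Documents."""
+     fetch = MrScraperFetchHTML()
+     docs = []
+     for url in urls:
+         # The tool returns a JSON string; keep the raw payload as page content.
+         payload = json.loads(fetch.invoke({"url": url}))
+         docs.append(Document(page_content=str(payload), metadata={"source": url}))
+     return docs
+ ```
+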
+ ## Testing
+
+ ```bash
+ pytest tests/unit_tests -v
+ ```
+
+ Integration smoke tests (real API):
+
+ ```bash
+ MRSCRAPER_API_KEY=your-token pytest tests/integration_tests -m integration -v
+ ```
+
+ ## Local release workflow
+
+ 1. Update `version` in `pyproject.toml`
+ 2. Build: `python -m build`
+ 3. Upload to TestPyPI: `twine upload --repository testpypi dist/*`
+ 4. Verify the install from TestPyPI
+ 5. Upload to PyPI: `twine upload dist/*`
+
+ ## Docs files for LangChain PR
+
+ - Provider page: `docs/providers/mrscraper.mdx`
+ - Tool pages: `docs/tools/*.mdx` (one page per tool)
+
+ These are prepared for submission to the `langchain-ai/docs` repository.
@@ -0,0 +1,70 @@
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [project]
+ name = "langchain-mrscraper"
+ version = "0.1.0"
+ description = "LangChain tools for the MrScraper web-scraping API"
+ readme = "README.md"
+ license = { text = "MIT" }
+ requires-python = ">=3.9"
+ authors = [
+     { name = "Riandra Diva Auzan" },
+     { name = "R&D Team MrScraper" }
+ ]
+ keywords = ["langchain", "scraping", "web scraping", "mrscraper", "tools", "llm"]
+ classifiers = [
+     "Development Status :: 4 - Beta",
+     "Intended Audience :: Developers",
+     "License :: OSI Approved :: MIT License",
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.9",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Programming Language :: Python :: 3.13",
+     "Topic :: Internet :: WWW/HTTP",
+     "Typing :: Typed",
+ ]
+ dependencies = [
+     "mrscraper-sdk>=0.1.2",
+     "langchain-core>=0.3.0",
+ ]
+
+ [project.optional-dependencies]
+ test = [
+     "pytest>=8",
+     "pytest-asyncio>=0.24",
+     "langchain-tests>=0.3.0",
+ ]
+ dev = [
+     "pytest>=8",
+     "pytest-asyncio>=0.24",
+     "langchain-tests>=0.3.0",
+     "ruff>=0.4",
+     "mypy>=1.10",
+ ]
+
+ [project.urls]
+ Homepage = "https://mrscraper.com"
+ Documentation = "https://docs.mrscraper.com"
+ Repository = "https://github.com/mrscraper/langchain-mrscraper"
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/langchain_mrscraper"]
+
+ [tool.hatch.build.targets.sdist]
+ include = [
+     "src/",
+     "README.md",
+     "LICENSE",
+     "pyproject.toml",
+ ]
+
+ [tool.pytest.ini_options]
+ asyncio_mode = "auto"
+ testpaths = ["tests"]
+ markers = [
+     "integration: marks tests as integration tests (hit real API)",
+ ]
@@ -0,0 +1,34 @@
+ """LangChain integration package for MrScraper tools."""
+
+ from importlib.metadata import PackageNotFoundError, version
+
+ from .tools import (
+     MrScraperBulkRerunAIScraper,
+     MrScraperBulkRerunManualScraper,
+     MrScraperCreateScraper,
+     MrScraperFetchHTML,
+     MrScraperGetAllResults,
+     MrScraperGetResultById,
+     MrScraperRerunManualScraper,
+     MrScraperRerunScraper,
+     MrScraperToolkit,
+     load_mrscraper_tools,
+ )
+
+ __all__ = [
+     "load_mrscraper_tools",
+     "MrScraperToolkit",
+     "MrScraperFetchHTML",
+     "MrScraperCreateScraper",
+     "MrScraperRerunScraper",
+     "MrScraperBulkRerunAIScraper",
+     "MrScraperRerunManualScraper",
+     "MrScraperBulkRerunManualScraper",
+     "MrScraperGetAllResults",
+     "MrScraperGetResultById",
+ ]
+
+ try:
+     __version__ = version("langchain-mrscraper")
+ except PackageNotFoundError:
+     __version__ = "0.0.0"
@@ -0,0 +1,368 @@
+ """LangChain tools for the MrScraper web scraping API."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import os
+ from concurrent.futures import ThreadPoolExecutor
+ from typing import Any, Literal, Optional, Sequence, Type
+
+ from langchain_core.tools import BaseTool
+ from pydantic import BaseModel, Field
+
+ from mrscraper import MrScraper
+
+
+ def _serialize_response(result: dict[str, Any]) -> str:
+     """Convert an API response dict to tool output text."""
+     return json.dumps(result, indent=2, default=str)
+
+
+ def _run_coro_sync(coro: Any) -> Any:
+     """Run a coroutine from a sync context, even when an event loop is already running."""
+     try:
+         asyncio.get_running_loop()
+     except RuntimeError:
+         return asyncio.run(coro)
+
+     # When already in an event loop (e.g. notebooks), run the coroutine in a fresh
+     # loop inside a worker thread to avoid a nested-loop RuntimeError.
+     with ThreadPoolExecutor(max_workers=1) as executor:
+         future = executor.submit(asyncio.run, coro)
+         return future.result()
+
+
+ class FetchHTMLInput(BaseModel):
+     """Input schema for fetching rendered HTML."""
+
+     url: str = Field(description="The full URL to fetch.")
+     timeout: int = Field(default=120, description="Max seconds to wait for page load.")
+     geo_code: str = Field(default="US", description="Two-letter proxy country code.")
+     block_resources: bool = Field(
+         default=False,
+         description="Whether to block image/CSS/font resources for faster loading.",
+     )
+
+
+ class CreateScraperInput(BaseModel):
+     """Input schema for creating and running an AI scraper."""
+
+     url: str = Field(description="Target URL to scrape.")
+     message: str = Field(description="Natural-language extraction instructions.")
+     agent: Literal["general", "listing", "map"] = Field(
+         default="general",
+         description="Scraper mode: general, listing, or map.",
+     )
+     proxy_country: Optional[str] = Field(default=None, description="Two-letter proxy country code.")
+     max_depth: int = Field(default=2, description="Map mode: crawl depth.")
+     max_pages: int = Field(default=50, description="Map mode: max pages to crawl.")
+     limit: int = Field(default=1000, description="Map mode: max records to extract.")
+     include_patterns: str = Field(default="", description="Map mode: include URL regex patterns.")
+     exclude_patterns: str = Field(default="", description="Map mode: exclude URL regex patterns.")
+
+
+ class RerunScraperInput(BaseModel):
+     """Input schema for rerunning an AI scraper."""
+
+     scraper_id: str = Field(description="Existing AI scraper ID.")
+     url: str = Field(description="URL to run the scraper against.")
+     max_depth: int = Field(default=2, description="Map mode: crawl depth.")
+     max_pages: int = Field(default=50, description="Map mode: max pages to crawl.")
+     limit: int = Field(default=1000, description="Map mode: max records to extract.")
+     include_patterns: str = Field(default="", description="Map mode: include URL regex patterns.")
+     exclude_patterns: str = Field(default="", description="Map mode: exclude URL regex patterns.")
+
+
+ class BulkRerunAIScraperInput(BaseModel):
+     """Input schema for bulk rerunning an AI scraper."""
+
+     scraper_id: str = Field(description="Existing AI scraper ID.")
+     urls: list[str] = Field(min_length=1, description="One or more target URLs.")
+
+
+ class RerunManualScraperInput(BaseModel):
+     """Input schema for rerunning a manual scraper."""
+
+     scraper_id: str = Field(description="Manual scraper ID from MrScraper dashboard.")
+     url: str = Field(description="URL to run the scraper against.")
+
+
+ class BulkRerunManualScraperInput(BaseModel):
+     """Input schema for bulk rerunning a manual scraper."""
+
+     scraper_id: str = Field(description="Manual scraper ID from MrScraper dashboard.")
+     urls: list[str] = Field(min_length=1, description="One or more target URLs.")
+
+
+ class GetAllResultsInput(BaseModel):
+     """Input schema for listing results."""
+
+     sort_field: Literal[
+         "createdAt",
+         "updatedAt",
+         "id",
+         "type",
+         "url",
+         "status",
+         "error",
+         "tokenUsage",
+         "runtime",
+     ] = Field(default="updatedAt", description="Field used for sorting.")
+     sort_order: Literal["ASC", "DESC"] = Field(default="DESC", description="Sort direction.")
+     page_size: int = Field(default=10, description="Results per page.")
+     page: int = Field(default=1, description="Page number, starting at 1.")
+     search: Optional[str] = Field(default=None, description="Free text search string.")
+     date_range_column: Optional[str] = Field(default=None, description="Date field for range filtering.")
+     start_at: Optional[str] = Field(default=None, description="ISO-8601 start date.")
+     end_at: Optional[str] = Field(default=None, description="ISO-8601 end date.")
+
+
+ class GetResultByIdInput(BaseModel):
+     """Input schema for fetching a single result."""
+
+     result_id: str = Field(description="MrScraper result ID.")
+
+
+ class MrScraperBaseTool(BaseTool):
+     """Base class for all MrScraper tools."""
+
+     client: Any = Field(default=None, exclude=True, repr=False)
+     token: Optional[str] = Field(default=None, exclude=True, repr=False)
+     mrscraper_api_key: Optional[str] = Field(default=None, exclude=True, repr=False)
+
+     @staticmethod
+     def _first_arg_or_kwargs(args: tuple[Any, ...], kwargs: dict[str, Any]) -> dict[str, Any]:
+         """Normalize args for direct and low-level calls."""
+         if args and isinstance(args[0], dict):
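+             # A single positional dict (from a direct low-level call) takes precedence over kwargs.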
+             return {**kwargs, **args[0]}
+         return kwargs
+
+     def _resolve_token(self) -> str:
+         token = (
+             self.token
+             or self.mrscraper_api_key
+             or os.getenv("MRSCRAPER_API_KEY")
+             or os.getenv("MRSCRAPER_API_TOKEN")
+         )
+         if not token:
+             raise ValueError(
+                 "Missing MrScraper API key. Pass `token` or `mrscraper_api_key` when "
+                 "initializing the tool, or set MRSCRAPER_API_KEY / MRSCRAPER_API_TOKEN."
+             )
+         return token
+
+     def _get_client(self) -> MrScraper:
+         if self.client is None:
+             self.client = MrScraper(token=self._resolve_token())
+         return self.client
+
+
+ class MrScraperFetchHTML(MrScraperBaseTool):
+     """Fetch rendered HTML for a webpage."""
+
+     name: str = "mrscraper_fetch_html"
+     description: str = (
+         "Fetch rendered HTML using MrScraper's stealth browser. "
+         "Useful when you need full page HTML after JavaScript execution."
+     )
+     args_schema: Type[BaseModel] = FetchHTMLInput
+
+     def _run(self, *args: Any, **kwargs: Any) -> str:
+         params = self._first_arg_or_kwargs(args, kwargs)
+         return _run_coro_sync(self._arun(**params))
+
+     async def _arun(self, **kwargs: Any) -> str:
+         result = await self._get_client().fetch_html(**kwargs)
+         return _serialize_response(result)
+
+
+ class MrScraperCreateScraper(MrScraperBaseTool):
+     """Create and run an AI scraper."""
+
+     name: str = "mrscraper_create_scraper"
+     description: str = (
+         "Create and run an AI-powered scraper from natural-language instructions. "
+         "Returns scraper metadata, including scraper ID for follow-up runs."
+     )
+     args_schema: Type[BaseModel] = CreateScraperInput
+
+     def _run(self, *args: Any, **kwargs: Any) -> str:
+         params = self._first_arg_or_kwargs(args, kwargs)
+         return _run_coro_sync(self._arun(**params))
+
+     async def _arun(self, **kwargs: Any) -> str:
+         result = await self._get_client().create_scraper(**kwargs)
+         return _serialize_response(result)
+
+
+ class MrScraperRerunScraper(MrScraperBaseTool):
+     """Rerun an existing AI scraper."""
+
+     name: str = "mrscraper_rerun_scraper"
+     description: str = (
+         "Rerun an existing AI scraper on a different URL while preserving extraction logic."
+     )
+     args_schema: Type[BaseModel] = RerunScraperInput
+
+     def _run(self, *args: Any, **kwargs: Any) -> str:
+         params = self._first_arg_or_kwargs(args, kwargs)
+         return _run_coro_sync(self._arun(**params))
+
+     async def _arun(self, **kwargs: Any) -> str:
+         result = await self._get_client().rerun_scraper(**kwargs)
+         return _serialize_response(result)
+
+
+ class MrScraperBulkRerunAIScraper(MrScraperBaseTool):
+     """Rerun an AI scraper for multiple URLs."""
+
+     name: str = "mrscraper_bulk_rerun_ai_scraper"
+     description: str = "Bulk rerun an AI scraper across multiple URLs."
+     args_schema: Type[BaseModel] = BulkRerunAIScraperInput
+
+     def _run(self, *args: Any, **kwargs: Any) -> str:
+         params = self._first_arg_or_kwargs(args, kwargs)
+         return _run_coro_sync(self._arun(**params))
+
+     async def _arun(self, **kwargs: Any) -> str:
+         result = await self._get_client().bulk_rerun_ai_scraper(**kwargs)
+         return _serialize_response(result)
+
+
+ class MrScraperRerunManualScraper(MrScraperBaseTool):
+     """Rerun a dashboard-defined manual scraper."""
+
+     name: str = "mrscraper_rerun_manual_scraper"
+     description: str = "Rerun a manual dashboard scraper on a target URL."
+     args_schema: Type[BaseModel] = RerunManualScraperInput
+
+     def _run(self, *args: Any, **kwargs: Any) -> str:
+         params = self._first_arg_or_kwargs(args, kwargs)
+         return _run_coro_sync(self._arun(**params))
+
+     async def _arun(self, **kwargs: Any) -> str:
+         result = await self._get_client().rerun_manual_scraper(**kwargs)
+         return _serialize_response(result)
+
+
+ class MrScraperBulkRerunManualScraper(MrScraperBaseTool):
+     """Rerun a manual scraper for multiple URLs."""
+
+     name: str = "mrscraper_bulk_rerun_manual_scraper"
+     description: str = (
+         "Bulk rerun a manual scraper across multiple URLs in one request."
+     )
+     args_schema: Type[BaseModel] = BulkRerunManualScraperInput
+
+     def _run(self, *args: Any, **kwargs: Any) -> str:
+         params = self._first_arg_or_kwargs(args, kwargs)
+         return _run_coro_sync(self._arun(**params))
+
+     async def _arun(self, **kwargs: Any) -> str:
+         result = await self._get_client().bulk_rerun_manual_scraper(**kwargs)
+         return _serialize_response(result)
+
+
+ class MrScraperGetAllResults(MrScraperBaseTool):
+     """Get paginated scraping results."""
+
+     name: str = "mrscraper_get_all_results"
+     description: str = (
+         "List scraping results with pagination, sorting, search, and date filters."
+     )
+     args_schema: Type[BaseModel] = GetAllResultsInput
+
+     def _run(self, *args: Any, **kwargs: Any) -> str:
+         params = self._first_arg_or_kwargs(args, kwargs)
+         return _run_coro_sync(self._arun(**params))
+
+     async def _arun(self, **kwargs: Any) -> str:
+         result = await self._get_client().get_all_results(**kwargs)
+         return _serialize_response(result)
+
+
+ class MrScraperGetResultById(MrScraperBaseTool):
+     """Get one scraping result by ID."""
+
+     name: str = "mrscraper_get_result_by_id"
+     description: str = "Fetch a specific scraping result by result ID."
+     args_schema: Type[BaseModel] = GetResultByIdInput
+
+     def _run(self, *args: Any, **kwargs: Any) -> str:
+         params = self._first_arg_or_kwargs(args, kwargs)
+         return _run_coro_sync(self._arun(**params))
+
+     async def _arun(self, **kwargs: Any) -> str:
+         result = await self._get_client().get_result_by_id(**kwargs)
+         return _serialize_response(result)
+
+
+ TOOL_CLASSES: tuple[type[MrScraperBaseTool], ...] = (
+     MrScraperFetchHTML,
+     MrScraperCreateScraper,
+     MrScraperRerunScraper,
+     MrScraperBulkRerunAIScraper,
+     MrScraperRerunManualScraper,
+     MrScraperBulkRerunManualScraper,
+     MrScraperGetAllResults,
+     MrScraperGetResultById,
+ )
+
+
+ def load_mrscraper_tools(
+     *,
+     token: Optional[str] = None,
+     mrscraper_api_key: Optional[str] = None,
+     client: Optional[MrScraper] = None,
+     tool_names: Optional[Sequence[str]] = None,
+ ) -> list[BaseTool]:
+     """Construct a configured list of MrScraper tools."""
+     resolved_client = client
+     resolved_token = (
+         token
+         or mrscraper_api_key
+         or os.getenv("MRSCRAPER_API_KEY")
+         or os.getenv("MRSCRAPER_API_TOKEN")
+     )
+     if resolved_client is None:
+         if resolved_token is None:
+             raise ValueError(
+                 "Either client, token, or mrscraper_api_key must be provided "
+                 "(or set MRSCRAPER_API_KEY / MRSCRAPER_API_TOKEN)."
+             )
+         resolved_client = MrScraper(token=resolved_token)
+
+     tools = [tool_cls(client=resolved_client) for tool_cls in TOOL_CLASSES]
+     if tool_names is None:
+         return tools
+
+     requested = set(tool_names)
+     return [tool for tool in tools if tool.name in requested]
+
+
+ class MrScraperToolkit:
+     """Factory object that returns configured MrScraper LangChain tools."""
+
+     def __init__(
+         self,
+         *,
+         token: Optional[str] = None,
+         mrscraper_api_key: Optional[str] = None,
+         client: Optional[MrScraper] = None,
+         tool_names: Optional[Sequence[str]] = None,
+     ) -> None:
+         self._token = token
+         self._mrscraper_api_key = mrscraper_api_key
+         self._client = client
+         self._tool_names = tool_names
+
+     def get_tools(self) -> list[BaseTool]:
+         """Return a list of MrScraper tools ready for LangChain agents."""
+         return load_mrscraper_tools(
+             token=self._token,
+             mrscraper_api_key=self._mrscraper_api_key,
+             client=self._client,
+             tool_names=self._tool_names,
+         )
+