langchain-mrscraper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_mrscraper-0.1.0/.gitignore +66 -0
- langchain_mrscraper-0.1.0/LICENSE +21 -0
- langchain_mrscraper-0.1.0/PKG-INFO +140 -0
- langchain_mrscraper-0.1.0/README.md +103 -0
- langchain_mrscraper-0.1.0/pyproject.toml +70 -0
- langchain_mrscraper-0.1.0/src/langchain_mrscraper/__init__.py +34 -0
- langchain_mrscraper-0.1.0/src/langchain_mrscraper/tools.py +368 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.pyo
|
|
5
|
+
*.pyd
|
|
6
|
+
.Python
|
|
7
|
+
*.egg
|
|
8
|
+
*.egg-info/
|
|
9
|
+
dist/
|
|
10
|
+
build/
|
|
11
|
+
eggs/
|
|
12
|
+
parts/
|
|
13
|
+
var/
|
|
14
|
+
sdist/
|
|
15
|
+
wheels/
|
|
16
|
+
pip-wheel-metadata/
|
|
17
|
+
share/python-wheels/
|
|
18
|
+
*.manifest
|
|
19
|
+
*.spec
|
|
20
|
+
debug_dir/
|
|
21
|
+
debug_results/
|
|
22
|
+
.ruff_cache/
|
|
23
|
+
|
|
24
|
+
# Virtual environments
|
|
25
|
+
.venv/
|
|
26
|
+
venv/
|
|
27
|
+
env/
|
|
28
|
+
ENV/
|
|
29
|
+
|
|
30
|
+
# Distribution / packaging
|
|
31
|
+
.eggs/
|
|
32
|
+
*.tar.gz
|
|
33
|
+
|
|
34
|
+
# Unit test / coverage
|
|
35
|
+
.coverage
|
|
36
|
+
.coverage.*
|
|
37
|
+
htmlcov/
|
|
38
|
+
.cache
|
|
39
|
+
.pytest_cache/
|
|
40
|
+
pytest.log
|
|
41
|
+
nosetests.xml
|
|
42
|
+
coverage.xml
|
|
43
|
+
*.cover
|
|
44
|
+
.hypothesis/
|
|
45
|
+
|
|
46
|
+
# Type checking
|
|
47
|
+
.mypy_cache/
|
|
48
|
+
.dmypy.json
|
|
49
|
+
dmypy.json
|
|
50
|
+
.pytype/
|
|
51
|
+
.pyre/
|
|
52
|
+
|
|
53
|
+
# IDEs
|
|
54
|
+
.idea/
|
|
55
|
+
.vscode/
|
|
56
|
+
*.swp
|
|
57
|
+
*.swo
|
|
58
|
+
*~
|
|
59
|
+
|
|
60
|
+
# macOS
|
|
61
|
+
.DS_Store
|
|
62
|
+
|
|
63
|
+
# Environment files
|
|
64
|
+
.env
|
|
65
|
+
.env.*
|
|
66
|
+
!.env.example
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 MrScraper
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: langchain-mrscraper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LangChain tools for the MrScraper web-scraping API
|
|
5
|
+
Project-URL: Homepage, https://mrscraper.com
|
|
6
|
+
Project-URL: Documentation, https://docs.mrscraper.com
|
|
7
|
+
Project-URL: Repository, https://github.com/mrscraper/langchain-mrscraper
|
|
8
|
+
Author: Riandra Diva Auzan, R&D Team MrScraper
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: langchain,llm,mrscraper,scraping,tools,web scraping
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Requires-Dist: langchain-core>=0.3.0
|
|
25
|
+
Requires-Dist: mrscraper-sdk>=0.1.2
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: langchain-tests>=0.3.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
31
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
32
|
+
Provides-Extra: test
|
|
33
|
+
Requires-Dist: langchain-tests>=0.3.0; extra == 'test'
|
|
34
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'test'
|
|
35
|
+
Requires-Dist: pytest>=8; extra == 'test'
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
# langchain-mrscraper
|
|
39
|
+
|
|
40
|
+
LangChain integration package for the [MrScraper SDK](https://pypi.org/project/mrscraper-sdk/).
|
|
41
|
+
|
|
42
|
+
This package exposes MrScraper capabilities as LangChain tools so agents can:
|
|
43
|
+
|
|
44
|
+
- Fetch rendered HTML from protected websites
|
|
45
|
+
- Create AI scrapers from natural-language prompts
|
|
46
|
+
- Rerun AI/manual scrapers (single and bulk)
|
|
47
|
+
- List and fetch scraping results
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install -U langchain-mrscraper
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
or:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
uv add langchain-mrscraper
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
`mrscraper-sdk` is installed automatically as a dependency, so users do not need to install it separately.
|
|
62
|
+
|
|
63
|
+
## Quick start
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
import os
|
|
67
|
+
from langchain_mrscraper import MrScraperToolkit
|
|
68
|
+
|
|
69
|
+
os.environ["MRSCRAPER_API_KEY"] = "your-token"
|
|
70
|
+
|
|
71
|
+
tools = MrScraperToolkit().get_tools()
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Use with an agent
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from langgraph.prebuilt import create_react_agent
|
|
78
|
+
from langchain_openai import ChatOpenAI
|
|
79
|
+
from langchain_mrscraper import MrScraperToolkit
|
|
80
|
+
|
|
81
|
+
tools = MrScraperToolkit(token="your-token").get_tools()
|
|
82
|
+
agent = create_react_agent(ChatOpenAI(model="gpt-4o-mini"), tools)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Available tools
|
|
86
|
+
|
|
87
|
+
- `mrscraper_fetch_html`
|
|
88
|
+
- `mrscraper_create_scraper`
|
|
89
|
+
- `mrscraper_rerun_scraper`
|
|
90
|
+
- `mrscraper_bulk_rerun_ai_scraper`
|
|
91
|
+
- `mrscraper_rerun_manual_scraper`
|
|
92
|
+
- `mrscraper_bulk_rerun_manual_scraper`
|
|
93
|
+
- `mrscraper_get_all_results`
|
|
94
|
+
- `mrscraper_get_result_by_id`
|
|
95
|
+
|
|
96
|
+
## API styles
|
|
97
|
+
|
|
98
|
+
You can initialize via:
|
|
99
|
+
|
|
100
|
+
- `MrScraperToolkit(...).get_tools()` (recommended)
|
|
101
|
+
- `load_mrscraper_tools(...)` convenience function
|
|
102
|
+
- per-tool constructors with `token="..."` or `mrscraper_api_key="..."`
|
|
103
|
+
- environment variables `MRSCRAPER_API_KEY` (preferred) or `MRSCRAPER_API_TOKEN`
|
|
104
|
+
|
|
105
|
+
## Tools vs. loaders
|
|
106
|
+
|
|
107
|
+
This integration is intentionally tools-first. MrScraper endpoints are action-oriented
|
|
108
|
+
(fetch, create, rerun, list, retrieve) and best represented as `BaseTool` methods that
|
|
109
|
+
agents can call explicitly.
|
|
110
|
+
|
|
111
|
+
A document loader abstraction is usually better when the primary job is deterministic
|
|
112
|
+
"URL -> documents" ingestion into vector stores. MrScraper can support that in a
|
|
113
|
+
separate package later, but this package should remain focused on agent tools.
|
|
114
|
+
|
|
115
|
+
## Testing
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
pytest tests/unit_tests -v
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Integration smoke tests (real API):
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
MRSCRAPER_API_KEY=your-token pytest tests/integration_tests -m integration -v
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Local release workflow
|
|
128
|
+
|
|
129
|
+
1. Update `version` in `pyproject.toml`
|
|
130
|
+
2. Build: `python -m build`
|
|
131
|
+
3. Upload to TestPyPI: `twine upload --repository testpypi dist/*`
|
|
132
|
+
4. Verify install from TestPyPI
|
|
133
|
+
5. Upload to PyPI: `twine upload dist/*`
|
|
134
|
+
|
|
135
|
+
## Docs files for LangChain PR
|
|
136
|
+
|
|
137
|
+
- Provider page: `docs/providers/mrscraper.mdx`
|
|
138
|
+
- Tool pages: `docs/tools/*.mdx` (one page per tool)
|
|
139
|
+
|
|
140
|
+
These are prepared to submit to `langchain-ai/docs`.
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# langchain-mrscraper
|
|
2
|
+
|
|
3
|
+
LangChain integration package for the [MrScraper SDK](https://pypi.org/project/mrscraper-sdk/).
|
|
4
|
+
|
|
5
|
+
This package exposes MrScraper capabilities as LangChain tools so agents can:
|
|
6
|
+
|
|
7
|
+
- Fetch rendered HTML from protected websites
|
|
8
|
+
- Create AI scrapers from natural-language prompts
|
|
9
|
+
- Rerun AI/manual scrapers (single and bulk)
|
|
10
|
+
- List and fetch scraping results
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install -U langchain-mrscraper
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
or:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
uv add langchain-mrscraper
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
`mrscraper-sdk` is installed automatically as a dependency, so users do not need to install it separately.
|
|
25
|
+
|
|
26
|
+
## Quick start
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import os
|
|
30
|
+
from langchain_mrscraper import MrScraperToolkit
|
|
31
|
+
|
|
32
|
+
os.environ["MRSCRAPER_API_KEY"] = "your-token"
|
|
33
|
+
|
|
34
|
+
tools = MrScraperToolkit().get_tools()
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Use with an agent
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from langgraph.prebuilt import create_react_agent
|
|
41
|
+
from langchain_openai import ChatOpenAI
|
|
42
|
+
from langchain_mrscraper import MrScraperToolkit
|
|
43
|
+
|
|
44
|
+
tools = MrScraperToolkit(token="your-token").get_tools()
|
|
45
|
+
agent = create_react_agent(ChatOpenAI(model="gpt-4o-mini"), tools)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Available tools
|
|
49
|
+
|
|
50
|
+
- `mrscraper_fetch_html`
|
|
51
|
+
- `mrscraper_create_scraper`
|
|
52
|
+
- `mrscraper_rerun_scraper`
|
|
53
|
+
- `mrscraper_bulk_rerun_ai_scraper`
|
|
54
|
+
- `mrscraper_rerun_manual_scraper`
|
|
55
|
+
- `mrscraper_bulk_rerun_manual_scraper`
|
|
56
|
+
- `mrscraper_get_all_results`
|
|
57
|
+
- `mrscraper_get_result_by_id`
|
|
58
|
+
|
|
59
|
+
## API styles
|
|
60
|
+
|
|
61
|
+
You can initialize via:
|
|
62
|
+
|
|
63
|
+
- `MrScraperToolkit(...).get_tools()` (recommended)
|
|
64
|
+
- `load_mrscraper_tools(...)` convenience function
|
|
65
|
+
- per-tool constructors with `token="..."` or `mrscraper_api_key="..."`
|
|
66
|
+
- environment variables `MRSCRAPER_API_KEY` (preferred) or `MRSCRAPER_API_TOKEN`
|
|
67
|
+
|
|
68
|
+
## Tools vs. loaders
|
|
69
|
+
|
|
70
|
+
This integration is intentionally tools-first. MrScraper endpoints are action-oriented
|
|
71
|
+
(fetch, create, rerun, list, retrieve) and best represented as `BaseTool` methods that
|
|
72
|
+
agents can call explicitly.
|
|
73
|
+
|
|
74
|
+
A document loader abstraction is usually better when the primary job is deterministic
|
|
75
|
+
"URL -> documents" ingestion into vector stores. MrScraper can support that in a
|
|
76
|
+
separate package later, but this package should remain focused on agent tools.
|
|
77
|
+
|
|
78
|
+
## Testing
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
pytest tests/unit_tests -v
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Integration smoke tests (real API):
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
MRSCRAPER_API_KEY=your-token pytest tests/integration_tests -m integration -v
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Local release workflow
|
|
91
|
+
|
|
92
|
+
1. Update `version` in `pyproject.toml`
|
|
93
|
+
2. Build: `python -m build`
|
|
94
|
+
3. Upload to TestPyPI: `twine upload --repository testpypi dist/*`
|
|
95
|
+
4. Verify install from TestPyPI
|
|
96
|
+
5. Upload to PyPI: `twine upload dist/*`
|
|
97
|
+
|
|
98
|
+
## Docs files for LangChain PR
|
|
99
|
+
|
|
100
|
+
- Provider page: `docs/providers/mrscraper.mdx`
|
|
101
|
+
- Tool pages: `docs/tools/*.mdx` (one page per tool)
|
|
102
|
+
|
|
103
|
+
These are prepared to submit to `langchain-ai/docs`.
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "langchain-mrscraper"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "LangChain tools for the MrScraper web-scraping API"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Riandra Diva Auzan", url = "https://mrscraper.com" },
|
|
14
|
+
{ name = "R&D Team MrScraper", url = "https://mrscraper.com" }
|
|
15
|
+
]
|
|
16
|
+
keywords = ["langchain", "scraping", "web scraping", "mrscraper", "tools", "llm"]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Development Status :: 4 - Beta",
|
|
19
|
+
"Intended Audience :: Developers",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.9",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Programming Language :: Python :: 3.13",
|
|
27
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
28
|
+
"Typing :: Typed",
|
|
29
|
+
]
|
|
30
|
+
dependencies = [
|
|
31
|
+
"mrscraper-sdk>=0.1.2",
|
|
32
|
+
"langchain-core>=0.3.0",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.optional-dependencies]
|
|
36
|
+
test = [
|
|
37
|
+
"pytest>=8",
|
|
38
|
+
"pytest-asyncio>=0.24",
|
|
39
|
+
"langchain-tests>=0.3.0",
|
|
40
|
+
]
|
|
41
|
+
dev = [
|
|
42
|
+
"pytest>=8",
|
|
43
|
+
"pytest-asyncio>=0.24",
|
|
44
|
+
"langchain-tests>=0.3.0",
|
|
45
|
+
"ruff>=0.4",
|
|
46
|
+
"mypy>=1.10",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[project.urls]
|
|
50
|
+
Homepage = "https://mrscraper.com"
|
|
51
|
+
Documentation = "https://docs.mrscraper.com"
|
|
52
|
+
Repository = "https://github.com/mrscraper/langchain-mrscraper"
|
|
53
|
+
|
|
54
|
+
[tool.hatch.build.targets.wheel]
|
|
55
|
+
packages = ["src/langchain_mrscraper"]
|
|
56
|
+
|
|
57
|
+
[tool.hatch.build.targets.sdist]
|
|
58
|
+
include = [
|
|
59
|
+
"src/",
|
|
60
|
+
"README.md",
|
|
61
|
+
"LICENSE",
|
|
62
|
+
"pyproject.toml",
|
|
63
|
+
]
|
|
64
|
+
|
|
65
|
+
[tool.pytest.ini_options]
|
|
66
|
+
asyncio_mode = "auto"
|
|
67
|
+
testpaths = ["tests"]
|
|
68
|
+
markers = [
|
|
69
|
+
"integration: marks tests as integration tests (hit real API)",
|
|
70
|
+
]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""LangChain integration package for MrScraper tools."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
4
|
+
|
|
5
|
+
from .tools import (
|
|
6
|
+
MrScraperBulkRerunAIScraper,
|
|
7
|
+
MrScraperBulkRerunManualScraper,
|
|
8
|
+
MrScraperCreateScraper,
|
|
9
|
+
MrScraperFetchHTML,
|
|
10
|
+
MrScraperGetAllResults,
|
|
11
|
+
MrScraperGetResultById,
|
|
12
|
+
MrScraperRerunManualScraper,
|
|
13
|
+
MrScraperRerunScraper,
|
|
14
|
+
MrScraperToolkit,
|
|
15
|
+
load_mrscraper_tools,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"load_mrscraper_tools",
|
|
20
|
+
"MrScraperToolkit",
|
|
21
|
+
"MrScraperFetchHTML",
|
|
22
|
+
"MrScraperCreateScraper",
|
|
23
|
+
"MrScraperRerunScraper",
|
|
24
|
+
"MrScraperBulkRerunAIScraper",
|
|
25
|
+
"MrScraperRerunManualScraper",
|
|
26
|
+
"MrScraperBulkRerunManualScraper",
|
|
27
|
+
"MrScraperGetAllResults",
|
|
28
|
+
"MrScraperGetResultById",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
# Resolve the installed distribution's version at import time; fall back to a
# sentinel when running from an uninstalled source checkout (e.g. in CI or a
# vendored copy), where package metadata is unavailable.
try:
    __version__ = version("langchain-mrscraper")
except PackageNotFoundError:
    __version__ = "0.0.0"
|
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
"""LangChain tools for the MrScraper web scraping API."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
9
|
+
from typing import Any, Literal, Optional, Sequence, Type
|
|
10
|
+
|
|
11
|
+
from langchain_core.tools import BaseTool
|
|
12
|
+
from pydantic import BaseModel, Field
|
|
13
|
+
|
|
14
|
+
from mrscraper import MrScraper
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _serialize_response(result: dict[str, Any]) -> str:
|
|
18
|
+
"""Convert API response dict to tool output text."""
|
|
19
|
+
return json.dumps(result, indent=2, default=str)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _run_coro_sync(coro: Any) -> Any:
|
|
23
|
+
"""Run coroutine from sync context, including active event loop environments."""
|
|
24
|
+
try:
|
|
25
|
+
asyncio.get_running_loop()
|
|
26
|
+
except RuntimeError:
|
|
27
|
+
return asyncio.run(coro)
|
|
28
|
+
|
|
29
|
+
# When already in an event loop (e.g. notebooks), run coroutine in a fresh loop
|
|
30
|
+
# inside a worker thread to avoid nested-loop RuntimeError.
|
|
31
|
+
with ThreadPoolExecutor(max_workers=1) as executor:
|
|
32
|
+
future = executor.submit(asyncio.run, coro)
|
|
33
|
+
return future.result()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class FetchHTMLInput(BaseModel):
    """Input schema for fetching rendered HTML."""

    # Full target URL -- presumably including scheme; validation appears to
    # happen server-side, not here. TODO confirm against API docs.
    url: str = Field(description="The full URL to fetch.")
    # Upper bound on the page-load wait, in seconds.
    timeout: int = Field(default=120, description="Max seconds to wait for page load.")
    # Two-letter country code selecting the proxy exit location.
    geo_code: str = Field(default="US", description="Two-letter proxy country code.")
    # When True, skips static assets (images/CSS/fonts) to speed up loads at
    # the cost of render fidelity.
    block_resources: bool = Field(
        default=False,
        description="Whether to block image/CSS/font resources for faster loading.",
    )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class CreateScraperInput(BaseModel):
    """Input schema for creating and running an AI scraper."""

    url: str = Field(description="Target URL to scrape.")
    # Free-form natural-language description of what to extract.
    message: str = Field(description="Natural-language extraction instructions.")
    agent: Literal["general", "listing", "map"] = Field(
        default="general",
        description="Scraper mode: general, listing, or map.",
    )
    proxy_country: Optional[str] = Field(default=None, description="Two-letter proxy country code.")
    # The following fields are documented as map-mode only; presumably ignored
    # by the general/listing agents -- TODO confirm against the API docs.
    max_depth: int = Field(default=2, description="Map mode: crawl depth.")
    max_pages: int = Field(default=50, description="Map mode: max pages to crawl.")
    limit: int = Field(default=1000, description="Map mode: max records to extract.")
    include_patterns: str = Field(default="", description="Map mode: include URL regex patterns.")
    exclude_patterns: str = Field(default="", description="Map mode: exclude URL regex patterns.")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class RerunScraperInput(BaseModel):
    """Input schema for rerunning an AI scraper."""

    # ID of a previously created AI scraper (e.g. from mrscraper_create_scraper).
    scraper_id: str = Field(description="Existing AI scraper ID.")
    url: str = Field(description="URL to run the scraper against.")
    # Map-mode-only knobs, mirroring CreateScraperInput's defaults.
    max_depth: int = Field(default=2, description="Map mode: crawl depth.")
    max_pages: int = Field(default=50, description="Map mode: max pages to crawl.")
    limit: int = Field(default=1000, description="Map mode: max records to extract.")
    include_patterns: str = Field(default="", description="Map mode: include URL regex patterns.")
    exclude_patterns: str = Field(default="", description="Map mode: exclude URL regex patterns.")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class BulkRerunAIScraperInput(BaseModel):
    """Input schema for bulk rerunning an AI scraper."""

    scraper_id: str = Field(description="Existing AI scraper ID.")
    # min_length=1 makes an empty batch a validation error instead of a
    # silent no-op request.
    urls: list[str] = Field(min_length=1, description="One or more target URLs.")
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class RerunManualScraperInput(BaseModel):
    """Input schema for rerunning a manual scraper."""

    # Manual scrapers are authored in the MrScraper dashboard, not via this API.
    scraper_id: str = Field(description="Manual scraper ID from MrScraper dashboard.")
    url: str = Field(description="URL to run the scraper against.")
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class BulkRerunManualScraperInput(BaseModel):
    """Input schema for bulk rerunning a manual scraper."""

    scraper_id: str = Field(description="Manual scraper ID from MrScraper dashboard.")
    # min_length=1 rejects an empty URL batch at validation time.
    urls: list[str] = Field(min_length=1, description="One or more target URLs.")
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class GetAllResultsInput(BaseModel):
    """Input schema for listing results."""

    # Allowed sort columns; the camelCase values mirror the API's field names.
    sort_field: Literal[
        "createdAt",
        "updatedAt",
        "id",
        "type",
        "url",
        "status",
        "error",
        "tokenUsage",
        "runtime",
    ] = Field(default="updatedAt", description="Field used for sorting.")
    sort_order: Literal["ASC", "DESC"] = Field(default="DESC", description="Sort direction.")
    page_size: int = Field(default=10, description="Results per page.")
    # Pagination is 1-based.
    page: int = Field(default=1, description="Page number, starting at 1.")
    search: Optional[str] = Field(default=None, description="Free text search string.")
    # Date-range filtering: column name plus ISO-8601 bounds. Presumably all
    # three must be provided together for the filter to apply -- TODO confirm.
    date_range_column: Optional[str] = Field(default=None, description="Date field for range filtering.")
    start_at: Optional[str] = Field(default=None, description="ISO-8601 start date.")
    end_at: Optional[str] = Field(default=None, description="ISO-8601 end date.")
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class GetResultByIdInput(BaseModel):
    """Input schema for fetching a single result."""

    # ID as returned by the list endpoint (mrscraper_get_all_results).
    result_id: str = Field(description="MrScraper result ID.")
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class MrScraperBaseTool(BaseTool):
    """Shared plumbing for every MrScraper tool.

    Holds the lazily-created SDK client and the credentials used to build
    it.  All three fields are excluded from serialization and repr so the
    API key does not leak into logs or dumps.
    """

    client: Any = Field(default=None, exclude=True, repr=False)
    token: Optional[str] = Field(default=None, exclude=True, repr=False)
    mrscraper_api_key: Optional[str] = Field(default=None, exclude=True, repr=False)

    @staticmethod
    def _first_arg_or_kwargs(args: tuple[Any, ...], kwargs: dict[str, Any]) -> dict[str, Any]:
        """Normalize args for direct and low-level calls."""
        if not args or not isinstance(args[0], dict):
            return kwargs
        # A positional dict (LangChain's low-level invocation style) wins
        # over keyword arguments on key collisions.
        return {**kwargs, **args[0]}

    def _resolve_token(self) -> str:
        """Return the first available API key, preferring explicit fields.

        Raises:
            ValueError: If no key was configured anywhere.
        """
        candidates = (
            self.token,
            self.mrscraper_api_key,
            os.getenv("MRSCRAPER_API_KEY"),
            os.getenv("MRSCRAPER_API_TOKEN"),
        )
        for candidate in candidates:
            if candidate:
                return candidate
        raise ValueError(
            "Missing MrScraper API key. Pass `token` or `mrscraper_api_key` when "
            "initializing the tool, or set MRSCRAPER_API_KEY / MRSCRAPER_API_TOKEN."
        )

    def _get_client(self) -> MrScraper:
        """Create (at most once) and return the underlying SDK client."""
        if self.client is None:
            self.client = MrScraper(token=self._resolve_token())
        return self.client
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class MrScraperFetchHTML(MrScraperBaseTool):
    """Fetch rendered HTML for a webpage."""

    name: str = "mrscraper_fetch_html"
    description: str = (
        "Fetch rendered HTML using MrScraper's stealth browser. "
        "Useful when you need full page HTML after JavaScript execution."
    )
    args_schema: Type[BaseModel] = FetchHTMLInput

    def _run(self, *args: Any, **kwargs: Any) -> str:
        """Sync entry point; bridges to the async implementation."""
        payload = self._first_arg_or_kwargs(args, kwargs)
        return _run_coro_sync(self._arun(**payload))

    async def _arun(self, **kwargs: Any) -> str:
        """Call the SDK's fetch_html endpoint and return JSON text."""
        response = await self._get_client().fetch_html(**kwargs)
        return _serialize_response(response)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
class MrScraperCreateScraper(MrScraperBaseTool):
    """Create and run an AI scraper."""

    name: str = "mrscraper_create_scraper"
    description: str = (
        "Create and run an AI-powered scraper from natural-language instructions. "
        "Returns scraper metadata, including scraper ID for follow-up runs."
    )
    args_schema: Type[BaseModel] = CreateScraperInput

    def _run(self, *args: Any, **kwargs: Any) -> str:
        """Sync entry point; bridges to the async implementation."""
        payload = self._first_arg_or_kwargs(args, kwargs)
        return _run_coro_sync(self._arun(**payload))

    async def _arun(self, **kwargs: Any) -> str:
        """Call the SDK's create_scraper endpoint and return JSON text."""
        response = await self._get_client().create_scraper(**kwargs)
        return _serialize_response(response)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class MrScraperRerunScraper(MrScraperBaseTool):
    """Rerun an existing AI scraper."""

    name: str = "mrscraper_rerun_scraper"
    description: str = (
        "Rerun an existing AI scraper on a different URL while preserving extraction logic."
    )
    args_schema: Type[BaseModel] = RerunScraperInput

    def _run(self, *args: Any, **kwargs: Any) -> str:
        """Sync entry point; bridges to the async implementation."""
        payload = self._first_arg_or_kwargs(args, kwargs)
        return _run_coro_sync(self._arun(**payload))

    async def _arun(self, **kwargs: Any) -> str:
        """Call the SDK's rerun_scraper endpoint and return JSON text."""
        response = await self._get_client().rerun_scraper(**kwargs)
        return _serialize_response(response)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
class MrScraperBulkRerunAIScraper(MrScraperBaseTool):
    """Rerun an AI scraper for multiple URLs."""

    name: str = "mrscraper_bulk_rerun_ai_scraper"
    description: str = "Bulk rerun an AI scraper across multiple URLs."
    args_schema: Type[BaseModel] = BulkRerunAIScraperInput

    def _run(self, *args: Any, **kwargs: Any) -> str:
        """Sync entry point; bridges to the async implementation."""
        payload = self._first_arg_or_kwargs(args, kwargs)
        return _run_coro_sync(self._arun(**payload))

    async def _arun(self, **kwargs: Any) -> str:
        """Call the SDK's bulk_rerun_ai_scraper endpoint and return JSON text."""
        response = await self._get_client().bulk_rerun_ai_scraper(**kwargs)
        return _serialize_response(response)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class MrScraperRerunManualScraper(MrScraperBaseTool):
    """Rerun a dashboard-defined manual scraper."""

    name: str = "mrscraper_rerun_manual_scraper"
    description: str = "Rerun a manual dashboard scraper on a target URL."
    args_schema: Type[BaseModel] = RerunManualScraperInput

    def _run(self, *args: Any, **kwargs: Any) -> str:
        """Sync entry point; bridges to the async implementation."""
        payload = self._first_arg_or_kwargs(args, kwargs)
        return _run_coro_sync(self._arun(**payload))

    async def _arun(self, **kwargs: Any) -> str:
        """Call the SDK's rerun_manual_scraper endpoint and return JSON text."""
        response = await self._get_client().rerun_manual_scraper(**kwargs)
        return _serialize_response(response)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
class MrScraperBulkRerunManualScraper(MrScraperBaseTool):
    """Rerun a manual scraper for multiple URLs."""

    name: str = "mrscraper_bulk_rerun_manual_scraper"
    description: str = "Bulk rerun a manual scraper across multiple URLs in one request."
    args_schema: Type[BaseModel] = BulkRerunManualScraperInput

    def _run(self, *args: Any, **kwargs: Any) -> str:
        """Sync entry point; bridges to the async implementation."""
        payload = self._first_arg_or_kwargs(args, kwargs)
        return _run_coro_sync(self._arun(**payload))

    async def _arun(self, **kwargs: Any) -> str:
        """Call the SDK's bulk_rerun_manual_scraper endpoint and return JSON text."""
        response = await self._get_client().bulk_rerun_manual_scraper(**kwargs)
        return _serialize_response(response)
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
class MrScraperGetAllResults(MrScraperBaseTool):
    """Get paginated scraping results."""

    name: str = "mrscraper_get_all_results"
    description: str = "List scraping results with pagination, sorting, search, and date filters."
    args_schema: Type[BaseModel] = GetAllResultsInput

    def _run(self, *args: Any, **kwargs: Any) -> str:
        """Sync entry point; bridges to the async implementation."""
        payload = self._first_arg_or_kwargs(args, kwargs)
        return _run_coro_sync(self._arun(**payload))

    async def _arun(self, **kwargs: Any) -> str:
        """Call the SDK's get_all_results endpoint and return JSON text."""
        response = await self._get_client().get_all_results(**kwargs)
        return _serialize_response(response)
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
class MrScraperGetResultById(MrScraperBaseTool):
    """Get one scraping result by ID."""

    name: str = "mrscraper_get_result_by_id"
    description: str = "Fetch a specific scraping result by result ID."
    args_schema: Type[BaseModel] = GetResultByIdInput

    def _run(self, *args: Any, **kwargs: Any) -> str:
        """Sync entry point; bridges to the async implementation."""
        payload = self._first_arg_or_kwargs(args, kwargs)
        return _run_coro_sync(self._arun(**payload))

    async def _arun(self, **kwargs: Any) -> str:
        """Call the SDK's get_result_by_id endpoint and return JSON text."""
        response = await self._get_client().get_result_by_id(**kwargs)
        return _serialize_response(response)
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
# Every concrete tool class, in the order they are returned by
# load_mrscraper_tools() and MrScraperToolkit.get_tools().
TOOL_CLASSES: tuple[type[MrScraperBaseTool], ...] = (
    MrScraperFetchHTML,
    MrScraperCreateScraper,
    MrScraperRerunScraper,
    MrScraperBulkRerunAIScraper,
    MrScraperRerunManualScraper,
    MrScraperBulkRerunManualScraper,
    MrScraperGetAllResults,
    MrScraperGetResultById,
)
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def load_mrscraper_tools(
    *,
    token: Optional[str] = None,
    mrscraper_api_key: Optional[str] = None,
    client: Optional[MrScraper] = None,
    tool_names: Optional[Sequence[str]] = None,
) -> list[BaseTool]:
    """Construct a configured list of MrScraper tools.

    Args:
        token: Explicit API key; highest-priority credential source.
        mrscraper_api_key: Alternate keyword for the API key.
        client: Pre-built SDK client; when given, no key resolution occurs.
        tool_names: Optional subset of tool names to return. Tool order
            follows ``TOOL_CLASSES``.

    Returns:
        LangChain tools that all share one SDK client instance.

    Raises:
        ValueError: If neither a client nor any API key is available, or if
            ``tool_names`` contains a name that matches no MrScraper tool.
    """
    resolved_client = client
    # Credential precedence mirrors MrScraperBaseTool._resolve_token.
    resolved_token = (
        token
        or mrscraper_api_key
        or os.getenv("MRSCRAPER_API_KEY")
        or os.getenv("MRSCRAPER_API_TOKEN")
    )
    if resolved_client is None:
        if resolved_token is None:
            raise ValueError(
                "Either client, token, or mrscraper_api_key must be provided "
                "(or set MRSCRAPER_API_KEY / MRSCRAPER_API_TOKEN)."
            )
        resolved_client = MrScraper(token=resolved_token)

    tools = [tool_cls(client=resolved_client) for tool_cls in TOOL_CLASSES]
    if tool_names is None:
        return tools

    requested = set(tool_names)
    # Fail loudly on typos: previously unknown names were silently dropped,
    # which made a misspelled tool name look like a successful (smaller) load.
    unknown = requested - {tool.name for tool in tools}
    if unknown:
        raise ValueError(f"Unknown MrScraper tool name(s): {sorted(unknown)}")
    return [tool for tool in tools if tool.name in requested]
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
class MrScraperToolkit:
    """Factory that assembles configured MrScraper LangChain tools.

    Constructor arguments are stored untouched; all credential and client
    resolution is deferred to :func:`load_mrscraper_tools` when
    ``get_tools()`` is called.
    """

    def __init__(
        self,
        *,
        token: Optional[str] = None,
        mrscraper_api_key: Optional[str] = None,
        client: Optional[MrScraper] = None,
        tool_names: Optional[Sequence[str]] = None,
    ) -> None:
        # Keep the raw configuration in one mapping; it is expanded verbatim
        # into load_mrscraper_tools() at call time.
        self._config = {
            "token": token,
            "mrscraper_api_key": mrscraper_api_key,
            "client": client,
            "tool_names": tool_names,
        }

    def get_tools(self) -> list[BaseTool]:
        """Return a list of MrScraper tools ready for LangChain agents."""
        return load_mrscraper_tools(**self._config)
|
|
368
|
+
|