langchain-proxyclaw 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_proxyclaw-0.1.0/LICENSE +21 -0
- langchain_proxyclaw-0.1.0/Makefile +44 -0
- langchain_proxyclaw-0.1.0/PKG-INFO +291 -0
- langchain_proxyclaw-0.1.0/README.md +257 -0
- langchain_proxyclaw-0.1.0/langchain_proxyclaw/__init__.py +14 -0
- langchain_proxyclaw-0.1.0/langchain_proxyclaw/tools.py +318 -0
- langchain_proxyclaw-0.1.0/langchain_proxyclaw/utils.py +76 -0
- langchain_proxyclaw-0.1.0/pyproject.toml +60 -0
- langchain_proxyclaw-0.1.0/tests/__init__.py +1 -0
- langchain_proxyclaw-0.1.0/tests/test_tools.py +153 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 IPLoop
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
.PHONY: help install install-dev test lint format clean build publish
|
|
2
|
+
|
|
3
|
+
help:
|
|
4
|
+
@echo "Available commands:"
|
|
5
|
+
@echo " install - Install package"
|
|
6
|
+
@echo " install-dev - Install with dev dependencies"
|
|
7
|
+
@echo " test - Run tests"
|
|
8
|
+
@echo " lint - Run linters"
|
|
9
|
+
@echo " format - Format code with black"
|
|
10
|
+
@echo " clean - Clean build artifacts"
|
|
11
|
+
@echo " build - Build package"
|
|
12
|
+
@echo " publish - Publish to PyPI"
|
|
13
|
+
|
|
14
|
+
install:
|
|
15
|
+
pip install -e .
|
|
16
|
+
|
|
17
|
+
install-dev:
|
|
18
|
+
pip install -e ".[dev]"
|
|
19
|
+
|
|
20
|
+
test:
|
|
21
|
+
pytest tests/ -v --cov=langchain_proxyclaw --cov-report=term-missing
|
|
22
|
+
|
|
23
|
+
lint:
|
|
24
|
+
mypy langchain_proxyclaw/
|
|
25
|
+
flake8 langchain_proxyclaw/ tests/
|
|
26
|
+
|
|
27
|
+
format:
|
|
28
|
+
black langchain_proxyclaw/ tests/
|
|
29
|
+
|
|
30
|
+
clean:
|
|
31
|
+
rm -rf build/
|
|
32
|
+
rm -rf dist/
|
|
33
|
+
rm -rf *.egg-info/
|
|
34
|
+
find . -type d -name __pycache__ -exec rm -rf {} +
|
|
35
|
+
find . -type f -name "*.pyc" -delete
|
|
36
|
+
|
|
37
|
+
build: clean
|
|
38
|
+
python -m build
|
|
39
|
+
|
|
40
|
+
publish: build
|
|
41
|
+
python -m twine upload dist/*
|
|
42
|
+
|
|
43
|
+
check: lint test
|
|
44
|
+
@echo "All checks passed!"
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: langchain-proxyclaw
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LangChain integration for ProxyClaw - residential proxy network
|
|
5
|
+
Project-URL: Homepage, https://proxyclaw.ai
|
|
6
|
+
Project-URL: Documentation, https://docs.proxyclaw.ai
|
|
7
|
+
Project-URL: Repository, https://github.com/iploop/langchain-proxyclaw
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/iploop/langchain-proxyclaw/issues
|
|
9
|
+
Author-email: IPLoop Team <support@iploop.io>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: iploop,langchain,proxy,proxyclaw,residential,scraping
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Requires-Dist: iploop-sdk>=1.8.0
|
|
25
|
+
Requires-Dist: langchain-core>=0.1.0
|
|
26
|
+
Requires-Dist: pydantic>=2.0.0
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: beautifulsoup4>=4.12.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
|
|
32
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# LangChain ProxyClaw Integration
|
|
36
|
+
|
|
37
|
+
[](https://badge.fury.io/py/langchain-proxyclaw)
|
|
38
|
+
[](https://opensource.org/licenses/MIT)
|
|
39
|
+
|
|
40
|
+
Official LangChain integration for [ProxyClaw](https://proxyclaw.ai) - a residential proxy network with 10M+ IPs across 111+ countries.
|
|
41
|
+
|
|
42
|
+
## Overview
|
|
43
|
+
|
|
44
|
+
This package provides LangChain Tools for routing HTTP requests through ProxyClaw's residential proxy network, enabling AI agents to scrape websites with:
|
|
45
|
+
|
|
46
|
+
- 🌍 **Global IP coverage** - 111+ countries
|
|
47
|
+
- 🔄 **Automatic rotation** - Fresh IPs per request or sticky sessions
|
|
48
|
+
- 🛡️ **Anti-detection** - Built-in fingerprint spoofing
|
|
49
|
+
- ⚡ **High success rate** - Residential IPs bypass most blocks
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install langchain-proxyclaw
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
For development:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install langchain-proxyclaw[dev]
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Quick Start
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from langchain_proxyclaw import ProxyClawTool
|
|
67
|
+
from langchain.agents import AgentType, initialize_agent
|
|
68
|
+
from langchain_openai import ChatOpenAI
|
|
69
|
+
|
|
70
|
+
# Initialize the tool
|
|
71
|
+
tool = ProxyClawTool(api_key="your_api_key")
|
|
72
|
+
|
|
73
|
+
# Use with an agent
|
|
74
|
+
llm = ChatOpenAI(temperature=0)
|
|
75
|
+
agent = initialize_agent(
|
|
76
|
+
[tool],
|
|
77
|
+
llm,
|
|
78
|
+
agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
|
|
79
|
+
verbose=True
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
# Ask the agent to scrape a website
|
|
83
|
+
agent.run("Get the content from https://example.com using a US proxy")
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Tools
|
|
87
|
+
|
|
88
|
+
### 1. ProxyClawTool
|
|
89
|
+
|
|
90
|
+
Basic HTTP requests through ProxyClaw proxies.
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from langchain_proxyclaw import ProxyClawTool
|
|
94
|
+
|
|
95
|
+
tool = ProxyClawTool(api_key="your_api_key")
|
|
96
|
+
|
|
97
|
+
# Simple GET request
|
|
98
|
+
result = tool.invoke({
|
|
99
|
+
"url": "https://example.com",
|
|
100
|
+
"country": "US"
|
|
101
|
+
})
|
|
102
|
+
|
|
103
|
+
# POST request with data
|
|
104
|
+
result = tool.invoke({
|
|
105
|
+
"url": "https://api.example.com/data",
|
|
106
|
+
"method": "POST",
|
|
107
|
+
"data": {"key": "value"},
|
|
108
|
+
"headers": {"Authorization": "Bearer token"},
|
|
109
|
+
"country": "GB"
|
|
110
|
+
})
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
**Parameters:**
|
|
114
|
+
- `url` (str, required): Target URL
|
|
115
|
+
- `method` (str): HTTP method (GET, POST, PUT, DELETE) - default: GET
|
|
116
|
+
- `headers` (dict): Optional HTTP headers
|
|
117
|
+
- `data` (str/dict): Request body data
|
|
118
|
+
- `country` (str): Country code for proxy location (e.g., "US", "GB", "DE")
|
|
119
|
+
- `session_id` (str): Session ID for sticky sessions
|
|
120
|
+
|
|
121
|
+
### 2. ProxyClawSessionTool
|
|
122
|
+
|
|
123
|
+
Sticky proxy sessions for multi-step workflows.
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from langchain_proxyclaw import ProxyClawSessionTool
|
|
127
|
+
|
|
128
|
+
tool = ProxyClawSessionTool(api_key="your_api_key")
|
|
129
|
+
|
|
130
|
+
# Scrape multiple pages with the same IP
|
|
131
|
+
result = tool.invoke({
|
|
132
|
+
"urls": [
|
|
133
|
+
"https://site.com/login",
|
|
134
|
+
"https://site.com/dashboard",
|
|
135
|
+
"https://site.com/profile"
|
|
136
|
+
],
|
|
137
|
+
"country": "US"
|
|
138
|
+
})
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
**Parameters:**
|
|
142
|
+
- `urls` (list[str], required): List of URLs to fetch
|
|
143
|
+
- `country` (str): Country code
|
|
144
|
+
- `session_lifetime` (int): Session duration in minutes - default: 30
|
|
145
|
+
|
|
146
|
+
### 3. ProxyClawScraperTool
|
|
147
|
+
|
|
148
|
+
Advanced scraper with retries and auto-rotation.
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from langchain_proxyclaw import ProxyClawScraperTool
|
|
152
|
+
|
|
153
|
+
tool = ProxyClawScraperTool(api_key="your_api_key")
|
|
154
|
+
|
|
155
|
+
# Scrape with retries and link extraction
|
|
156
|
+
result = tool.invoke({
|
|
157
|
+
"url": "https://example.com/products",
|
|
158
|
+
"country": "US",
|
|
159
|
+
"retries": 3,
|
|
160
|
+
"timeout": 30,
|
|
161
|
+
"extract_links": True
|
|
162
|
+
})
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
**Parameters:**
|
|
166
|
+
- `url` (str, required): Target URL
|
|
167
|
+
- `country` (str): Country code
|
|
168
|
+
- `retries` (int): Number of retry attempts - default: 3
|
|
169
|
+
- `timeout` (int): Request timeout in seconds - default: 30
|
|
170
|
+
- `extract_links` (bool): Extract all links from the page - default: False
|
|
171
|
+
|
|
172
|
+
## Authentication
|
|
173
|
+
|
|
174
|
+
Get your API key from [ProxyClaw Dashboard](https://platform.proxyclaw.ai):
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
api_key = "pk_live_xxxxxxxxxxxxxxxx"
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Country Codes
|
|
181
|
+
|
|
182
|
+
Common country codes:
|
|
183
|
+
- `US` - United States
|
|
184
|
+
- `GB` - United Kingdom
|
|
185
|
+
- `DE` - Germany
|
|
186
|
+
- `FR` - France
|
|
187
|
+
- `JP` - Japan
|
|
188
|
+
- `IN` - India
|
|
189
|
+
- `BR` - Brazil
|
|
190
|
+
|
|
191
|
+
Full list: 111+ countries supported. Use ISO 3166-1 alpha-2 codes.
|
|
192
|
+
|
|
193
|
+
## Use Cases
|
|
194
|
+
|
|
195
|
+
### Web Scraping for AI Agents
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
from langchain_proxyclaw import ProxyClawScraperTool
|
|
199
|
+
from langchain.agents import AgentType, initialize_agent
|
|
200
|
+
|
|
201
|
+
scraper = ProxyClawScraperTool(api_key="your_api_key")
|
|
202
|
+
|
|
203
|
+
agent = initialize_agent(
|
|
204
|
+
[scraper],
|
|
205
|
+
llm,
|
|
206
|
+
agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
# Agent can now scrape without being blocked
|
|
210
|
+
agent.run("Find pricing information from https://competitor.com/pricing")
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
### E-commerce Monitoring
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
session_tool = ProxyClawSessionTool(api_key="your_api_key")
|
|
217
|
+
|
|
218
|
+
# Same IP for login + price check
|
|
219
|
+
result = session_tool.invoke({
|
|
220
|
+
"urls": [
|
|
221
|
+
"https://shop.com/login",
|
|
222
|
+
"https://shop.com/product/123"
|
|
223
|
+
],
|
|
224
|
+
"country": "US"
|
|
225
|
+
})
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
### Market Research
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
from langchain_proxyclaw import ProxyClawTool
|
|
232
|
+
|
|
233
|
+
tool = ProxyClawTool(api_key="your_api_key")
|
|
234
|
+
|
|
235
|
+
# Check how a site appears from different countries
|
|
236
|
+
for country in ["US", "GB", "DE", "JP"]:
|
|
237
|
+
result = tool.invoke({
|
|
238
|
+
"url": "https://global-site.com",
|
|
239
|
+
"country": country
|
|
240
|
+
})
|
|
241
|
+
print(f"{country}: {result}")
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
## LangChain Integration Tests
|
|
245
|
+
|
|
246
|
+
To run LangChain's standard integration tests:
|
|
247
|
+
|
|
248
|
+
```bash
|
|
249
|
+
pip install langchain-proxyclaw[dev]
|
|
250
|
+
pytest tests/ -v
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
## Documentation
|
|
254
|
+
|
|
255
|
+
Full documentation: [docs.proxyclaw.ai](https://docs.proxyclaw.ai)
|
|
256
|
+
|
|
257
|
+
LangChain docs: [python.langchain.com](https://python.langchain.com)
|
|
258
|
+
|
|
259
|
+
## API Reference
|
|
260
|
+
|
|
261
|
+
See [ProxyClaw API Docs](https://docs.proxyclaw.ai/api) for details on:
|
|
262
|
+
- Authentication formats
|
|
263
|
+
- Country targeting
|
|
264
|
+
- Session management
|
|
265
|
+
- Bandwidth tracking
|
|
266
|
+
|
|
267
|
+
## Pricing
|
|
268
|
+
|
|
269
|
+
ProxyClaw uses pay-as-you-go pricing:
|
|
270
|
+
- $0.35/GB under 10TB
|
|
271
|
+
- $0.25/GB over 10TB
|
|
272
|
+
|
|
273
|
+
No minimums, no commitments. [Sign up](https://proxyclaw.ai)
|
|
274
|
+
|
|
275
|
+
## Support
|
|
276
|
+
|
|
277
|
+
- 📧 Email: support@iploop.io
|
|
278
|
+
- 💬 Discord: [discord.gg/clawd](https://discord.gg/clawd)
|
|
279
|
+
- 📖 Docs: [docs.proxyclaw.ai](https://docs.proxyclaw.ai)
|
|
280
|
+
|
|
281
|
+
## License
|
|
282
|
+
|
|
283
|
+
MIT License - see [LICENSE](LICENSE) file.
|
|
284
|
+
|
|
285
|
+
## Contributing
|
|
286
|
+
|
|
287
|
+
Contributions welcome! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
288
|
+
|
|
289
|
+
---
|
|
290
|
+
|
|
291
|
+
Built with ❤️ by the [IPLoop](https://iploop.io) team.
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
# LangChain ProxyClaw Integration
|
|
2
|
+
|
|
3
|
+
[](https://badge.fury.io/py/langchain-proxyclaw)
|
|
4
|
+
[](https://opensource.org/licenses/MIT)
|
|
5
|
+
|
|
6
|
+
Official LangChain integration for [ProxyClaw](https://proxyclaw.ai) - a residential proxy network with 10M+ IPs across 111+ countries.
|
|
7
|
+
|
|
8
|
+
## Overview
|
|
9
|
+
|
|
10
|
+
This package provides LangChain Tools for routing HTTP requests through ProxyClaw's residential proxy network, enabling AI agents to scrape websites with:
|
|
11
|
+
|
|
12
|
+
- 🌍 **Global IP coverage** - 111+ countries
|
|
13
|
+
- 🔄 **Automatic rotation** - Fresh IPs per request or sticky sessions
|
|
14
|
+
- 🛡️ **Anti-detection** - Built-in fingerprint spoofing
|
|
15
|
+
- ⚡ **High success rate** - Residential IPs bypass most blocks
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install langchain-proxyclaw
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
For development:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install langchain-proxyclaw[dev]
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from langchain_proxyclaw import ProxyClawTool
|
|
33
|
+
from langchain.agents import AgentType, initialize_agent
|
|
34
|
+
from langchain_openai import ChatOpenAI
|
|
35
|
+
|
|
36
|
+
# Initialize the tool
|
|
37
|
+
tool = ProxyClawTool(api_key="your_api_key")
|
|
38
|
+
|
|
39
|
+
# Use with an agent
|
|
40
|
+
llm = ChatOpenAI(temperature=0)
|
|
41
|
+
agent = initialize_agent(
|
|
42
|
+
[tool],
|
|
43
|
+
llm,
|
|
44
|
+
agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
|
|
45
|
+
verbose=True
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
# Ask the agent to scrape a website
|
|
49
|
+
agent.run("Get the content from https://example.com using a US proxy")
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Tools
|
|
53
|
+
|
|
54
|
+
### 1. ProxyClawTool
|
|
55
|
+
|
|
56
|
+
Basic HTTP requests through ProxyClaw proxies.
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from langchain_proxyclaw import ProxyClawTool
|
|
60
|
+
|
|
61
|
+
tool = ProxyClawTool(api_key="your_api_key")
|
|
62
|
+
|
|
63
|
+
# Simple GET request
|
|
64
|
+
result = tool.invoke({
|
|
65
|
+
"url": "https://example.com",
|
|
66
|
+
"country": "US"
|
|
67
|
+
})
|
|
68
|
+
|
|
69
|
+
# POST request with data
|
|
70
|
+
result = tool.invoke({
|
|
71
|
+
"url": "https://api.example.com/data",
|
|
72
|
+
"method": "POST",
|
|
73
|
+
"data": {"key": "value"},
|
|
74
|
+
"headers": {"Authorization": "Bearer token"},
|
|
75
|
+
"country": "GB"
|
|
76
|
+
})
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**Parameters:**
|
|
80
|
+
- `url` (str, required): Target URL
|
|
81
|
+
- `method` (str): HTTP method (GET, POST, PUT, DELETE) - default: GET
|
|
82
|
+
- `headers` (dict): Optional HTTP headers
|
|
83
|
+
- `data` (str/dict): Request body data
|
|
84
|
+
- `country` (str): Country code for proxy location (e.g., "US", "GB", "DE")
|
|
85
|
+
- `session_id` (str): Session ID for sticky sessions
|
|
86
|
+
|
|
87
|
+
### 2. ProxyClawSessionTool
|
|
88
|
+
|
|
89
|
+
Sticky proxy sessions for multi-step workflows.
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
from langchain_proxyclaw import ProxyClawSessionTool
|
|
93
|
+
|
|
94
|
+
tool = ProxyClawSessionTool(api_key="your_api_key")
|
|
95
|
+
|
|
96
|
+
# Scrape multiple pages with the same IP
|
|
97
|
+
result = tool.invoke({
|
|
98
|
+
"urls": [
|
|
99
|
+
"https://site.com/login",
|
|
100
|
+
"https://site.com/dashboard",
|
|
101
|
+
"https://site.com/profile"
|
|
102
|
+
],
|
|
103
|
+
"country": "US"
|
|
104
|
+
})
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
**Parameters:**
|
|
108
|
+
- `urls` (list[str], required): List of URLs to fetch
|
|
109
|
+
- `country` (str): Country code
|
|
110
|
+
- `session_lifetime` (int): Session duration in minutes - default: 30
|
|
111
|
+
|
|
112
|
+
### 3. ProxyClawScraperTool
|
|
113
|
+
|
|
114
|
+
Advanced scraper with retries and auto-rotation.
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from langchain_proxyclaw import ProxyClawScraperTool
|
|
118
|
+
|
|
119
|
+
tool = ProxyClawScraperTool(api_key="your_api_key")
|
|
120
|
+
|
|
121
|
+
# Scrape with retries and link extraction
|
|
122
|
+
result = tool.invoke({
|
|
123
|
+
"url": "https://example.com/products",
|
|
124
|
+
"country": "US",
|
|
125
|
+
"retries": 3,
|
|
126
|
+
"timeout": 30,
|
|
127
|
+
"extract_links": True
|
|
128
|
+
})
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
**Parameters:**
|
|
132
|
+
- `url` (str, required): Target URL
|
|
133
|
+
- `country` (str): Country code
|
|
134
|
+
- `retries` (int): Number of retry attempts - default: 3
|
|
135
|
+
- `timeout` (int): Request timeout in seconds - default: 30
|
|
136
|
+
- `extract_links` (bool): Extract all links from the page - default: False
|
|
137
|
+
|
|
138
|
+
## Authentication
|
|
139
|
+
|
|
140
|
+
Get your API key from [ProxyClaw Dashboard](https://platform.proxyclaw.ai):
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
api_key = "pk_live_xxxxxxxxxxxxxxxx"
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## Country Codes
|
|
147
|
+
|
|
148
|
+
Common country codes:
|
|
149
|
+
- `US` - United States
|
|
150
|
+
- `GB` - United Kingdom
|
|
151
|
+
- `DE` - Germany
|
|
152
|
+
- `FR` - France
|
|
153
|
+
- `JP` - Japan
|
|
154
|
+
- `IN` - India
|
|
155
|
+
- `BR` - Brazil
|
|
156
|
+
|
|
157
|
+
Full list: 111+ countries supported. Use ISO 3166-1 alpha-2 codes.
|
|
158
|
+
|
|
159
|
+
## Use Cases
|
|
160
|
+
|
|
161
|
+
### Web Scraping for AI Agents
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from langchain_proxyclaw import ProxyClawScraperTool
|
|
165
|
+
from langchain.agents import AgentType, initialize_agent
|
|
166
|
+
|
|
167
|
+
scraper = ProxyClawScraperTool(api_key="your_api_key")
|
|
168
|
+
|
|
169
|
+
agent = initialize_agent(
|
|
170
|
+
[scraper],
|
|
171
|
+
llm,
|
|
172
|
+
agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
# Agent can now scrape without being blocked
|
|
176
|
+
agent.run("Find pricing information from https://competitor.com/pricing")
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### E-commerce Monitoring
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
session_tool = ProxyClawSessionTool(api_key="your_api_key")
|
|
183
|
+
|
|
184
|
+
# Same IP for login + price check
|
|
185
|
+
result = session_tool.invoke({
|
|
186
|
+
"urls": [
|
|
187
|
+
"https://shop.com/login",
|
|
188
|
+
"https://shop.com/product/123"
|
|
189
|
+
],
|
|
190
|
+
"country": "US"
|
|
191
|
+
})
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
### Market Research
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
from langchain_proxyclaw import ProxyClawTool
|
|
198
|
+
|
|
199
|
+
tool = ProxyClawTool(api_key="your_api_key")
|
|
200
|
+
|
|
201
|
+
# Check how a site appears from different countries
|
|
202
|
+
for country in ["US", "GB", "DE", "JP"]:
|
|
203
|
+
result = tool.invoke({
|
|
204
|
+
"url": "https://global-site.com",
|
|
205
|
+
"country": country
|
|
206
|
+
})
|
|
207
|
+
print(f"{country}: {result}")
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## LangChain Integration Tests
|
|
211
|
+
|
|
212
|
+
To run LangChain's standard integration tests:
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
pip install langchain-proxyclaw[dev]
|
|
216
|
+
pytest tests/ -v
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## Documentation
|
|
220
|
+
|
|
221
|
+
Full documentation: [docs.proxyclaw.ai](https://docs.proxyclaw.ai)
|
|
222
|
+
|
|
223
|
+
LangChain docs: [python.langchain.com](https://python.langchain.com)
|
|
224
|
+
|
|
225
|
+
## API Reference
|
|
226
|
+
|
|
227
|
+
See [ProxyClaw API Docs](https://docs.proxyclaw.ai/api) for details on:
|
|
228
|
+
- Authentication formats
|
|
229
|
+
- Country targeting
|
|
230
|
+
- Session management
|
|
231
|
+
- Bandwidth tracking
|
|
232
|
+
|
|
233
|
+
## Pricing
|
|
234
|
+
|
|
235
|
+
ProxyClaw uses pay-as-you-go pricing:
|
|
236
|
+
- $0.35/GB under 10TB
|
|
237
|
+
- $0.25/GB over 10TB
|
|
238
|
+
|
|
239
|
+
No minimums, no commitments. [Sign up](https://proxyclaw.ai)
|
|
240
|
+
|
|
241
|
+
## Support
|
|
242
|
+
|
|
243
|
+
- 📧 Email: support@iploop.io
|
|
244
|
+
- 💬 Discord: [discord.gg/clawd](https://discord.gg/clawd)
|
|
245
|
+
- 📖 Docs: [docs.proxyclaw.ai](https://docs.proxyclaw.ai)
|
|
246
|
+
|
|
247
|
+
## License
|
|
248
|
+
|
|
249
|
+
MIT License - see [LICENSE](LICENSE) file.
|
|
250
|
+
|
|
251
|
+
## Contributing
|
|
252
|
+
|
|
253
|
+
Contributions welcome! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
Built with ❤️ by the [IPLoop](https://iploop.io) team.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""LangChain integration for ProxyClaw residential proxy network."""
|
|
2
|
+
|
|
3
|
+
from langchain_proxyclaw.tools import (
|
|
4
|
+
ProxyClawTool,
|
|
5
|
+
ProxyClawSessionTool,
|
|
6
|
+
ProxyClawScraperTool,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
__version__ = "0.1.0"
|
|
10
|
+
__all__ = [
|
|
11
|
+
"ProxyClawTool",
|
|
12
|
+
"ProxyClawSessionTool",
|
|
13
|
+
"ProxyClawScraperTool",
|
|
14
|
+
]
|
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
"""LangChain Tools for ProxyClaw proxy network."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List, Optional, Type, Union
|
|
4
|
+
from urllib.parse import urljoin, urlparse
|
|
5
|
+
|
|
6
|
+
from langchain_core.callbacks import CallbackManagerForToolRun
|
|
7
|
+
from langchain_core.tools import BaseTool
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
from iploop import IPLoop
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ProxyClawRequestInput(BaseModel):
    """Input schema for a single ProxyClaw HTTP request.

    Mirrors the keyword arguments accepted by ``ProxyClawTool._run``.
    """

    # NOTE(review): the README documents a `session_id` parameter for
    # ProxyClawTool, but this schema (and `_run`) does not accept it —
    # confirm whether docs or schema should change.
    url: str = Field(..., description="URL to fetch")
    method: str = Field(default="GET", description="HTTP method (GET, POST, PUT, DELETE)")
    headers: Optional[Dict[str, str]] = Field(default=None, description="Optional HTTP headers")
    data: Optional[Union[Dict[str, Any], str]] = Field(default=None, description="Request body data")
    country: Optional[str] = Field(default=None, description="Country code for proxy location (e.g., 'US', 'GB')")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ProxyClawSessionInput(BaseModel):
    """Input schema for a ProxyClaw sticky session (multiple URLs, one IP).

    Mirrors the keyword arguments accepted by ``ProxyClawSessionTool._run``.
    """

    # NOTE(review): the README documents a `session_lifetime` parameter
    # (default 30 minutes) that this schema does not define — confirm
    # whether docs or schema should change.
    urls: List[str] = Field(..., description="List of URLs to fetch in the same session")
    country: Optional[str] = Field(default=None, description="Country code for proxy location")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ProxyClawScraperInput(BaseModel):
    """Input schema for the retrying ProxyClaw web scraper.

    Mirrors the keyword arguments accepted by ``ProxyClawScraperTool._run``.
    """

    url: str = Field(..., description="URL to scrape")
    country: Optional[str] = Field(default=None, description="Country code for proxy location")
    retries: int = Field(default=3, description="Number of retry attempts")
    timeout: int = Field(default=30, description="Request timeout in seconds")
    extract_links: bool = Field(default=False, description="Whether to extract links from the page")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ProxyClawTool(BaseTool):
    """Make a single HTTP request through ProxyClaw residential proxies.

    Routes the request via the ``iploop`` SDK client, with optional
    per-request country targeting, custom headers and a request body.

    Example:
        .. code-block:: python

            from langchain_proxyclaw import ProxyClawTool

            tool = ProxyClawTool(api_key="your_api_key")
            result = tool.invoke({
                "url": "https://example.com",
                "country": "US"
            })
    """

    name: str = "proxyclaw_request"
    description: str = (
        "Make HTTP requests through ProxyClaw residential proxies. "
        "Provides access to millions of residential IPs with anti-detection. "
        "Input: URL, optional country code, headers. "
        "Output: Response content, status code, headers."
    )
    args_schema: Type[BaseModel] = ProxyClawRequestInput

    # Underlying iploop client plus the key kept around so a
    # country-targeted client can be built per request.
    client: Any = Field(default=None, exclude=True)
    api_key: Optional[str] = Field(default=None, exclude=True)

    def __init__(self, api_key: Optional[str] = None, **kwargs: Any):
        """Initialize ProxyClawTool.

        Args:
            api_key: ProxyClaw API key. When omitted, the tool reports an
                error string at call time instead of raising here.
            **kwargs: Additional arguments passed to BaseTool.
        """
        super().__init__(**kwargs)
        self.api_key = api_key
        self.client = IPLoop(api_key=api_key) if api_key else None

    def _run(
        self,
        url: str,
        method: str = "GET",
        headers: Optional[Dict[str, str]] = None,
        data: Optional[Union[Dict[str, Any], str]] = None,
        country: Optional[str] = None,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Execute one HTTP request through ProxyClaw.

        Args:
            url: Target URL.
            method: HTTP verb — GET, POST, PUT or DELETE (case-insensitive).
            headers: Optional request headers.
            data: Request body, used for POST and PUT.
            country: Optional country code; builds a country-targeted client.
            run_manager: LangChain callback manager (unused).

        Returns:
            ``str(dict)`` with status_code, url, content (truncated to 10000
            characters) and success — or an ``"Error: ..."`` string when the
            key is missing, the method is unsupported, or the request raises.
        """
        try:
            if self.client is None:
                return "Error: No API key provided"

            # Country targeting requires a dedicated client instance.
            active = (
                IPLoop(api_key=self.api_key, country=country)
                if country
                else self.client
            )

            verb = method.upper()
            if verb == "GET":
                response = active.get(url, headers=headers)
            elif verb == "POST":
                response = active.post(url, data=data, headers=headers)
            elif verb == "PUT":
                response = active.put(url, data=data, headers=headers)
            elif verb == "DELETE":
                response = active.delete(url, headers=headers)
            else:
                return f"Error: Unsupported method {method}"

            # Duck-type the SDK response: fall back to defaults when the
            # object does not look like an HTTP response.
            payload = {
                "status_code": response.status_code if hasattr(response, 'status_code') else 200,
                "url": url,
                "content": response.text[:10000] if hasattr(response, 'text') else str(response)[:10000],
                "success": True,
            }
            return str(payload)

        except Exception as e:
            return f"Error: {str(e)}"
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class ProxyClawSessionTool(BaseTool):
    """Fetch several URLs through ProxyClaw within one session context.

    Useful for multi-step flows (login, then dashboard) where the site
    expects consecutive requests from a consistent origin.

    Example:
        .. code-block:: python

            tool = ProxyClawSessionTool(api_key="your_api_key")
            result = tool.invoke({
                "urls": ["https://site.com/page1", "https://site.com/page2"],
                "country": "GB"
            })
    """

    name: str = "proxyclaw_session"
    description: str = (
        "Create a proxy session for multiple requests. "
        "Useful for sites requiring consistent IP (login, checkout, etc.). "
        "Input: List of URLs, country. "
        "Output: List of responses."
    )
    args_schema: Type[BaseModel] = ProxyClawSessionInput

    # Underlying iploop client and the key used for country-targeted clients.
    client: Any = Field(default=None, exclude=True)
    api_key: Optional[str] = Field(default=None, exclude=True)

    def __init__(self, api_key: Optional[str] = None, **kwargs: Any):
        """Initialize the tool; without an api_key, calls return an error string."""
        super().__init__(**kwargs)
        self.api_key = api_key
        self.client = IPLoop(api_key=api_key) if api_key else None

    def _run(
        self,
        urls: List[str],
        country: Optional[str] = None,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Fetch each URL with the same client and summarize the outcomes.

        Args:
            urls: URLs to fetch, in order.
            country: Optional country code for a country-targeted client.
            run_manager: LangChain callback manager (unused).

        Returns:
            ``str(dict)`` with per-URL results, the total count, and how
            many succeeded. A failing URL records its error and the loop
            continues.
        """
        if self.client is None:
            return "Error: No API key provided"

        # Country targeting requires a dedicated client instance.
        active = (
            IPLoop(api_key=self.api_key, country=country)
            if country
            else self.client
        )

        outcomes = []
        for url in urls:
            try:
                response = active.get(url)
                outcomes.append({
                    "url": url,
                    "status": response.status_code if hasattr(response, 'status_code') else 200,
                    "success": True,
                    "preview": response.text[:500] if hasattr(response, 'text') else str(response)[:500],
                })
            except Exception as e:
                # Record the failure and keep going with the remaining URLs.
                outcomes.append({"url": url, "error": str(e)})

        summary = {
            "results": outcomes,
            "total": len(urls),
            "successful": sum(1 for r in outcomes if r.get("success")),
        }
        return str(summary)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
class ProxyClawScraperTool(BaseTool):
    """Advanced web scraper with retries.

    Automatically retries failed requests with different IPs
    and can extract links and structured data.

    Example:
        .. code-block:: python

            tool = ProxyClawScraperTool(api_key="your_api_key")
            result = tool.invoke({
                "url": "https://example.com/products",
                "country": "US",
                "retries": 3,
                "extract_links": True
            })
    """

    name: str = "proxyclaw_scraper"
    description: str = (
        "Advanced web scraper with automatic retries and IP rotation. "
        "Best for challenging sites with anti-bot protection. "
        "Input: URL, country, retries, timeout, extract_links. "
        "Output: Page content, extracted links, success status."
    )
    args_schema: Type[BaseModel] = ProxyClawScraperInput

    # Runtime-only state, excluded from the serialized tool schema.
    client: Any = Field(default=None, exclude=True)
    api_key: Optional[str] = Field(default=None, exclude=True)

    def __init__(self, api_key: Optional[str] = None, **kwargs: Any):
        """Set up the tool; a client is built only when a key is supplied."""
        super().__init__(**kwargs)
        self.api_key = api_key
        self.client = IPLoop(api_key=api_key) if api_key else None

    def _run(
        self,
        url: str,
        country: Optional[str] = None,
        retries: int = 3,
        timeout: int = 30,
        extract_links: bool = False,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Scrape URL with retries and optional link extraction.

        Args:
            url: Target URL
            country: Optional country code for geo-targeted requests
            retries: Number of attempts before giving up
            timeout: Request timeout in seconds (accepted for interface
                compatibility; not currently forwarded to the client)
            extract_links: Whether to extract links from the page
            run_manager: Callback manager (not used by this tool)

        Returns:
            Scraped data as a string-rendered dict
        """
        if self.client is None:
            return "Error: No API key provided"

        last_error = None

        for attempt in range(retries):
            try:
                # Create a geo-targeted client per attempt when requested;
                # otherwise reuse the default client.
                client = self.client
                if country:
                    client = IPLoop(api_key=self.api_key, country=country)

                response = client.get(url)

                content = response.text if hasattr(response, 'text') else str(response)
                result = {
                    "url": url,
                    "status_code": response.status_code if hasattr(response, 'status_code') else 200,
                    "success": True,
                    "attempts": attempt + 1,
                    "content_length": len(content),
                    "content_preview": content[:2000],
                }

                if extract_links:
                    try:
                        from bs4 import BeautifulSoup
                        soup = BeautifulSoup(content, 'html.parser')
                        links = [a.get('href') for a in soup.find_all('a', href=True)]
                        # urljoin resolves every relative form ("/abs",
                        # "page.html", "../up", "?q=1") against the page URL
                        # and leaves absolute URLs untouched; the previous
                        # startswith('/') check left plain relative links
                        # unresolved.
                        links = [urljoin(url, link) for link in links]
                        result["extracted_links"] = list(set(links))[:50]  # Limit links
                    except ImportError:
                        result["extracted_links"] = ["beautifulsoup4 not installed"]

                return str(result)

            except Exception as e:
                last_error = str(e)
                continue

        return str({
            "url": url,
            "success": False,
            "attempts": retries,
            "error": last_error,
        })
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Utility functions for langchain-proxyclaw."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def build_auth_string(
    base_auth: str,
    country: Optional[str] = None,
    session_id: Optional[str] = None,
    session_lifetime: Optional[int] = None,
    city: Optional[str] = None,
    rotation: bool = False,
) -> str:
    """Build ProxyClaw auth string with options.

    Args:
        base_auth: Base auth string (customer_id:api_key)
        country: Country code (e.g., 'US', 'GB')
        session_id: Session ID for sticky sessions
        session_lifetime: Session lifetime in minutes
        city: City name for geo-targeting
        rotation: Whether to enable IP rotation

    Returns:
        Formatted auth string
    """
    segments = [base_auth]

    if country:
        segments.append(f"country-{country}")

    if city:
        segments.append(f"city-{city}")

    if session_id:
        segments.append(f"session-{session_id}")
        if session_lifetime:
            segments.append(f"lifetime-{session_lifetime}")

    if rotation:
        segments.append("rotation-true")

    return "-".join(segments)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def validate_url(url: str) -> bool:
    """Validate URL format.

    Args:
        url: URL to validate

    Returns:
        True if *url* parses as an http(s) URL with a network location
    """
    from urllib.parse import urlparse

    try:
        parsed = urlparse(url)
    except Exception:
        return False
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def truncate_content(content: str, max_length: int = 10000) -> str:
    """Truncate content to max length with indicator.

    Args:
        content: Content to truncate
        max_length: Maximum length

    Returns:
        The content unchanged if short enough, otherwise the first
        *max_length* characters plus a truncation note.
    """
    if len(content) > max_length:
        note = f"\n... [truncated, total: {len(content)} chars]"
        return content[:max_length] + note
    return content
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "langchain-proxyclaw"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "LangChain integration for ProxyClaw - residential proxy network"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "IPLoop Team", email = "support@iploop.io"}
|
|
14
|
+
]
|
|
15
|
+
keywords = ["langchain", "proxy", "scraping", "residential", "proxyclaw", "iploop"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.9",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
26
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
27
|
+
]
|
|
28
|
+
dependencies = [
|
|
29
|
+
"langchain-core>=0.1.0",
|
|
30
|
+
"iploop-sdk>=1.8.0",
|
|
31
|
+
"pydantic>=2.0.0",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
dev = [
|
|
36
|
+
"pytest>=7.0.0",
|
|
37
|
+
"pytest-asyncio>=0.21.0",
|
|
38
|
+
"black>=23.0.0",
|
|
39
|
+
"mypy>=1.0.0",
|
|
40
|
+
"beautifulsoup4>=4.12.0",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
[project.urls]
|
|
44
|
+
Homepage = "https://proxyclaw.ai"
|
|
45
|
+
Documentation = "https://docs.proxyclaw.ai"
|
|
46
|
+
Repository = "https://github.com/iploop/langchain-proxyclaw"
|
|
47
|
+
"Bug Tracker" = "https://github.com/iploop/langchain-proxyclaw/issues"
|
|
48
|
+
|
|
49
|
+
[tool.hatch.build.targets.wheel]
|
|
50
|
+
packages = ["langchain_proxyclaw"]
|
|
51
|
+
|
|
52
|
+
[tool.black]
|
|
53
|
+
line-length = 100
|
|
54
|
+
target-version = ['py39']
|
|
55
|
+
|
|
56
|
+
[tool.mypy]
|
|
57
|
+
python_version = "3.9"
|
|
58
|
+
warn_return_any = true
|
|
59
|
+
warn_unused_configs = true
|
|
60
|
+
disallow_untyped_defs = true
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Tests for langchain-proxyclaw."""
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""Tests for ProxyClaw LangChain tools."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from unittest.mock import MagicMock, patch
|
|
5
|
+
|
|
6
|
+
from langchain_proxyclaw import ProxyClawTool, ProxyClawSessionTool, ProxyClawScraperTool
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class MockResponse:
    """Minimal stand-in for an HTTP response object."""

    def __init__(self, status_code=200, text="test content", url="https://example.com"):
        # Only the attributes the tools actually inspect are provided.
        self.url = url
        self.text = text
        self.status_code = status_code
        self.headers = {"Content-Type": "text/html"}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@pytest.fixture
def mock_iploop():
    """Yield a MagicMock that stands in for instances of the IPLoop class."""
    with patch("langchain_proxyclaw.tools.IPLoop") as patched_cls:
        fake_client = MagicMock()
        patched_cls.return_value = fake_client
        yield fake_client
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class TestProxyClawTool:
    """Tests for ProxyClawTool."""

    def test_initialization(self):
        """Test tool initialization."""
        tool = ProxyClawTool(api_key="test_key")
        assert tool.name == "proxyclaw_request"
        assert tool.api_key == "test_key"

    def test_basic_request(self, mock_iploop):
        """Test basic HTTP request."""
        mock_iploop.get.return_value = MockResponse()

        tool = ProxyClawTool(api_key="test_key")
        result = tool._run(url="https://example.com")

        assert "status_code" in result
        assert "200" in result or "success" in result
        mock_iploop.get.assert_called_once()

    def test_request_with_country(self, mock_iploop):
        """Test request with country parameter."""
        mock_iploop.get.return_value = MockResponse()

        tool = ProxyClawTool(api_key="test_key")
        result = tool._run(url="https://example.com", country="US")

        # Should succeed - country param is processed
        assert "status_code" in result or "success" in result

    def test_post_request(self, mock_iploop):
        """Test POST request."""
        mock_iploop.post.return_value = MockResponse()

        tool = ProxyClawTool(api_key="test_key")
        result = tool._run(
            url="https://example.com/api",
            method="POST",
            data={"key": "value"}
        )

        mock_iploop.post.assert_called_once()
        # `result` was previously computed but never checked; assert the
        # POST path also reports a response summary.
        assert "status_code" in result or "success" in result
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class TestProxyClawSessionTool:
    """Tests for ProxyClawSessionTool."""

    def test_initialization(self):
        """The tool exposes the expected name."""
        session_tool = ProxyClawSessionTool(api_key="test_key")
        assert session_tool.name == "proxyclaw_session"

    def test_session_requests(self, mock_iploop):
        """Each URL in the batch triggers exactly one GET and results are reported."""
        mock_iploop.get.return_value = MockResponse()

        session_tool = ProxyClawSessionTool(api_key="test_key")
        output = session_tool._run(
            urls=["https://example.com/1", "https://example.com/2"],
            country="GB"
        )

        assert mock_iploop.get.call_count == 2
        assert "results" in output
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class TestProxyClawScraperTool:
    """Tests for ProxyClawScraperTool."""

    def test_initialization(self):
        """The tool exposes the expected name."""
        scraper = ProxyClawScraperTool(api_key="test_key")
        assert scraper.name == "proxyclaw_scraper"

    def test_successful_scrape(self, mock_iploop):
        """A successful fetch is reported with success=True."""
        html = '<html><body><a href="/page2">Link</a></body></html>'
        mock_iploop.get.return_value = MockResponse(text=html)

        scraper = ProxyClawScraperTool(api_key="test_key")
        output = scraper._run(url="https://example.com")

        assert "success" in output
        assert "True" in output

    def test_scrape_with_retries(self, mock_iploop):
        """A failed first attempt is retried and can still succeed."""
        # First call fails, second succeeds
        mock_iploop.get.side_effect = [
            Exception("Connection error"),
            MockResponse(),
        ]

        scraper = ProxyClawScraperTool(api_key="test_key")
        scraper._run(url="https://example.com", retries=2)

        assert mock_iploop.get.call_count == 2
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class TestToolSchema:
    """Tests for tool input schemas."""

    def test_proxyclaw_tool_schema(self):
        """ProxyClawTool's schema declares url/method/country inputs."""
        fields = ProxyClawTool().args_schema.model_fields
        for expected in ("url", "method", "country"):
            assert expected in fields

    def test_session_tool_schema(self):
        """ProxyClawSessionTool's schema declares a urls input."""
        fields = ProxyClawSessionTool().args_schema.model_fields
        assert "urls" in fields

    def test_scraper_tool_schema(self):
        """ProxyClawScraperTool's schema declares url/retries/extract_links inputs."""
        fields = ProxyClawScraperTool().args_schema.model_fields
        for expected in ("url", "retries", "extract_links"):
            assert expected in fields
|