webcrawlerapi 2.0.4__tar.gz → 2.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.6}/PKG-INFO +3 -3
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.6}/README.md +2 -2
- webcrawlerapi-2.0.6/pyproject.toml +79 -0
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.6}/setup.py +2 -2
- webcrawlerapi-2.0.6/tests/__init__.py +0 -0
- webcrawlerapi-2.0.6/tests/test_client.py +467 -0
- webcrawlerapi-2.0.6/tests/test_models.py +435 -0
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.6}/webcrawlerapi/__init__.py +5 -5
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.6}/webcrawlerapi/client.py +81 -72
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.6}/webcrawlerapi/models.py +75 -34
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.6}/webcrawlerapi.egg-info/PKG-INFO +3 -3
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.6}/webcrawlerapi.egg-info/SOURCES.txt +4 -0
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.6}/webcrawlerapi.egg-info/top_level.txt +1 -0
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.6}/setup.cfg +0 -0
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.6}/webcrawlerapi.egg-info/dependency_links.txt +0 -0
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.6}/webcrawlerapi.egg-info/requires.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: webcrawlerapi
|
|
3
|
-
Version: 2.0.4
|
|
3
|
+
Version: 2.0.6
|
|
4
4
|
Summary: Python SDK for WebCrawler API
|
|
5
5
|
Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
|
|
6
6
|
Author: Andrew
|
|
@@ -101,11 +101,11 @@ print(f"Cancellation response: {cancel_response['message']}")
|
|
|
101
101
|
```
|
|
102
102
|
|
|
103
103
|
### Scraping
|
|
104
|
-
|
|
104
|
+
Check a working code example of [scraping](https://github.com/WebCrawlerAPI/webcrawlerapi-examples/tree/master/python/scraping) and [scraping with a prompt](https://github.com/WebCrawlerAPI/webcrawlerapi-examples/tree/master/python/scraping_prompt)
|
|
105
105
|
```python
|
|
106
106
|
# Returns structured data directly
|
|
107
107
|
response = crawler.scrape(
|
|
108
|
-
|
|
108
|
+
url="https://webcrawlerapi.com"
|
|
109
109
|
)
|
|
110
110
|
if response.success:
|
|
111
111
|
print(response.markdown)
|
|
@@ -80,11 +80,11 @@ print(f"Cancellation response: {cancel_response['message']}")
|
|
|
80
80
|
```
|
|
81
81
|
|
|
82
82
|
### Scraping
|
|
83
|
-
|
|
83
|
+
Check a working code example of [scraping](https://github.com/WebCrawlerAPI/webcrawlerapi-examples/tree/master/python/scraping) and [scraping with a prompt](https://github.com/WebCrawlerAPI/webcrawlerapi-examples/tree/master/python/scraping_prompt)
|
|
84
84
|
```python
|
|
85
85
|
# Returns structured data directly
|
|
86
86
|
response = crawler.scrape(
|
|
87
|
-
|
|
87
|
+
url="https://webcrawlerapi.com"
|
|
88
88
|
)
|
|
89
89
|
if response.success:
|
|
90
90
|
print(response.markdown)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
[tool.black]
|
|
2
|
+
line-length = 88
|
|
3
|
+
target-version = ['py37']
|
|
4
|
+
include = '\.pyi?$'
|
|
5
|
+
extend-exclude = '''
|
|
6
|
+
/(
|
|
7
|
+
# directories
|
|
8
|
+
\.eggs
|
|
9
|
+
| \.git
|
|
10
|
+
| \.hg
|
|
11
|
+
| \.mypy_cache
|
|
12
|
+
| \.tox
|
|
13
|
+
| \.venv
|
|
14
|
+
| _build
|
|
15
|
+
| buck-out
|
|
16
|
+
| build
|
|
17
|
+
| dist
|
|
18
|
+
)/
|
|
19
|
+
'''
|
|
20
|
+
|
|
21
|
+
[tool.isort]
|
|
22
|
+
profile = "black"
|
|
23
|
+
multi_line_output = 3
|
|
24
|
+
line_length = 88
|
|
25
|
+
known_first_party = ["webcrawlerapi"]
|
|
26
|
+
known_third_party = ["requests", "pytest", "responses"]
|
|
27
|
+
|
|
28
|
+
[tool.mypy]
|
|
29
|
+
python_version = "3.7"
|
|
30
|
+
warn_return_any = true
|
|
31
|
+
warn_unused_configs = true
|
|
32
|
+
disallow_untyped_defs = false
|
|
33
|
+
disallow_incomplete_defs = false
|
|
34
|
+
check_untyped_defs = true
|
|
35
|
+
disallow_untyped_decorators = false
|
|
36
|
+
no_implicit_optional = true
|
|
37
|
+
warn_redundant_casts = true
|
|
38
|
+
warn_unused_ignores = true
|
|
39
|
+
warn_no_return = true
|
|
40
|
+
warn_unreachable = true
|
|
41
|
+
strict_equality = true
|
|
42
|
+
|
|
43
|
+
[[tool.mypy.overrides]]
|
|
44
|
+
module = "tests.*"
|
|
45
|
+
ignore_errors = true
|
|
46
|
+
|
|
47
|
+
[tool.pytest.ini_options]
|
|
48
|
+
testpaths = ["tests"]
|
|
49
|
+
python_files = ["test_*.py"]
|
|
50
|
+
python_classes = ["Test*"]
|
|
51
|
+
python_functions = ["test_*"]
|
|
52
|
+
addopts = "-v --tb=short"
|
|
53
|
+
filterwarnings = [
|
|
54
|
+
"ignore::DeprecationWarning",
|
|
55
|
+
"ignore::PendingDeprecationWarning",
|
|
56
|
+
]
|
|
57
|
+
|
|
58
|
+
[tool.coverage.run]
|
|
59
|
+
source = ["webcrawlerapi"]
|
|
60
|
+
omit = [
|
|
61
|
+
"tests/*",
|
|
62
|
+
"venv/*",
|
|
63
|
+
"build/*",
|
|
64
|
+
"dist/*",
|
|
65
|
+
]
|
|
66
|
+
|
|
67
|
+
[tool.coverage.report]
|
|
68
|
+
exclude_lines = [
|
|
69
|
+
"pragma: no cover",
|
|
70
|
+
"def __repr__",
|
|
71
|
+
"if self.debug:",
|
|
72
|
+
"if settings.DEBUG",
|
|
73
|
+
"raise AssertionError",
|
|
74
|
+
"raise NotImplementedError",
|
|
75
|
+
"if 0:",
|
|
76
|
+
"if __name__ == .__main__.:",
|
|
77
|
+
"class .*\\bProtocol\\):",
|
|
78
|
+
"@(abc\\.)?abstractmethod",
|
|
79
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,467 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from unittest.mock import Mock, patch
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
import requests
|
|
6
|
+
import responses
|
|
7
|
+
|
|
8
|
+
from webcrawlerapi.client import WebCrawlerAPI
|
|
9
|
+
from webcrawlerapi.models import (
|
|
10
|
+
CrawlResponse,
|
|
11
|
+
Job,
|
|
12
|
+
JobItem,
|
|
13
|
+
ScrapeId,
|
|
14
|
+
ScrapeResponse,
|
|
15
|
+
ScrapeResponseError,
|
|
16
|
+
UploadS3Action,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class TestWebCrawlerAPI:
|
|
21
|
+
"""Test suite for WebCrawlerAPI client."""
|
|
22
|
+
|
|
23
|
+
@pytest.fixture
|
|
24
|
+
def client(self):
|
|
25
|
+
"""Create a WebCrawlerAPI client for testing."""
|
|
26
|
+
return WebCrawlerAPI(api_key="test-api-key", base_url="https://api.test.com")
|
|
27
|
+
|
|
28
|
+
@pytest.fixture
|
|
29
|
+
def mock_job_data(self):
|
|
30
|
+
"""Mock job data for testing."""
|
|
31
|
+
return {
|
|
32
|
+
"id": "job-123",
|
|
33
|
+
"org_id": "org-456",
|
|
34
|
+
"url": "https://example.com",
|
|
35
|
+
"status": "done",
|
|
36
|
+
"scrape_type": "markdown",
|
|
37
|
+
"items_limit": 10,
|
|
38
|
+
"allow_subdomains": False,
|
|
39
|
+
"created_at": "2023-01-01T12:00:00.000Z",
|
|
40
|
+
"updated_at": "2023-01-01T12:30:00.000Z",
|
|
41
|
+
"finished_at": "2023-01-01T12:30:00.000Z",
|
|
42
|
+
"recommended_pull_delay_ms": 5000,
|
|
43
|
+
"job_items": [
|
|
44
|
+
{
|
|
45
|
+
"id": "item-1",
|
|
46
|
+
"job_id": "job-123",
|
|
47
|
+
"original_url": "https://example.com/page1",
|
|
48
|
+
"page_status_code": 200,
|
|
49
|
+
"status": "done",
|
|
50
|
+
"title": "Page 1",
|
|
51
|
+
"created_at": "2023-01-01T12:00:00.000Z",
|
|
52
|
+
"updated_at": "2023-01-01T12:15:00.000Z",
|
|
53
|
+
"cost": 1,
|
|
54
|
+
"markdown_content_url": "https://storage.test.com/content1.md",
|
|
55
|
+
}
|
|
56
|
+
],
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
def test_client_initialization(self):
|
|
60
|
+
"""Test client initialization with API key and base URL."""
|
|
61
|
+
client = WebCrawlerAPI("test-key", "https://custom.api.com")
|
|
62
|
+
assert client.api_key == "test-key"
|
|
63
|
+
assert client.base_url == "https://custom.api.com"
|
|
64
|
+
assert client.session.headers["Authorization"] == "Bearer test-key"
|
|
65
|
+
assert client.session.headers["Content-Type"] == "application/json"
|
|
66
|
+
|
|
67
|
+
def test_client_initialization_default_url(self):
|
|
68
|
+
"""Test client initialization with default base URL."""
|
|
69
|
+
client = WebCrawlerAPI("test-key")
|
|
70
|
+
assert client.base_url == "https://api.webcrawlerapi.com"
|
|
71
|
+
|
|
72
|
+
@responses.activate
|
|
73
|
+
def test_crawl_async_success(self, client):
|
|
74
|
+
"""Test successful asynchronous crawl request."""
|
|
75
|
+
responses.add(
|
|
76
|
+
responses.POST,
|
|
77
|
+
"https://api.test.com/v1/crawl",
|
|
78
|
+
json={"id": "crawl-123"},
|
|
79
|
+
status=200,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
result = client.crawl_async(
|
|
83
|
+
url="https://example.com",
|
|
84
|
+
scrape_type="markdown",
|
|
85
|
+
items_limit=5,
|
|
86
|
+
allow_subdomains=True,
|
|
87
|
+
respect_robots_txt=True,
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
assert isinstance(result, CrawlResponse)
|
|
91
|
+
assert result.id == "crawl-123"
|
|
92
|
+
|
|
93
|
+
# Verify request payload
|
|
94
|
+
request = responses.calls[0].request
|
|
95
|
+
import json
|
|
96
|
+
|
|
97
|
+
payload = json.loads(request.body)
|
|
98
|
+
assert payload["url"] == "https://example.com"
|
|
99
|
+
assert payload["scrape_type"] == "markdown"
|
|
100
|
+
assert payload["items_limit"] == 5
|
|
101
|
+
assert payload["allow_subdomains"] is True
|
|
102
|
+
assert payload["respect_robots_txt"] is True
|
|
103
|
+
|
|
104
|
+
@responses.activate
|
|
105
|
+
def test_crawl_async_with_actions(self, client):
|
|
106
|
+
"""Test crawl_async with S3 upload action."""
|
|
107
|
+
responses.add(
|
|
108
|
+
responses.POST,
|
|
109
|
+
"https://api.test.com/v1/crawl",
|
|
110
|
+
json={"id": "crawl-456"},
|
|
111
|
+
status=200,
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
s3_action = UploadS3Action(
|
|
115
|
+
path="crawl-results/",
|
|
116
|
+
access_key_id="AKIAEXAMPLE",
|
|
117
|
+
secret_access_key="secret123",
|
|
118
|
+
bucket="my-bucket",
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
result = client.crawl_async(url="https://example.com", actions=[s3_action])
|
|
122
|
+
|
|
123
|
+
assert result.id == "crawl-456"
|
|
124
|
+
|
|
125
|
+
# Verify action in payload
|
|
126
|
+
request = responses.calls[0].request
|
|
127
|
+
import json
|
|
128
|
+
|
|
129
|
+
payload = json.loads(request.body)
|
|
130
|
+
assert "actions" in payload
|
|
131
|
+
assert len(payload["actions"]) == 1
|
|
132
|
+
assert payload["actions"][0]["type"] == "upload_s3"
|
|
133
|
+
assert payload["actions"][0]["bucket"] == "my-bucket"
|
|
134
|
+
|
|
135
|
+
@responses.activate
|
|
136
|
+
def test_crawl_async_http_error(self, client):
|
|
137
|
+
"""Test crawl_async with HTTP error response."""
|
|
138
|
+
responses.add(
|
|
139
|
+
responses.POST,
|
|
140
|
+
"https://api.test.com/v1/crawl",
|
|
141
|
+
json={"error": "Invalid URL"},
|
|
142
|
+
status=400,
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
with pytest.raises(requests.exceptions.HTTPError):
|
|
146
|
+
client.crawl_async(url="invalid-url")
|
|
147
|
+
|
|
148
|
+
@responses.activate
|
|
149
|
+
def test_get_job_success(self, client, mock_job_data):
|
|
150
|
+
"""Test successful job retrieval."""
|
|
151
|
+
responses.add(
|
|
152
|
+
responses.GET,
|
|
153
|
+
"https://api.test.com/v1/job/job-123",
|
|
154
|
+
json=mock_job_data,
|
|
155
|
+
status=200,
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
job = client.get_job("job-123")
|
|
159
|
+
|
|
160
|
+
assert isinstance(job, Job)
|
|
161
|
+
assert job.id == "job-123"
|
|
162
|
+
assert job.status == "done"
|
|
163
|
+
assert job.scrape_type == "markdown"
|
|
164
|
+
assert len(job.job_items) == 1
|
|
165
|
+
assert job.job_items[0].title == "Page 1"
|
|
166
|
+
|
|
167
|
+
@responses.activate
|
|
168
|
+
def test_cancel_job_success(self, client):
|
|
169
|
+
"""Test successful job cancellation."""
|
|
170
|
+
responses.add(
|
|
171
|
+
responses.PUT,
|
|
172
|
+
"https://api.test.com/v1/job/job-123/cancel",
|
|
173
|
+
json={"message": "Job cancelled successfully"},
|
|
174
|
+
status=200,
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
result = client.cancel_job("job-123")
|
|
178
|
+
assert result["message"] == "Job cancelled successfully"
|
|
179
|
+
|
|
180
|
+
@responses.activate
|
|
181
|
+
def test_crawl_with_polling_terminal_state(self, client, mock_job_data):
|
|
182
|
+
"""Test crawl method that polls until terminal state."""
|
|
183
|
+
# Mock crawl_async response
|
|
184
|
+
responses.add(
|
|
185
|
+
responses.POST,
|
|
186
|
+
"https://api.test.com/v1/crawl",
|
|
187
|
+
json={"id": "job-123"},
|
|
188
|
+
status=200,
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
# Mock get_job response with terminal state
|
|
192
|
+
responses.add(
|
|
193
|
+
responses.GET,
|
|
194
|
+
"https://api.test.com/v1/job/job-123",
|
|
195
|
+
json=mock_job_data,
|
|
196
|
+
status=200,
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
with patch("time.sleep") as mock_sleep:
|
|
200
|
+
job = client.crawl(url="https://example.com", max_polls=5)
|
|
201
|
+
|
|
202
|
+
assert isinstance(job, Job)
|
|
203
|
+
assert job.id == "job-123"
|
|
204
|
+
assert job.status == "done"
|
|
205
|
+
# Should not sleep since job is already in terminal state
|
|
206
|
+
mock_sleep.assert_not_called()
|
|
207
|
+
|
|
208
|
+
@responses.activate
|
|
209
|
+
def test_crawl_with_polling_max_polls_reached(self, client):
|
|
210
|
+
"""Test crawl method that reaches max_polls limit."""
|
|
211
|
+
# Mock crawl_async response
|
|
212
|
+
responses.add(
|
|
213
|
+
responses.POST,
|
|
214
|
+
"https://api.test.com/v1/crawl",
|
|
215
|
+
json={"id": "job-123"},
|
|
216
|
+
status=200,
|
|
217
|
+
)
|
|
218
|
+
|
|
219
|
+
# Mock get_job responses with non-terminal state
|
|
220
|
+
in_progress_data = {
|
|
221
|
+
"id": "job-123",
|
|
222
|
+
"org_id": "org-456",
|
|
223
|
+
"url": "https://example.com",
|
|
224
|
+
"status": "in_progress",
|
|
225
|
+
"scrape_type": "markdown",
|
|
226
|
+
"items_limit": 10,
|
|
227
|
+
"allow_subdomains": False,
|
|
228
|
+
"created_at": "2023-01-01T12:00:00.000Z",
|
|
229
|
+
"updated_at": "2023-01-01T12:30:00.000Z",
|
|
230
|
+
"recommended_pull_delay_ms": 1000,
|
|
231
|
+
"job_items": [],
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
for _ in range(3):
|
|
235
|
+
responses.add(
|
|
236
|
+
responses.GET,
|
|
237
|
+
"https://api.test.com/v1/job/job-123",
|
|
238
|
+
json=in_progress_data,
|
|
239
|
+
status=200,
|
|
240
|
+
)
|
|
241
|
+
|
|
242
|
+
with patch("time.sleep") as mock_sleep:
|
|
243
|
+
job = client.crawl(url="https://example.com", max_polls=3)
|
|
244
|
+
|
|
245
|
+
assert job.status == "in_progress"
|
|
246
|
+
# Should sleep 3 times (once after each poll)
|
|
247
|
+
assert mock_sleep.call_count == 3
|
|
248
|
+
mock_sleep.assert_called_with(1.0) # recommended_pull_delay_ms / 1000
|
|
249
|
+
|
|
250
|
+
@responses.activate
|
|
251
|
+
def test_scrape_async_success(self, client):
|
|
252
|
+
"""Test successful asynchronous scrape request."""
|
|
253
|
+
responses.add(
|
|
254
|
+
responses.POST,
|
|
255
|
+
"https://api.test.com/v2/scrape?async=true",
|
|
256
|
+
json={"id": "scrape-123"},
|
|
257
|
+
status=200,
|
|
258
|
+
)
|
|
259
|
+
|
|
260
|
+
result = client.scrape_async(
|
|
261
|
+
url="https://example.com",
|
|
262
|
+
output_format="cleaned",
|
|
263
|
+
clean_selectors=".ads, .footer",
|
|
264
|
+
prompt="Extract main content",
|
|
265
|
+
respect_robots_txt=True,
|
|
266
|
+
)
|
|
267
|
+
|
|
268
|
+
assert isinstance(result, ScrapeId)
|
|
269
|
+
assert result.id == "scrape-123"
|
|
270
|
+
|
|
271
|
+
# Verify request payload
|
|
272
|
+
request = responses.calls[0].request
|
|
273
|
+
import json
|
|
274
|
+
|
|
275
|
+
payload = json.loads(request.body)
|
|
276
|
+
assert payload["url"] == "https://example.com"
|
|
277
|
+
assert payload["output_format"] == "cleaned"
|
|
278
|
+
assert payload["clean_selectors"] == ".ads, .footer"
|
|
279
|
+
assert payload["prompt"] == "Extract main content"
|
|
280
|
+
assert payload["respect_robots_txt"] is True
|
|
281
|
+
|
|
282
|
+
@responses.activate
|
|
283
|
+
def test_scrape_async_error_response(self, client):
|
|
284
|
+
"""Test scrape_async with error response."""
|
|
285
|
+
responses.add(
|
|
286
|
+
responses.POST,
|
|
287
|
+
"https://api.test.com/v2/scrape?async=true",
|
|
288
|
+
json={"error": "Invalid URL format"},
|
|
289
|
+
status=400,
|
|
290
|
+
)
|
|
291
|
+
|
|
292
|
+
with pytest.raises(requests.exceptions.HTTPError) as exc_info:
|
|
293
|
+
client.scrape_async(url="invalid-url")
|
|
294
|
+
|
|
295
|
+
assert "400" in str(exc_info.value)
|
|
296
|
+
assert "Invalid URL format" in str(exc_info.value)
|
|
297
|
+
|
|
298
|
+
@responses.activate
|
|
299
|
+
def test_get_scrape_success_done(self, client):
|
|
300
|
+
"""Test get_scrape with successful completion."""
|
|
301
|
+
scrape_data = {
|
|
302
|
+
"status": "done",
|
|
303
|
+
"success": True,
|
|
304
|
+
"markdown": "# Test Content",
|
|
305
|
+
"cleaned_content": "Test Content",
|
|
306
|
+
"raw_content": "<h1>Test Content</h1>",
|
|
307
|
+
"page_status_code": 200,
|
|
308
|
+
"page_title": "Test Page",
|
|
309
|
+
"structured_data": {"title": "Test Page"},
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
responses.add(
|
|
313
|
+
responses.GET,
|
|
314
|
+
"https://api.test.com/v2/scrape/scrape-123",
|
|
315
|
+
json=scrape_data,
|
|
316
|
+
status=200,
|
|
317
|
+
)
|
|
318
|
+
|
|
319
|
+
result = client.get_scrape("scrape-123")
|
|
320
|
+
|
|
321
|
+
assert isinstance(result, ScrapeResponse)
|
|
322
|
+
assert result.status == "done"
|
|
323
|
+
assert result.success is True
|
|
324
|
+
assert result.markdown == "# Test Content"
|
|
325
|
+
assert result.page_title == "Test Page"
|
|
326
|
+
|
|
327
|
+
@responses.activate
|
|
328
|
+
def test_get_scrape_error_status(self, client):
|
|
329
|
+
"""Test get_scrape with error status."""
|
|
330
|
+
error_data = {
|
|
331
|
+
"status": "error",
|
|
332
|
+
"success": False,
|
|
333
|
+
"error_code": "TIMEOUT",
|
|
334
|
+
"error_message": "Request timed out",
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
responses.add(
|
|
338
|
+
responses.GET,
|
|
339
|
+
"https://api.test.com/v2/scrape/scrape-123",
|
|
340
|
+
json=error_data,
|
|
341
|
+
status=200,
|
|
342
|
+
)
|
|
343
|
+
|
|
344
|
+
result = client.get_scrape("scrape-123")
|
|
345
|
+
|
|
346
|
+
assert isinstance(result, ScrapeResponseError)
|
|
347
|
+
assert result.status == "error"
|
|
348
|
+
assert result.success is False
|
|
349
|
+
assert result.error_code == "TIMEOUT"
|
|
350
|
+
assert result.error_message == "Request timed out"
|
|
351
|
+
|
|
352
|
+
@responses.activate
|
|
353
|
+
def test_get_scrape_in_progress(self, client):
|
|
354
|
+
"""Test get_scrape with in_progress status."""
|
|
355
|
+
progress_data = {"status": "in_progress", "success": False}
|
|
356
|
+
|
|
357
|
+
responses.add(
|
|
358
|
+
responses.GET,
|
|
359
|
+
"https://api.test.com/v2/scrape/scrape-123",
|
|
360
|
+
json=progress_data,
|
|
361
|
+
status=200,
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
result = client.get_scrape("scrape-123")
|
|
365
|
+
|
|
366
|
+
assert isinstance(result, ScrapeResponse)
|
|
367
|
+
assert result.status == "in_progress"
|
|
368
|
+
assert result.success is False
|
|
369
|
+
|
|
370
|
+
@responses.activate
|
|
371
|
+
def test_scrape_with_polling_success(self, client):
|
|
372
|
+
"""Test scrape method that polls until completion."""
|
|
373
|
+
# Mock scrape_async response
|
|
374
|
+
responses.add(
|
|
375
|
+
responses.POST,
|
|
376
|
+
"https://api.test.com/v2/scrape?async=true",
|
|
377
|
+
json={"id": "scrape-123"},
|
|
378
|
+
status=200,
|
|
379
|
+
)
|
|
380
|
+
|
|
381
|
+
# Mock get_scrape response with done status
|
|
382
|
+
scrape_data = {
|
|
383
|
+
"status": "done",
|
|
384
|
+
"success": True,
|
|
385
|
+
"markdown": "# Scraped Content",
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
responses.add(
|
|
389
|
+
responses.GET,
|
|
390
|
+
"https://api.test.com/v2/scrape/scrape-123",
|
|
391
|
+
json=scrape_data,
|
|
392
|
+
status=200,
|
|
393
|
+
)
|
|
394
|
+
|
|
395
|
+
with patch("time.sleep") as mock_sleep:
|
|
396
|
+
result = client.scrape(url="https://example.com")
|
|
397
|
+
|
|
398
|
+
assert isinstance(result, ScrapeResponse)
|
|
399
|
+
assert result.status == "done"
|
|
400
|
+
assert result.markdown == "# Scraped Content"
|
|
401
|
+
# Should not sleep since scrape is already done
|
|
402
|
+
mock_sleep.assert_not_called()
|
|
403
|
+
|
|
404
|
+
@responses.activate
|
|
405
|
+
def test_scrape_with_polling_error(self, client):
|
|
406
|
+
"""Test scrape method that polls and gets error."""
|
|
407
|
+
# Mock scrape_async response
|
|
408
|
+
responses.add(
|
|
409
|
+
responses.POST,
|
|
410
|
+
"https://api.test.com/v2/scrape?async=true",
|
|
411
|
+
json={"id": "scrape-123"},
|
|
412
|
+
status=200,
|
|
413
|
+
)
|
|
414
|
+
|
|
415
|
+
# Mock get_scrape response with error
|
|
416
|
+
error_data = {
|
|
417
|
+
"status": "error",
|
|
418
|
+
"success": False,
|
|
419
|
+
"error_code": "FETCH_ERROR",
|
|
420
|
+
"error_message": "Failed to fetch page",
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
responses.add(
|
|
424
|
+
responses.GET,
|
|
425
|
+
"https://api.test.com/v2/scrape/scrape-123",
|
|
426
|
+
json=error_data,
|
|
427
|
+
status=200,
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
with patch("time.sleep") as mock_sleep:
|
|
431
|
+
result = client.scrape(url="https://example.com")
|
|
432
|
+
|
|
433
|
+
assert isinstance(result, ScrapeResponseError)
|
|
434
|
+
assert result.error_code == "FETCH_ERROR"
|
|
435
|
+
# Should not sleep since error is immediate
|
|
436
|
+
mock_sleep.assert_not_called()
|
|
437
|
+
|
|
438
|
+
@responses.activate
|
|
439
|
+
def test_scrape_with_polling_max_polls(self, client):
|
|
440
|
+
"""Test scrape method that reaches max_polls."""
|
|
441
|
+
# Mock scrape_async response
|
|
442
|
+
responses.add(
|
|
443
|
+
responses.POST,
|
|
444
|
+
"https://api.test.com/v2/scrape?async=true",
|
|
445
|
+
json={"id": "scrape-123"},
|
|
446
|
+
status=200,
|
|
447
|
+
)
|
|
448
|
+
|
|
449
|
+
# Mock get_scrape responses with in_progress status
|
|
450
|
+
progress_data = {"status": "in_progress", "success": False}
|
|
451
|
+
|
|
452
|
+
for _ in range(3):
|
|
453
|
+
responses.add(
|
|
454
|
+
responses.GET,
|
|
455
|
+
"https://api.test.com/v2/scrape/scrape-123",
|
|
456
|
+
json=progress_data,
|
|
457
|
+
status=200,
|
|
458
|
+
)
|
|
459
|
+
|
|
460
|
+
with patch("time.sleep") as mock_sleep:
|
|
461
|
+
result = client.scrape(url="https://example.com", max_polls=3)
|
|
462
|
+
|
|
463
|
+
assert isinstance(result, ScrapeResponse)
|
|
464
|
+
assert result.status == "in_progress"
|
|
465
|
+
# Should sleep 3 times (once after each poll)
|
|
466
|
+
assert mock_sleep.call_count == 3
|
|
467
|
+
mock_sleep.assert_called_with(5) # DEFAULT_POLL_DELAY_SECONDS
|