webcrawlerapi 2.0.4__tar.gz → 2.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.4
3
+ Version: 2.0.6
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
@@ -101,11 +101,11 @@ print(f"Cancellation response: {cancel_response['message']}")
101
101
  ```
102
102
 
103
103
  ### Scraping
104
-
104
+ See the working code examples for [scraping](https://github.com/WebCrawlerAPI/webcrawlerapi-examples/tree/master/python/scraping) and [scraping with a prompt](https://github.com/WebCrawlerAPI/webcrawlerapi-examples/tree/master/python/scraping_prompt).
105
105
  ```python
106
106
  # Returns structured data directly
107
107
  response = crawler.scrape(
108
- "url": "https://webcrawlerapi.com"
108
+ url="https://webcrawlerapi.com"
109
109
  )
110
110
  if response.success:
111
111
  print(response.markdown)
@@ -80,11 +80,11 @@ print(f"Cancellation response: {cancel_response['message']}")
80
80
  ```
81
81
 
82
82
  ### Scraping
83
-
83
+ See the working code examples for [scraping](https://github.com/WebCrawlerAPI/webcrawlerapi-examples/tree/master/python/scraping) and [scraping with a prompt](https://github.com/WebCrawlerAPI/webcrawlerapi-examples/tree/master/python/scraping_prompt).
84
84
  ```python
85
85
  # Returns structured data directly
86
86
  response = crawler.scrape(
87
- "url": "https://webcrawlerapi.com"
87
+ url="https://webcrawlerapi.com"
88
88
  )
89
89
  if response.success:
90
90
  print(response.markdown)
@@ -0,0 +1,79 @@
1
+ [tool.black]
2
+ line-length = 88
3
+ target-version = ['py37']
4
+ include = '\.pyi?$'
5
+ extend-exclude = '''
6
+ /(
7
+ # directories
8
+ \.eggs
9
+ | \.git
10
+ | \.hg
11
+ | \.mypy_cache
12
+ | \.tox
13
+ | \.venv
14
+ | _build
15
+ | buck-out
16
+ | build
17
+ | dist
18
+ )/
19
+ '''
20
+
21
+ [tool.isort]
22
+ profile = "black"
23
+ multi_line_output = 3
24
+ line_length = 88
25
+ known_first_party = ["webcrawlerapi"]
26
+ known_third_party = ["requests", "pytest", "responses"]
27
+
28
+ [tool.mypy]
29
+ python_version = "3.7"
30
+ warn_return_any = true
31
+ warn_unused_configs = true
32
+ disallow_untyped_defs = false
33
+ disallow_incomplete_defs = false
34
+ check_untyped_defs = true
35
+ disallow_untyped_decorators = false
36
+ no_implicit_optional = true
37
+ warn_redundant_casts = true
38
+ warn_unused_ignores = true
39
+ warn_no_return = true
40
+ warn_unreachable = true
41
+ strict_equality = true
42
+
43
+ [[tool.mypy.overrides]]
44
+ module = "tests.*"
45
+ ignore_errors = true
46
+
47
+ [tool.pytest.ini_options]
48
+ testpaths = ["tests"]
49
+ python_files = ["test_*.py"]
50
+ python_classes = ["Test*"]
51
+ python_functions = ["test_*"]
52
+ addopts = "-v --tb=short"
53
+ filterwarnings = [
54
+ "ignore::DeprecationWarning",
55
+ "ignore::PendingDeprecationWarning",
56
+ ]
57
+
58
+ [tool.coverage.run]
59
+ source = ["webcrawlerapi"]
60
+ omit = [
61
+ "tests/*",
62
+ "venv/*",
63
+ "build/*",
64
+ "dist/*",
65
+ ]
66
+
67
+ [tool.coverage.report]
68
+ exclude_lines = [
69
+ "pragma: no cover",
70
+ "def __repr__",
71
+ "if self.debug:",
72
+ "if settings.DEBUG",
73
+ "raise AssertionError",
74
+ "raise NotImplementedError",
75
+ "if 0:",
76
+ "if __name__ == .__main__.:",
77
+ "class .*\\bProtocol\\):",
78
+ "@(abc\\.)?abstractmethod",
79
+ ]
@@ -1,8 +1,8 @@
1
- from setuptools import setup, find_packages
1
+ from setuptools import find_packages, setup
2
2
 
3
3
  setup(
4
4
  name="webcrawlerapi",
5
- version="2.0.4",
5
+ version="2.0.6",
6
6
  packages=find_packages(),
7
7
  install_requires=[
8
8
  "requests>=2.25.0",
File without changes
@@ -0,0 +1,467 @@
1
+ from datetime import datetime
2
+ from unittest.mock import Mock, patch
3
+
4
+ import pytest
5
+ import requests
6
+ import responses
7
+
8
+ from webcrawlerapi.client import WebCrawlerAPI
9
+ from webcrawlerapi.models import (
10
+ CrawlResponse,
11
+ Job,
12
+ JobItem,
13
+ ScrapeId,
14
+ ScrapeResponse,
15
+ ScrapeResponseError,
16
+ UploadS3Action,
17
+ )
18
+
19
+
20
+ class TestWebCrawlerAPI:
21
+ """Test suite for WebCrawlerAPI client."""
22
+
23
+ @pytest.fixture
24
+ def client(self):
25
+ """Create a WebCrawlerAPI client for testing."""
26
+ return WebCrawlerAPI(api_key="test-api-key", base_url="https://api.test.com")
27
+
28
+ @pytest.fixture
29
+ def mock_job_data(self):
30
+ """Mock job data for testing."""
31
+ return {
32
+ "id": "job-123",
33
+ "org_id": "org-456",
34
+ "url": "https://example.com",
35
+ "status": "done",
36
+ "scrape_type": "markdown",
37
+ "items_limit": 10,
38
+ "allow_subdomains": False,
39
+ "created_at": "2023-01-01T12:00:00.000Z",
40
+ "updated_at": "2023-01-01T12:30:00.000Z",
41
+ "finished_at": "2023-01-01T12:30:00.000Z",
42
+ "recommended_pull_delay_ms": 5000,
43
+ "job_items": [
44
+ {
45
+ "id": "item-1",
46
+ "job_id": "job-123",
47
+ "original_url": "https://example.com/page1",
48
+ "page_status_code": 200,
49
+ "status": "done",
50
+ "title": "Page 1",
51
+ "created_at": "2023-01-01T12:00:00.000Z",
52
+ "updated_at": "2023-01-01T12:15:00.000Z",
53
+ "cost": 1,
54
+ "markdown_content_url": "https://storage.test.com/content1.md",
55
+ }
56
+ ],
57
+ }
58
+
59
+ def test_client_initialization(self):
60
+ """Test client initialization with API key and base URL."""
61
+ client = WebCrawlerAPI("test-key", "https://custom.api.com")
62
+ assert client.api_key == "test-key"
63
+ assert client.base_url == "https://custom.api.com"
64
+ assert client.session.headers["Authorization"] == "Bearer test-key"
65
+ assert client.session.headers["Content-Type"] == "application/json"
66
+
67
+ def test_client_initialization_default_url(self):
68
+ """Test client initialization with default base URL."""
69
+ client = WebCrawlerAPI("test-key")
70
+ assert client.base_url == "https://api.webcrawlerapi.com"
71
+
72
+ @responses.activate
73
+ def test_crawl_async_success(self, client):
74
+ """Test successful asynchronous crawl request."""
75
+ responses.add(
76
+ responses.POST,
77
+ "https://api.test.com/v1/crawl",
78
+ json={"id": "crawl-123"},
79
+ status=200,
80
+ )
81
+
82
+ result = client.crawl_async(
83
+ url="https://example.com",
84
+ scrape_type="markdown",
85
+ items_limit=5,
86
+ allow_subdomains=True,
87
+ respect_robots_txt=True,
88
+ )
89
+
90
+ assert isinstance(result, CrawlResponse)
91
+ assert result.id == "crawl-123"
92
+
93
+ # Verify request payload
94
+ request = responses.calls[0].request
95
+ import json
96
+
97
+ payload = json.loads(request.body)
98
+ assert payload["url"] == "https://example.com"
99
+ assert payload["scrape_type"] == "markdown"
100
+ assert payload["items_limit"] == 5
101
+ assert payload["allow_subdomains"] is True
102
+ assert payload["respect_robots_txt"] is True
103
+
104
+ @responses.activate
105
+ def test_crawl_async_with_actions(self, client):
106
+ """Test crawl_async with S3 upload action."""
107
+ responses.add(
108
+ responses.POST,
109
+ "https://api.test.com/v1/crawl",
110
+ json={"id": "crawl-456"},
111
+ status=200,
112
+ )
113
+
114
+ s3_action = UploadS3Action(
115
+ path="crawl-results/",
116
+ access_key_id="AKIAEXAMPLE",
117
+ secret_access_key="secret123",
118
+ bucket="my-bucket",
119
+ )
120
+
121
+ result = client.crawl_async(url="https://example.com", actions=[s3_action])
122
+
123
+ assert result.id == "crawl-456"
124
+
125
+ # Verify action in payload
126
+ request = responses.calls[0].request
127
+ import json
128
+
129
+ payload = json.loads(request.body)
130
+ assert "actions" in payload
131
+ assert len(payload["actions"]) == 1
132
+ assert payload["actions"][0]["type"] == "upload_s3"
133
+ assert payload["actions"][0]["bucket"] == "my-bucket"
134
+
135
+ @responses.activate
136
+ def test_crawl_async_http_error(self, client):
137
+ """Test crawl_async with HTTP error response."""
138
+ responses.add(
139
+ responses.POST,
140
+ "https://api.test.com/v1/crawl",
141
+ json={"error": "Invalid URL"},
142
+ status=400,
143
+ )
144
+
145
+ with pytest.raises(requests.exceptions.HTTPError):
146
+ client.crawl_async(url="invalid-url")
147
+
148
+ @responses.activate
149
+ def test_get_job_success(self, client, mock_job_data):
150
+ """Test successful job retrieval."""
151
+ responses.add(
152
+ responses.GET,
153
+ "https://api.test.com/v1/job/job-123",
154
+ json=mock_job_data,
155
+ status=200,
156
+ )
157
+
158
+ job = client.get_job("job-123")
159
+
160
+ assert isinstance(job, Job)
161
+ assert job.id == "job-123"
162
+ assert job.status == "done"
163
+ assert job.scrape_type == "markdown"
164
+ assert len(job.job_items) == 1
165
+ assert job.job_items[0].title == "Page 1"
166
+
167
+ @responses.activate
168
+ def test_cancel_job_success(self, client):
169
+ """Test successful job cancellation."""
170
+ responses.add(
171
+ responses.PUT,
172
+ "https://api.test.com/v1/job/job-123/cancel",
173
+ json={"message": "Job cancelled successfully"},
174
+ status=200,
175
+ )
176
+
177
+ result = client.cancel_job("job-123")
178
+ assert result["message"] == "Job cancelled successfully"
179
+
180
+ @responses.activate
181
+ def test_crawl_with_polling_terminal_state(self, client, mock_job_data):
182
+ """Test crawl method that polls until terminal state."""
183
+ # Mock crawl_async response
184
+ responses.add(
185
+ responses.POST,
186
+ "https://api.test.com/v1/crawl",
187
+ json={"id": "job-123"},
188
+ status=200,
189
+ )
190
+
191
+ # Mock get_job response with terminal state
192
+ responses.add(
193
+ responses.GET,
194
+ "https://api.test.com/v1/job/job-123",
195
+ json=mock_job_data,
196
+ status=200,
197
+ )
198
+
199
+ with patch("time.sleep") as mock_sleep:
200
+ job = client.crawl(url="https://example.com", max_polls=5)
201
+
202
+ assert isinstance(job, Job)
203
+ assert job.id == "job-123"
204
+ assert job.status == "done"
205
+ # Should not sleep since job is already in terminal state
206
+ mock_sleep.assert_not_called()
207
+
208
+ @responses.activate
209
+ def test_crawl_with_polling_max_polls_reached(self, client):
210
+ """Test crawl method that reaches max_polls limit."""
211
+ # Mock crawl_async response
212
+ responses.add(
213
+ responses.POST,
214
+ "https://api.test.com/v1/crawl",
215
+ json={"id": "job-123"},
216
+ status=200,
217
+ )
218
+
219
+ # Mock get_job responses with non-terminal state
220
+ in_progress_data = {
221
+ "id": "job-123",
222
+ "org_id": "org-456",
223
+ "url": "https://example.com",
224
+ "status": "in_progress",
225
+ "scrape_type": "markdown",
226
+ "items_limit": 10,
227
+ "allow_subdomains": False,
228
+ "created_at": "2023-01-01T12:00:00.000Z",
229
+ "updated_at": "2023-01-01T12:30:00.000Z",
230
+ "recommended_pull_delay_ms": 1000,
231
+ "job_items": [],
232
+ }
233
+
234
+ for _ in range(3):
235
+ responses.add(
236
+ responses.GET,
237
+ "https://api.test.com/v1/job/job-123",
238
+ json=in_progress_data,
239
+ status=200,
240
+ )
241
+
242
+ with patch("time.sleep") as mock_sleep:
243
+ job = client.crawl(url="https://example.com", max_polls=3)
244
+
245
+ assert job.status == "in_progress"
246
+ # Should sleep 3 times (once after each poll)
247
+ assert mock_sleep.call_count == 3
248
+ mock_sleep.assert_called_with(1.0) # recommended_pull_delay_ms / 1000
249
+
250
+ @responses.activate
251
+ def test_scrape_async_success(self, client):
252
+ """Test successful asynchronous scrape request."""
253
+ responses.add(
254
+ responses.POST,
255
+ "https://api.test.com/v2/scrape?async=true",
256
+ json={"id": "scrape-123"},
257
+ status=200,
258
+ )
259
+
260
+ result = client.scrape_async(
261
+ url="https://example.com",
262
+ output_format="cleaned",
263
+ clean_selectors=".ads, .footer",
264
+ prompt="Extract main content",
265
+ respect_robots_txt=True,
266
+ )
267
+
268
+ assert isinstance(result, ScrapeId)
269
+ assert result.id == "scrape-123"
270
+
271
+ # Verify request payload
272
+ request = responses.calls[0].request
273
+ import json
274
+
275
+ payload = json.loads(request.body)
276
+ assert payload["url"] == "https://example.com"
277
+ assert payload["output_format"] == "cleaned"
278
+ assert payload["clean_selectors"] == ".ads, .footer"
279
+ assert payload["prompt"] == "Extract main content"
280
+ assert payload["respect_robots_txt"] is True
281
+
282
+ @responses.activate
283
+ def test_scrape_async_error_response(self, client):
284
+ """Test scrape_async with error response."""
285
+ responses.add(
286
+ responses.POST,
287
+ "https://api.test.com/v2/scrape?async=true",
288
+ json={"error": "Invalid URL format"},
289
+ status=400,
290
+ )
291
+
292
+ with pytest.raises(requests.exceptions.HTTPError) as exc_info:
293
+ client.scrape_async(url="invalid-url")
294
+
295
+ assert "400" in str(exc_info.value)
296
+ assert "Invalid URL format" in str(exc_info.value)
297
+
298
+ @responses.activate
299
+ def test_get_scrape_success_done(self, client):
300
+ """Test get_scrape with successful completion."""
301
+ scrape_data = {
302
+ "status": "done",
303
+ "success": True,
304
+ "markdown": "# Test Content",
305
+ "cleaned_content": "Test Content",
306
+ "raw_content": "<h1>Test Content</h1>",
307
+ "page_status_code": 200,
308
+ "page_title": "Test Page",
309
+ "structured_data": {"title": "Test Page"},
310
+ }
311
+
312
+ responses.add(
313
+ responses.GET,
314
+ "https://api.test.com/v2/scrape/scrape-123",
315
+ json=scrape_data,
316
+ status=200,
317
+ )
318
+
319
+ result = client.get_scrape("scrape-123")
320
+
321
+ assert isinstance(result, ScrapeResponse)
322
+ assert result.status == "done"
323
+ assert result.success is True
324
+ assert result.markdown == "# Test Content"
325
+ assert result.page_title == "Test Page"
326
+
327
+ @responses.activate
328
+ def test_get_scrape_error_status(self, client):
329
+ """Test get_scrape with error status."""
330
+ error_data = {
331
+ "status": "error",
332
+ "success": False,
333
+ "error_code": "TIMEOUT",
334
+ "error_message": "Request timed out",
335
+ }
336
+
337
+ responses.add(
338
+ responses.GET,
339
+ "https://api.test.com/v2/scrape/scrape-123",
340
+ json=error_data,
341
+ status=200,
342
+ )
343
+
344
+ result = client.get_scrape("scrape-123")
345
+
346
+ assert isinstance(result, ScrapeResponseError)
347
+ assert result.status == "error"
348
+ assert result.success is False
349
+ assert result.error_code == "TIMEOUT"
350
+ assert result.error_message == "Request timed out"
351
+
352
+ @responses.activate
353
+ def test_get_scrape_in_progress(self, client):
354
+ """Test get_scrape with in_progress status."""
355
+ progress_data = {"status": "in_progress", "success": False}
356
+
357
+ responses.add(
358
+ responses.GET,
359
+ "https://api.test.com/v2/scrape/scrape-123",
360
+ json=progress_data,
361
+ status=200,
362
+ )
363
+
364
+ result = client.get_scrape("scrape-123")
365
+
366
+ assert isinstance(result, ScrapeResponse)
367
+ assert result.status == "in_progress"
368
+ assert result.success is False
369
+
370
+ @responses.activate
371
+ def test_scrape_with_polling_success(self, client):
372
+ """Test scrape method that polls until completion."""
373
+ # Mock scrape_async response
374
+ responses.add(
375
+ responses.POST,
376
+ "https://api.test.com/v2/scrape?async=true",
377
+ json={"id": "scrape-123"},
378
+ status=200,
379
+ )
380
+
381
+ # Mock get_scrape response with done status
382
+ scrape_data = {
383
+ "status": "done",
384
+ "success": True,
385
+ "markdown": "# Scraped Content",
386
+ }
387
+
388
+ responses.add(
389
+ responses.GET,
390
+ "https://api.test.com/v2/scrape/scrape-123",
391
+ json=scrape_data,
392
+ status=200,
393
+ )
394
+
395
+ with patch("time.sleep") as mock_sleep:
396
+ result = client.scrape(url="https://example.com")
397
+
398
+ assert isinstance(result, ScrapeResponse)
399
+ assert result.status == "done"
400
+ assert result.markdown == "# Scraped Content"
401
+ # Should not sleep since scrape is already done
402
+ mock_sleep.assert_not_called()
403
+
404
+ @responses.activate
405
+ def test_scrape_with_polling_error(self, client):
406
+ """Test scrape method that polls and gets error."""
407
+ # Mock scrape_async response
408
+ responses.add(
409
+ responses.POST,
410
+ "https://api.test.com/v2/scrape?async=true",
411
+ json={"id": "scrape-123"},
412
+ status=200,
413
+ )
414
+
415
+ # Mock get_scrape response with error
416
+ error_data = {
417
+ "status": "error",
418
+ "success": False,
419
+ "error_code": "FETCH_ERROR",
420
+ "error_message": "Failed to fetch page",
421
+ }
422
+
423
+ responses.add(
424
+ responses.GET,
425
+ "https://api.test.com/v2/scrape/scrape-123",
426
+ json=error_data,
427
+ status=200,
428
+ )
429
+
430
+ with patch("time.sleep") as mock_sleep:
431
+ result = client.scrape(url="https://example.com")
432
+
433
+ assert isinstance(result, ScrapeResponseError)
434
+ assert result.error_code == "FETCH_ERROR"
435
+ # Should not sleep since error is immediate
436
+ mock_sleep.assert_not_called()
437
+
438
+ @responses.activate
439
+ def test_scrape_with_polling_max_polls(self, client):
440
+ """Test scrape method that reaches max_polls."""
441
+ # Mock scrape_async response
442
+ responses.add(
443
+ responses.POST,
444
+ "https://api.test.com/v2/scrape?async=true",
445
+ json={"id": "scrape-123"},
446
+ status=200,
447
+ )
448
+
449
+ # Mock get_scrape responses with in_progress status
450
+ progress_data = {"status": "in_progress", "success": False}
451
+
452
+ for _ in range(3):
453
+ responses.add(
454
+ responses.GET,
455
+ "https://api.test.com/v2/scrape/scrape-123",
456
+ json=progress_data,
457
+ status=200,
458
+ )
459
+
460
+ with patch("time.sleep") as mock_sleep:
461
+ result = client.scrape(url="https://example.com", max_polls=3)
462
+
463
+ assert isinstance(result, ScrapeResponse)
464
+ assert result.status == "in_progress"
465
+ # Should sleep 3 times (once after each poll)
466
+ assert mock_sleep.call_count == 3
467
+ mock_sleep.assert_called_with(5) # DEFAULT_POLL_DELAY_SECONDS