firecrawl-4.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +87 -0
- firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
- firecrawl/__tests__/e2e/v2/test_map.py +61 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
- firecrawl/__tests__/e2e/v2/test_search.py +270 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
- firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
- firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +281 -0
- firecrawl/firecrawl.backup.py +4635 -0
- firecrawl/types.py +167 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +5164 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +967 -0
- firecrawl/v2/client_async.py +408 -0
- firecrawl/v2/methods/agent.py +144 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/agent.py +137 -0
- firecrawl/v2/methods/aio/batch.py +188 -0
- firecrawl/v2/methods/aio/crawl.py +351 -0
- firecrawl/v2/methods/aio/extract.py +133 -0
- firecrawl/v2/methods/aio/map.py +65 -0
- firecrawl/v2/methods/aio/scrape.py +33 -0
- firecrawl/v2/methods/aio/search.py +176 -0
- firecrawl/v2/methods/aio/usage.py +89 -0
- firecrawl/v2/methods/batch.py +499 -0
- firecrawl/v2/methods/crawl.py +592 -0
- firecrawl/v2/methods/extract.py +161 -0
- firecrawl/v2/methods/map.py +83 -0
- firecrawl/v2/methods/scrape.py +64 -0
- firecrawl/v2/methods/search.py +215 -0
- firecrawl/v2/methods/usage.py +84 -0
- firecrawl/v2/types.py +1143 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +178 -0
- firecrawl/v2/utils/http_client_async.py +69 -0
- firecrawl/v2/utils/normalize.py +125 -0
- firecrawl/v2/utils/validation.py +692 -0
- firecrawl/v2/watcher.py +301 -0
- firecrawl/v2/watcher_async.py +243 -0
- firecrawl-4.12.0.dist-info/METADATA +234 -0
- firecrawl-4.12.0.dist-info/RECORD +92 -0
- firecrawl-4.12.0.dist-info/WHEEL +5 -0
- firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
- firecrawl-4.12.0.dist-info/top_level.txt +2 -0
- tests/test_agent_integration.py +277 -0
- tests/test_api_key_handling.py +44 -0
- tests/test_change_tracking.py +98 -0
- tests/test_timeout_conversion.py +117 -0
firecrawl/__tests__/unit/v2/utils/test_validation.py
@@ -0,0 +1,311 @@
import pytest
from firecrawl.v2.types import JsonFormat, ScrapeOptions, PDFParser
from firecrawl.v2.utils.validation import validate_scrape_options, prepare_scrape_options


class TestValidateScrapeOptions:
    """Unit tests for validate_scrape_options function."""

    def test_validate_none_options(self):
        """Test validation with None options."""
        result = validate_scrape_options(None)
        assert result is None

    def test_validate_valid_options(self):
        """Test validation with valid options."""
        options = ScrapeOptions(
            formats=["markdown"],
            timeout=30000,
            wait_for=2000
        )
        result = validate_scrape_options(options)
        assert result == options

    def test_validate_invalid_timeout(self):
        """Test validation with invalid timeout."""
        options = ScrapeOptions(timeout=0)
        with pytest.raises(ValueError, match="Timeout must be positive"):
            validate_scrape_options(options)

    def test_validate_negative_timeout(self):
        """Test validation with negative timeout."""
        options = ScrapeOptions(timeout=-1000)
        with pytest.raises(ValueError, match="Timeout must be positive"):
            validate_scrape_options(options)

    def test_validate_invalid_wait_for(self):
        """Test validation with invalid wait_for."""
        options = ScrapeOptions(wait_for=-500)
        with pytest.raises(ValueError, match="wait_for must be non-negative"):
            validate_scrape_options(options)

    def test_validate_zero_wait_for(self):
        """Test validation with zero wait_for (should be valid)."""
        options = ScrapeOptions(wait_for=0)
        result = validate_scrape_options(options)
        assert result == options

    def test_validate_complex_options(self):
        """Test validation with complex options."""
        options = ScrapeOptions(
            formats=["markdown", "html"],
            headers={"User-Agent": "Test"},
            include_tags=["h1", "h2"],
            exclude_tags=["nav"],
            only_main_content=False,
            timeout=15000,
            wait_for=2000,
            mobile=True,
            skip_tls_verification=True,
            remove_base64_images=False,
            raw_html=True,
            screenshot_full_page=True
        )
        result = validate_scrape_options(options)
        assert result == options

    def test_validate_multiple_invalid_fields(self):
        """Test validation with multiple invalid fields."""
        options = ScrapeOptions(timeout=-1000, wait_for=-500)
        with pytest.raises(ValueError, match="Timeout must be positive"):
            validate_scrape_options(options)
        # Should fail on first invalid field (timeout)

    def test_validate_edge_cases(self):
        """Test validation with edge case values."""
        # Test with very large timeout
        options = ScrapeOptions(timeout=999999)
        result = validate_scrape_options(options)
        assert result == options

        # Test with very large wait_for
        options = ScrapeOptions(wait_for=999999)
        result = validate_scrape_options(options)
        assert result == options


class TestPrepareScrapeOptions:
    """Unit tests for prepare_scrape_options function."""

    def test_prepare_none_options(self):
        """Test preparation with None options."""
        result = prepare_scrape_options(None)
        assert result is None

    def test_prepare_basic_options(self):
        """Test preparation with basic options."""
        options = ScrapeOptions(
            formats=["markdown"],
            timeout=30000,
            wait_for=2000
        )
        result = prepare_scrape_options(options)

        assert isinstance(result, dict)
        assert "formats" in result
        assert "timeout" in result
        assert "waitFor" in result
        assert result["timeout"] == 30000
        assert result["waitFor"] == 2000

    def test_prepare_snake_case_conversion(self):
        """Test snake_case to camelCase conversion."""
        options = ScrapeOptions(
            include_tags=["h1", "h2"],
            exclude_tags=["nav"],
            only_main_content=False,
            wait_for=2000,
            skip_tls_verification=True,
            remove_base64_images=False
            # Note: raw_html should be in formats array, not as a separate field
        )
        result = prepare_scrape_options(options)

        # Check conversions
        assert "includeTags" in result
        assert result["includeTags"] == ["h1", "h2"]
        assert "excludeTags" in result
        assert result["excludeTags"] == ["nav"]
        assert "onlyMainContent" in result
        assert result["onlyMainContent"] is False
        assert "waitFor" in result
        assert result["waitFor"] == 2000
        assert "skipTlsVerification" in result
        assert result["skipTlsVerification"] is True
        assert "removeBase64Images" in result
        assert result["removeBase64Images"] is False

        # Check that snake_case fields are not present
        assert "include_tags" not in result
        assert "exclude_tags" not in result
        assert "only_main_content" not in result
        assert "wait_for" not in result
        assert "skip_tls_verification" not in result
        assert "remove_base64_images" not in result

    def test_prepare_complex_options(self):
        """Test preparation with complex options."""
        options = ScrapeOptions(
            formats=["markdown", "html", "rawHtml"],
            headers={"User-Agent": "Test Bot"},
            include_tags=["h1", "h2", "h3"],
            exclude_tags=["nav", "footer"],
            only_main_content=False,
            timeout=15000,
            wait_for=2000,
            mobile=True,
            skip_tls_verification=True,
            remove_base64_images=False
        )
        result = prepare_scrape_options(options)

        # Check all fields are present and converted
        assert "formats" in result
        assert "headers" in result
        assert "includeTags" in result
        assert "excludeTags" in result
        assert "onlyMainContent" in result
        assert "timeout" in result
        assert "waitFor" in result
        assert "mobile" in result
        assert "skipTlsVerification" in result
        assert "removeBase64Images" in result

        # Check values
        assert result["formats"] == ["markdown", "html", "rawHtml"]
        assert result["headers"] == {"User-Agent": "Test Bot"}
        assert result["includeTags"] == ["h1", "h2", "h3"]
        assert result["excludeTags"] == ["nav", "footer"]
        assert result["onlyMainContent"] is False
        assert result["timeout"] == 15000
        assert result["waitFor"] == 2000
        assert result["mobile"] is True
        assert result["skipTlsVerification"] is True
        assert result["removeBase64Images"] is False

    def test_prepare_invalid_options(self):
        """Test preparation with invalid options (should raise error)."""
        options = ScrapeOptions(timeout=-1000)
        with pytest.raises(ValueError, match="Timeout must be positive"):
            prepare_scrape_options(options)

    def test_prepare_empty_options(self):
        """Test preparation with empty options."""
        options = ScrapeOptions()  # All defaults
        result = prepare_scrape_options(options)

        # Should return dict with default values
        assert isinstance(result, dict)
        assert "onlyMainContent" in result
        assert result["onlyMainContent"] is True
        assert "mobile" in result
        assert result["mobile"] is False

    def test_prepare_none_values(self):
        """Test preparation with None values in options."""
        options = ScrapeOptions(
            formats=None,
            timeout=None,
            wait_for=None,
            include_tags=None,
            exclude_tags=None
        )
        result = prepare_scrape_options(options)

        # Should only include non-None values
        assert isinstance(result, dict)
        # Should have default values for required fields
        assert "onlyMainContent" in result
        assert "mobile" in result

    def test_format_schema_conversion(self):
        """Test that Format schema is properly handled."""
        # Create a JsonFormat object with schema
        format_obj = JsonFormat(
            type="json",
            prompt="Extract product info",
            schema={"type": "object", "properties": {"name": {"type": "string"}}}
        )

        dumped = format_obj.model_dump()
        assert "schema" in dumped
        assert dumped["schema"] == {"type": "object", "properties": {"name": {"type": "string"}}}

    def test_prepare_new_v2_fields(self):
        """Test preparation with new v2 fields."""
        from firecrawl.v2.types import Viewport, ScreenshotAction

        viewport = Viewport(width=1920, height=1080)
        screenshot_action = ScreenshotAction(
            type="screenshot",
            full_page=True,
            quality=90,
            viewport=viewport
        )

        options = ScrapeOptions(
            fast_mode=True,
            use_mock="test-mock",
            block_ads=False,
            store_in_cache=False,
            max_age=7200000,  # 2 hours
            actions=[screenshot_action],
            parsers=["pdf"]
        )

        result = prepare_scrape_options(options)

        # Check new field conversions
        assert "fastMode" in result
        assert result["fastMode"] is True
        assert "useMock" in result
        assert result["useMock"] == "test-mock"
        assert "blockAds" in result
        assert result["blockAds"] is False
        assert "storeInCache" in result
        assert result["storeInCache"] is False
        assert "maxAge" in result
        assert result["maxAge"] == 7200000

        # Check actions conversion
        assert "actions" in result
        assert len(result["actions"]) == 1
        action = result["actions"][0]
        assert action["type"] == "screenshot"
        assert action["fullPage"] is True
        assert action["quality"] == 90
        assert "viewport" in action
        assert action["viewport"]["width"] == 1920
        assert action["viewport"]["height"] == 1080

        # Check parsers
        assert "parsers" in result
        assert result["parsers"] == ["pdf"]

        # Check that snake_case fields are not present
        assert "fast_mode" not in result
        assert "use_mock" not in result
        assert "block_ads" not in result
        assert "store_in_cache" not in result
        assert "max_age" not in result

    def test_prepare_parsers_max_pages_dict(self):
        """Ensure parser dicts convert max_pages to maxPages."""
        options = ScrapeOptions(
            parsers=[{"type": "pdf", "max_pages": 3}]
        )

        result = prepare_scrape_options(options)

        assert "parsers" in result
        assert result["parsers"][0]["maxPages"] == 3
        assert "max_pages" not in result["parsers"][0]

    def test_prepare_parsers_max_pages_model(self):
        """Ensure parser models convert max_pages to maxPages."""
        parser = PDFParser(max_pages=5)
        options = ScrapeOptions(parsers=[parser])

        result = prepare_scrape_options(options)

        assert result["parsers"][0]["maxPages"] == 5
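The tests above pin down the v2 request-preparation contract: `validate_scrape_options` rejects non-positive timeouts and negative `wait_for` values, and `prepare_scrape_options` turns a `ScrapeOptions` object into an API payload with camelCase keys. For orientation, here is a minimal usage sketch; it is not code from the package, and it assumes only the names and behavior the tests themselves assert (with this version of `firecrawl` installed).

```python
# Illustrative sketch, not package code: mirrors what the unit tests above assert.
from firecrawl.v2.types import ScrapeOptions
from firecrawl.v2.utils.validation import prepare_scrape_options

options = ScrapeOptions(include_tags=["h1"], wait_for=2000, skip_tls_verification=True)
payload = prepare_scrape_options(options)

# Per the tests, snake_case fields come back as camelCase API keys...
assert payload["includeTags"] == ["h1"]
assert payload["waitFor"] == 2000
assert payload["skipTlsVerification"] is True
# ...and the snake_case names are absent from the prepared payload.
assert "wait_for" not in payload
```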
firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py
@@ -0,0 +1,332 @@
import asyncio
import json
import time
import pytest

from firecrawl.v2.watcher import Watcher


class DummyHttpClient:
    def __init__(self, api_url: str = "http://localhost", api_key: str = "TEST"):
        self.api_url = api_url
        self.api_key = api_key


class DummyClient:
    def __init__(self):
        self.http_client = DummyHttpClient()


class FakeWebSocket:
    def __init__(self, messages):
        # messages: list of dicts to be json-dumped
        self._messages = list(messages)

    async def recv(self):
        if not self._messages:
            # No more messages; block a bit to allow loop to end
            await asyncio.sleep(0.01)
            # Simulate disconnect
            raise asyncio.CancelledError()
        msg = self._messages.pop(0)
        return json.dumps(msg)


class FakeConnect:
    def __init__(self, ws: FakeWebSocket):
        self._ws = ws

    async def __aenter__(self):
        return self._ws

    async def __aexit__(self, exc_type, exc, tb):
        return False


@pytest.mark.parametrize("kind", ["crawl", "batch"])
def test_ws_watcher_document_and_done(monkeypatch, kind):
    # Prepare messages: one document then done
    messages = [
        {"type": "document", "data": {"url": "https://example.com", "rawHtml": "<html>"}},
        {"type": "done", "data": {"status": "completed", "data": []}},
    ]

    ws = FakeWebSocket(messages)

    def fake_connect(uri, *args, **kwargs):
        return FakeConnect(ws)

    import websockets
    monkeypatch.setattr(websockets, "connect", fake_connect)

    client = DummyClient()
    watcher = Watcher(client, job_id="jid", kind=kind)

    events = {"document": 0, "done": 0}
    statuses = []

    watcher.add_event_listener("document", lambda d: events.__setitem__("document", events["document"] + 1))
    watcher.add_event_listener("done", lambda d: events.__setitem__("done", events["done"] + 1))
    watcher.add_listener(lambda s: statuses.append(s.status))

    watcher.start()

    # Wait for thread to finish
    deadline = time.time() + 2
    while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
        time.sleep(0.01)

    watcher.stop()

    assert events["document"] >= 1
    assert events["done"] == 1
    assert statuses[-1] in ("completed", "failed", "cancelled")


def test_ws_watcher_error_event(monkeypatch):
    messages = [
        {"type": "error", "error": "boom", "data": {"status": "failed"}},
    ]
    ws = FakeWebSocket(messages)

    def fake_connect(uri, *args, **kwargs):
        return FakeConnect(ws)

    import websockets
    monkeypatch.setattr(websockets, "connect", fake_connect)

    client = DummyClient()
    watcher = Watcher(client, job_id="jid", kind="crawl")

    seen_error = {"count": 0}
    watcher.add_event_listener("error", lambda d: seen_error.__setitem__("count", seen_error["count"] + 1))

    watcher.start()

    deadline = time.time() + 2
    while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
        time.sleep(0.01)

    watcher.stop()

    assert seen_error["count"] == 1


@pytest.mark.parametrize("kind", ["crawl", "batch"])
def test_ws_watcher_catchup_dispatches_documents_and_updates_status(monkeypatch, kind):
    messages = [
        {
            "type": "catchup",
            "data": {
                "status": "scraping",
                "data": [
                    {"url": "https://example.com/1", "rawHtml": "<html>1</html>"},
                    {"url": "https://example.com/2", "rawHtml": "<html>2</html>"},
                ],
            },
        }
    ]

    ws = FakeWebSocket(messages)

    def fake_connect(uri, *args, **kwargs):
        return FakeConnect(ws)

    import websockets
    monkeypatch.setattr(websockets, "connect", fake_connect)

    client = DummyClient()
    watcher = Watcher(client, job_id="jid", kind=kind)

    events = {"document": 0}
    statuses = []

    watcher.add_event_listener("document", lambda d: events.__setitem__("document", events["document"] + 1))
    watcher.add_listener(lambda s: statuses.append(s.status))

    watcher.start()

    deadline = time.time() + 2
    while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
        time.sleep(0.01)

    watcher.stop()

    assert events["document"] == 2
    assert statuses[-1] == "scraping"


@pytest.mark.parametrize("kind", ["crawl", "batch"])
def test_ws_watcher_status_only_terminal_snapshot_triggers_done(monkeypatch, kind):
    # No explicit type, only a terminal status snapshot
    messages = [
        {"data": {"status": "completed", "data": []}},
    ]

    ws = FakeWebSocket(messages)

    def fake_connect(uri, *args, **kwargs):
        return FakeConnect(ws)

    import websockets
    monkeypatch.setattr(websockets, "connect", fake_connect)

    client = DummyClient()
    watcher = Watcher(client, job_id="jid", kind=kind)

    events = {"done": 0}
    statuses = []

    watcher.add_event_listener("done", lambda d: events.__setitem__("done", events["done"] + 1))
    watcher.add_listener(lambda s: statuses.append(s.status))

    watcher.start()

    deadline = time.time() + 2
    while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
        time.sleep(0.01)

    watcher.stop()

    assert events["done"] == 1
    assert statuses[-1] == "completed"


def test_ws_watcher_batch_cancelled_snapshot_no_done_event(monkeypatch):
    # Batch-only: cancelled snapshot should end without a 'done' event
    messages = [
        {"data": {"status": "cancelled", "data": []}},
    ]

    ws = FakeWebSocket(messages)

    def fake_connect(uri, *args, **kwargs):
        return FakeConnect(ws)

    import websockets
    monkeypatch.setattr(websockets, "connect", fake_connect)

    client = DummyClient()
    watcher = Watcher(client, job_id="jid", kind="batch")

    events = {"done": 0}
    statuses = []

    watcher.add_event_listener("done", lambda d: events.__setitem__("done", events["done"] + 1))
    watcher.add_listener(lambda s: statuses.append(s.status))

    watcher.start()

    deadline = time.time() + 2
    while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
        time.sleep(0.01)

    watcher.stop()

    assert events["done"] == 0
    assert statuses[-1] == "cancelled"


def test_ws_watcher_propagates_authorization_header(monkeypatch):
    # Ensure Authorization header is forwarded to websockets.connect
    messages = [
        {"type": "done", "data": {"status": "completed", "data": []}},
    ]

    ws = FakeWebSocket(messages)

    captured_headers = {"headers": None}

    def fake_connect(uri, *args, **kwargs):
        captured_headers["headers"] = kwargs.get("additional_headers")
        return FakeConnect(ws)

    import websockets
    monkeypatch.setattr(websockets, "connect", fake_connect)

    client = DummyClient()
    watcher = Watcher(client, job_id="jid", kind="crawl")

    watcher.start()

    deadline = time.time() + 2
    while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
        time.sleep(0.01)

    watcher.stop()

    assert captured_headers["headers"] is not None
    # Expect an Authorization header with Bearer token
    assert any(h[0] == "Authorization" and "Bearer" in h[1] for h in captured_headers["headers"])


@pytest.mark.parametrize("kind", ["crawl", "batch"])
def test_ws_watcher_normalizes_document_fields_in_snapshot(monkeypatch, kind):
    # Status-only snapshot with camelCase fields should be normalized in emitted job
    messages = [
        {"data": {"status": "completed", "data": [
            {"url": "https://example.com/x", "rawHtml": "<x>", "changeTracking": {"modes": ["git-diff"]}}
        ]}},
    ]

    ws = FakeWebSocket(messages)

    def fake_connect(uri, *args, **kwargs):
        return FakeConnect(ws)

    import websockets
    monkeypatch.setattr(websockets, "connect", fake_connect)

    client = DummyClient()
    watcher = Watcher(client, job_id="jid", kind=kind)

    jobs = []
    watcher.add_listener(lambda j: jobs.append(j))

    watcher.start()

    deadline = time.time() + 2
    while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
        time.sleep(0.01)

    watcher.stop()

    assert jobs, "No job snapshots emitted"
    last_job = jobs[-1]
    assert last_job.status == "completed"
    assert last_job.data and last_job.data[0].raw_html == "<x>"
    assert last_job.data[0].change_tracking is not None


@pytest.mark.parametrize("kind", ["crawl", "batch"])
def test_ws_watcher_uses_correct_ws_uri(monkeypatch, kind):
    # Verify WS URI uses the correct path per kind and http->ws scheme
    messages = [
        {"type": "done", "data": {"status": "completed", "data": []}},
    ]

    ws = FakeWebSocket(messages)

    captured_uri = {"uri": None}

    def fake_connect(uri, *args, **kwargs):
        captured_uri["uri"] = uri
        return FakeConnect(ws)

    import websockets
    monkeypatch.setattr(websockets, "connect", fake_connect)

    client = DummyClient()
    watcher = Watcher(client, job_id="jid", kind=kind)

    watcher.start()

    deadline = time.time() + 2
    while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
        time.sleep(0.01)

    watcher.stop()

    assert captured_uri["uri"] is not None
    expected = "ws://localhost/v2/crawl/jid" if kind == "crawl" else "ws://localhost/v2/batch/scrape/jid"
    assert captured_uri["uri"] == expected
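Taken together, these tests document the public surface of `firecrawl.v2.watcher.Watcher` as exercised here: it takes a client whose `http_client` exposes `api_url` and `api_key`, builds a `ws://…/v2/crawl/<job_id>` or `ws://…/v2/batch/scrape/<job_id>` URI, forwards the API key as a Bearer `Authorization` header, and dispatches `document`, `done`, and `error` events plus job-status snapshots from a background thread. The sketch below shows that wiring only; the stub classes are hypothetical stand-ins modeled on the tests' `DummyClient`, and without a reachable Firecrawl API at `api_url` there is nothing real to connect to.

```python
# Sketch only: reproduces the listener wiring the tests above exercise.
# StubClient/StubHttpClient are hypothetical stand-ins (mirroring DummyClient),
# not classes shipped by the package.
from firecrawl.v2.watcher import Watcher

class StubHttpClient:
    def __init__(self):
        self.api_url = "http://localhost"  # the URI test above shows http:// mapped to ws://
        self.api_key = "TEST"              # forwarded as "Authorization: Bearer ..."

class StubClient:
    def __init__(self):
        self.http_client = StubHttpClient()

watcher = Watcher(StubClient(), job_id="jid", kind="crawl")  # or kind="batch"
watcher.add_event_listener("document", lambda doc: print("document:", doc))
watcher.add_event_listener("done", lambda data: print("done:", data))
watcher.add_event_listener("error", lambda err: print("error:", err))
watcher.add_listener(lambda job: print("status:", job.status))  # snapshot updates

watcher.start()  # WebSocket loop runs on a background thread
# ... let the job progress ...
watcher.stop()
```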