firecrawl-4.12.0-py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. firecrawl/__init__.py +87 -0
  2. firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
  10. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  11. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  13. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
  14. firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
  15. firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
  16. firecrawl/__tests__/e2e/v2/test_map.py +61 -0
  17. firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
  18. firecrawl/__tests__/e2e/v2/test_search.py +270 -0
  19. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  20. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  21. firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
  28. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  29. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  30. firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
  31. firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
  32. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  33. firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  34. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  35. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  36. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  37. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
  38. firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
  39. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
  40. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
  41. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  42. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  43. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  44. firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
  45. firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
  46. firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
  47. firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
  48. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  49. firecrawl/client.py +281 -0
  50. firecrawl/firecrawl.backup.py +4635 -0
  51. firecrawl/types.py +167 -0
  52. firecrawl/v1/__init__.py +14 -0
  53. firecrawl/v1/client.py +5164 -0
  54. firecrawl/v2/__init__.py +4 -0
  55. firecrawl/v2/client.py +967 -0
  56. firecrawl/v2/client_async.py +408 -0
  57. firecrawl/v2/methods/agent.py +144 -0
  58. firecrawl/v2/methods/aio/__init__.py +1 -0
  59. firecrawl/v2/methods/aio/agent.py +137 -0
  60. firecrawl/v2/methods/aio/batch.py +188 -0
  61. firecrawl/v2/methods/aio/crawl.py +351 -0
  62. firecrawl/v2/methods/aio/extract.py +133 -0
  63. firecrawl/v2/methods/aio/map.py +65 -0
  64. firecrawl/v2/methods/aio/scrape.py +33 -0
  65. firecrawl/v2/methods/aio/search.py +176 -0
  66. firecrawl/v2/methods/aio/usage.py +89 -0
  67. firecrawl/v2/methods/batch.py +499 -0
  68. firecrawl/v2/methods/crawl.py +592 -0
  69. firecrawl/v2/methods/extract.py +161 -0
  70. firecrawl/v2/methods/map.py +83 -0
  71. firecrawl/v2/methods/scrape.py +64 -0
  72. firecrawl/v2/methods/search.py +215 -0
  73. firecrawl/v2/methods/usage.py +84 -0
  74. firecrawl/v2/types.py +1143 -0
  75. firecrawl/v2/utils/__init__.py +9 -0
  76. firecrawl/v2/utils/error_handler.py +107 -0
  77. firecrawl/v2/utils/get_version.py +15 -0
  78. firecrawl/v2/utils/http_client.py +178 -0
  79. firecrawl/v2/utils/http_client_async.py +69 -0
  80. firecrawl/v2/utils/normalize.py +125 -0
  81. firecrawl/v2/utils/validation.py +692 -0
  82. firecrawl/v2/watcher.py +301 -0
  83. firecrawl/v2/watcher_async.py +243 -0
  84. firecrawl-4.12.0.dist-info/METADATA +234 -0
  85. firecrawl-4.12.0.dist-info/RECORD +92 -0
  86. firecrawl-4.12.0.dist-info/WHEEL +5 -0
  87. firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
  88. firecrawl-4.12.0.dist-info/top_level.txt +2 -0
  89. tests/test_agent_integration.py +277 -0
  90. tests/test_api_key_handling.py +44 -0
  91. tests/test_change_tracking.py +98 -0
  92. tests/test_timeout_conversion.py +117 -0
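The layout above pairs a top-level client with versioned v1 and v2 subpackages (sync and async method modules, shared types, validation utilities, and WebSocket watchers), plus the bundled test suites. The import lines below, copied from the unit tests shipped in the wheel, show how the v2 namespace is addressed; they are the only import paths confirmed by this diff:

    from firecrawl.v2.types import JsonFormat, ScrapeOptions, PDFParser
    from firecrawl.v2.utils.validation import validate_scrape_options, prepare_scrape_options
    from firecrawl.v2.watcher import Watcher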
firecrawl/__tests__/unit/v2/utils/test_validation.py
@@ -0,0 +1,311 @@
+ import pytest
+ from firecrawl.v2.types import JsonFormat, ScrapeOptions, PDFParser
+ from firecrawl.v2.utils.validation import validate_scrape_options, prepare_scrape_options
+
+
+ class TestValidateScrapeOptions:
+     """Unit tests for validate_scrape_options function."""
+
+     def test_validate_none_options(self):
+         """Test validation with None options."""
+         result = validate_scrape_options(None)
+         assert result is None
+
+     def test_validate_valid_options(self):
+         """Test validation with valid options."""
+         options = ScrapeOptions(
+             formats=["markdown"],
+             timeout=30000,
+             wait_for=2000
+         )
+         result = validate_scrape_options(options)
+         assert result == options
+
+     def test_validate_invalid_timeout(self):
+         """Test validation with invalid timeout."""
+         options = ScrapeOptions(timeout=0)
+         with pytest.raises(ValueError, match="Timeout must be positive"):
+             validate_scrape_options(options)
+
+     def test_validate_negative_timeout(self):
+         """Test validation with negative timeout."""
+         options = ScrapeOptions(timeout=-1000)
+         with pytest.raises(ValueError, match="Timeout must be positive"):
+             validate_scrape_options(options)
+
+     def test_validate_invalid_wait_for(self):
+         """Test validation with invalid wait_for."""
+         options = ScrapeOptions(wait_for=-500)
+         with pytest.raises(ValueError, match="wait_for must be non-negative"):
+             validate_scrape_options(options)
+
+     def test_validate_zero_wait_for(self):
+         """Test validation with zero wait_for (should be valid)."""
+         options = ScrapeOptions(wait_for=0)
+         result = validate_scrape_options(options)
+         assert result == options
+
+     def test_validate_complex_options(self):
+         """Test validation with complex options."""
+         options = ScrapeOptions(
+             formats=["markdown", "html"],
+             headers={"User-Agent": "Test"},
+             include_tags=["h1", "h2"],
+             exclude_tags=["nav"],
+             only_main_content=False,
+             timeout=15000,
+             wait_for=2000,
+             mobile=True,
+             skip_tls_verification=True,
+             remove_base64_images=False,
+             raw_html=True,
+             screenshot_full_page=True
+         )
+         result = validate_scrape_options(options)
+         assert result == options
+
+     def test_validate_multiple_invalid_fields(self):
+         """Test validation with multiple invalid fields."""
+         options = ScrapeOptions(timeout=-1000, wait_for=-500)
+         with pytest.raises(ValueError, match="Timeout must be positive"):
+             validate_scrape_options(options)
+         # Should fail on first invalid field (timeout)
+
+     def test_validate_edge_cases(self):
+         """Test validation with edge case values."""
+         # Test with very large timeout
+         options = ScrapeOptions(timeout=999999)
+         result = validate_scrape_options(options)
+         assert result == options
+
+         # Test with very large wait_for
+         options = ScrapeOptions(wait_for=999999)
+         result = validate_scrape_options(options)
+         assert result == options
+
+
+ class TestPrepareScrapeOptions:
+     """Unit tests for prepare_scrape_options function."""
+
+     def test_prepare_none_options(self):
+         """Test preparation with None options."""
+         result = prepare_scrape_options(None)
+         assert result is None
+
+     def test_prepare_basic_options(self):
+         """Test preparation with basic options."""
+         options = ScrapeOptions(
+             formats=["markdown"],
+             timeout=30000,
+             wait_for=2000
+         )
+         result = prepare_scrape_options(options)
+
+         assert isinstance(result, dict)
+         assert "formats" in result
+         assert "timeout" in result
+         assert "waitFor" in result
+         assert result["timeout"] == 30000
+         assert result["waitFor"] == 2000
+
+     def test_prepare_snake_case_conversion(self):
+         """Test snake_case to camelCase conversion."""
+         options = ScrapeOptions(
+             include_tags=["h1", "h2"],
+             exclude_tags=["nav"],
+             only_main_content=False,
+             wait_for=2000,
+             skip_tls_verification=True,
+             remove_base64_images=False
+             # Note: raw_html should be in formats array, not as a separate field
+         )
+         result = prepare_scrape_options(options)
+
+         # Check conversions
+         assert "includeTags" in result
+         assert result["includeTags"] == ["h1", "h2"]
+         assert "excludeTags" in result
+         assert result["excludeTags"] == ["nav"]
+         assert "onlyMainContent" in result
+         assert result["onlyMainContent"] is False
+         assert "waitFor" in result
+         assert result["waitFor"] == 2000
+         assert "skipTlsVerification" in result
+         assert result["skipTlsVerification"] is True
+         assert "removeBase64Images" in result
+         assert result["removeBase64Images"] is False
+
+         # Check that snake_case fields are not present
+         assert "include_tags" not in result
+         assert "exclude_tags" not in result
+         assert "only_main_content" not in result
+         assert "wait_for" not in result
+         assert "skip_tls_verification" not in result
+         assert "remove_base64_images" not in result
+
+     def test_prepare_complex_options(self):
+         """Test preparation with complex options."""
+         options = ScrapeOptions(
+             formats=["markdown", "html", "rawHtml"],
+             headers={"User-Agent": "Test Bot"},
+             include_tags=["h1", "h2", "h3"],
+             exclude_tags=["nav", "footer"],
+             only_main_content=False,
+             timeout=15000,
+             wait_for=2000,
+             mobile=True,
+             skip_tls_verification=True,
+             remove_base64_images=False
+         )
+         result = prepare_scrape_options(options)
+
+         # Check all fields are present and converted
+         assert "formats" in result
+         assert "headers" in result
+         assert "includeTags" in result
+         assert "excludeTags" in result
+         assert "onlyMainContent" in result
+         assert "timeout" in result
+         assert "waitFor" in result
+         assert "mobile" in result
+         assert "skipTlsVerification" in result
+         assert "removeBase64Images" in result
+
+         # Check values
+         assert result["formats"] == ["markdown", "html", "rawHtml"]
+         assert result["headers"] == {"User-Agent": "Test Bot"}
+         assert result["includeTags"] == ["h1", "h2", "h3"]
+         assert result["excludeTags"] == ["nav", "footer"]
+         assert result["onlyMainContent"] is False
+         assert result["timeout"] == 15000
+         assert result["waitFor"] == 2000
+         assert result["mobile"] is True
+         assert result["skipTlsVerification"] is True
+         assert result["removeBase64Images"] is False
+
+     def test_prepare_invalid_options(self):
+         """Test preparation with invalid options (should raise error)."""
+         options = ScrapeOptions(timeout=-1000)
+         with pytest.raises(ValueError, match="Timeout must be positive"):
+             prepare_scrape_options(options)
+
+     def test_prepare_empty_options(self):
+         """Test preparation with empty options."""
+         options = ScrapeOptions()  # All defaults
+         result = prepare_scrape_options(options)
+
+         # Should return dict with default values
+         assert isinstance(result, dict)
+         assert "onlyMainContent" in result
+         assert result["onlyMainContent"] is True
+         assert "mobile" in result
+         assert result["mobile"] is False
+
+     def test_prepare_none_values(self):
+         """Test preparation with None values in options."""
+         options = ScrapeOptions(
+             formats=None,
+             timeout=None,
+             wait_for=None,
+             include_tags=None,
+             exclude_tags=None
+         )
+         result = prepare_scrape_options(options)
+
+         # Should only include non-None values
+         assert isinstance(result, dict)
+         # Should have default values for required fields
+         assert "onlyMainContent" in result
+         assert "mobile" in result
+
+     def test_format_schema_conversion(self):
+         """Test that Format schema is properly handled."""
+         # Create a JsonFormat object with schema
+         format_obj = JsonFormat(
+             type="json",
+             prompt="Extract product info",
+             schema={"type": "object", "properties": {"name": {"type": "string"}}}
+         )
+
+         dumped = format_obj.model_dump()
+         assert "schema" in dumped
+         assert dumped["schema"] == {"type": "object", "properties": {"name": {"type": "string"}}}
+
+     def test_prepare_new_v2_fields(self):
+         """Test preparation with new v2 fields."""
+         from firecrawl.v2.types import Viewport, ScreenshotAction
+
+         viewport = Viewport(width=1920, height=1080)
+         screenshot_action = ScreenshotAction(
+             type="screenshot",
+             full_page=True,
+             quality=90,
+             viewport=viewport
+         )
+
+         options = ScrapeOptions(
+             fast_mode=True,
+             use_mock="test-mock",
+             block_ads=False,
+             store_in_cache=False,
+             max_age=7200000,  # 2 hours
+             actions=[screenshot_action],
+             parsers=["pdf"]
+         )
+
+         result = prepare_scrape_options(options)
+
+         # Check new field conversions
+         assert "fastMode" in result
+         assert result["fastMode"] is True
+         assert "useMock" in result
+         assert result["useMock"] == "test-mock"
+         assert "blockAds" in result
+         assert result["blockAds"] is False
+         assert "storeInCache" in result
+         assert result["storeInCache"] is False
+         assert "maxAge" in result
+         assert result["maxAge"] == 7200000
+
+         # Check actions conversion
+         assert "actions" in result
+         assert len(result["actions"]) == 1
+         action = result["actions"][0]
+         assert action["type"] == "screenshot"
+         assert action["fullPage"] is True
+         assert action["quality"] == 90
+         assert "viewport" in action
+         assert action["viewport"]["width"] == 1920
+         assert action["viewport"]["height"] == 1080
+
+         # Check parsers
+         assert "parsers" in result
+         assert result["parsers"] == ["pdf"]
+
+         # Check that snake_case fields are not present
+         assert "fast_mode" not in result
+         assert "use_mock" not in result
+         assert "block_ads" not in result
+         assert "store_in_cache" not in result
+         assert "max_age" not in result
+
+     def test_prepare_parsers_max_pages_dict(self):
+         """Ensure parser dicts convert max_pages to maxPages."""
+         options = ScrapeOptions(
+             parsers=[{"type": "pdf", "max_pages": 3}]
+         )
+
+         result = prepare_scrape_options(options)
+
+         assert "parsers" in result
+         assert result["parsers"][0]["maxPages"] == 3
+         assert "max_pages" not in result["parsers"][0]
+
+     def test_prepare_parsers_max_pages_model(self):
+         """Ensure parser models convert max_pages to maxPages."""
+         parser = PDFParser(max_pages=5)
+         options = ScrapeOptions(parsers=[parser])
+
+         result = prepare_scrape_options(options)
+
+         assert result["parsers"][0]["maxPages"] == 5
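The file above pins down the request-preparation behavior: `prepare_scrape_options` first validates a `ScrapeOptions` model (rejecting non-positive timeouts and negative `wait_for` values) and then serializes it into the camelCase payload the API expects. A minimal sketch of that flow, using only names and behaviors asserted in the tests above:

    from firecrawl.v2.types import ScrapeOptions
    from firecrawl.v2.utils.validation import prepare_scrape_options

    # snake_case on the model side...
    options = ScrapeOptions(formats=["markdown"], timeout=30000, wait_for=2000)

    # ...camelCase in the serialized payload; invalid values raise ValueError instead.
    payload = prepare_scrape_options(options)
    assert payload["waitFor"] == 2000 and payload["timeout"] == 30000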
firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py
@@ -0,0 +1,332 @@
+ import asyncio
+ import json
+ import time
+ import pytest
+
+ from firecrawl.v2.watcher import Watcher
+
+
+ class DummyHttpClient:
+     def __init__(self, api_url: str = "http://localhost", api_key: str = "TEST"):
+         self.api_url = api_url
+         self.api_key = api_key
+
+
+ class DummyClient:
+     def __init__(self):
+         self.http_client = DummyHttpClient()
+
+
+ class FakeWebSocket:
+     def __init__(self, messages):
+         # messages: list of dicts to be json-dumped
+         self._messages = list(messages)
+
+     async def recv(self):
+         if not self._messages:
+             # No more messages; block a bit to allow loop to end
+             await asyncio.sleep(0.01)
+             # Simulate disconnect
+             raise asyncio.CancelledError()
+         msg = self._messages.pop(0)
+         return json.dumps(msg)
+
+
+ class FakeConnect:
+     def __init__(self, ws: FakeWebSocket):
+         self._ws = ws
+
+     async def __aenter__(self):
+         return self._ws
+
+     async def __aexit__(self, exc_type, exc, tb):
+         return False
+
+
+ @pytest.mark.parametrize("kind", ["crawl", "batch"])
+ def test_ws_watcher_document_and_done(monkeypatch, kind):
+     # Prepare messages: one document then done
+     messages = [
+         {"type": "document", "data": {"url": "https://example.com", "rawHtml": "<html>"}},
+         {"type": "done", "data": {"status": "completed", "data": []}},
+     ]
+
+     ws = FakeWebSocket(messages)
+
+     def fake_connect(uri, *args, **kwargs):
+         return FakeConnect(ws)
+
+     import websockets
+     monkeypatch.setattr(websockets, "connect", fake_connect)
+
+     client = DummyClient()
+     watcher = Watcher(client, job_id="jid", kind=kind)
+
+     events = {"document": 0, "done": 0}
+     statuses = []
+
+     watcher.add_event_listener("document", lambda d: events.__setitem__("document", events["document"] + 1))
+     watcher.add_event_listener("done", lambda d: events.__setitem__("done", events["done"] + 1))
+     watcher.add_listener(lambda s: statuses.append(s.status))
+
+     watcher.start()
+
+     # Wait for thread to finish
+     deadline = time.time() + 2
+     while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
+         time.sleep(0.01)
+
+     watcher.stop()
+
+     assert events["document"] >= 1
+     assert events["done"] == 1
+     assert statuses[-1] in ("completed", "failed", "cancelled")
+
+
+ def test_ws_watcher_error_event(monkeypatch):
+     messages = [
+         {"type": "error", "error": "boom", "data": {"status": "failed"}},
+     ]
+     ws = FakeWebSocket(messages)
+
+     def fake_connect(uri, *args, **kwargs):
+         return FakeConnect(ws)
+
+     import websockets
+     monkeypatch.setattr(websockets, "connect", fake_connect)
+
+     client = DummyClient()
+     watcher = Watcher(client, job_id="jid", kind="crawl")
+
+     seen_error = {"count": 0}
+     watcher.add_event_listener("error", lambda d: seen_error.__setitem__("count", seen_error["count"] + 1))
+
+     watcher.start()
+
+     deadline = time.time() + 2
+     while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
+         time.sleep(0.01)
+
+     watcher.stop()
+
+     assert seen_error["count"] == 1
+
+
+ @pytest.mark.parametrize("kind", ["crawl", "batch"])
+ def test_ws_watcher_catchup_dispatches_documents_and_updates_status(monkeypatch, kind):
+     messages = [
+         {
+             "type": "catchup",
+             "data": {
+                 "status": "scraping",
+                 "data": [
+                     {"url": "https://example.com/1", "rawHtml": "<html>1</html>"},
+                     {"url": "https://example.com/2", "rawHtml": "<html>2</html>"},
+                 ],
+             },
+         }
+     ]
+
+     ws = FakeWebSocket(messages)
+
+     def fake_connect(uri, *args, **kwargs):
+         return FakeConnect(ws)
+
+     import websockets
+     monkeypatch.setattr(websockets, "connect", fake_connect)
+
+     client = DummyClient()
+     watcher = Watcher(client, job_id="jid", kind=kind)
+
+     events = {"document": 0}
+     statuses = []
+
+     watcher.add_event_listener("document", lambda d: events.__setitem__("document", events["document"] + 1))
+     watcher.add_listener(lambda s: statuses.append(s.status))
+
+     watcher.start()
+
+     deadline = time.time() + 2
+     while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
+         time.sleep(0.01)
+
+     watcher.stop()
+
+     assert events["document"] == 2
+     assert statuses[-1] == "scraping"
+
+
+ @pytest.mark.parametrize("kind", ["crawl", "batch"])
+ def test_ws_watcher_status_only_terminal_snapshot_triggers_done(monkeypatch, kind):
+     # No explicit type, only a terminal status snapshot
+     messages = [
+         {"data": {"status": "completed", "data": []}},
+     ]
+
+     ws = FakeWebSocket(messages)
+
+     def fake_connect(uri, *args, **kwargs):
+         return FakeConnect(ws)
+
+     import websockets
+     monkeypatch.setattr(websockets, "connect", fake_connect)
+
+     client = DummyClient()
+     watcher = Watcher(client, job_id="jid", kind=kind)
+
+     events = {"done": 0}
+     statuses = []
+
+     watcher.add_event_listener("done", lambda d: events.__setitem__("done", events["done"] + 1))
+     watcher.add_listener(lambda s: statuses.append(s.status))
+
+     watcher.start()
+
+     deadline = time.time() + 2
+     while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
+         time.sleep(0.01)
+
+     watcher.stop()
+
+     assert events["done"] == 1
+     assert statuses[-1] == "completed"
+
+
+ def test_ws_watcher_batch_cancelled_snapshot_no_done_event(monkeypatch):
+     # Batch-only: cancelled snapshot should end without a 'done' event
+     messages = [
+         {"data": {"status": "cancelled", "data": []}},
+     ]
+
+     ws = FakeWebSocket(messages)
+
+     def fake_connect(uri, *args, **kwargs):
+         return FakeConnect(ws)
+
+     import websockets
+     monkeypatch.setattr(websockets, "connect", fake_connect)
+
+     client = DummyClient()
+     watcher = Watcher(client, job_id="jid", kind="batch")
+
+     events = {"done": 0}
+     statuses = []
+
+     watcher.add_event_listener("done", lambda d: events.__setitem__("done", events["done"] + 1))
+     watcher.add_listener(lambda s: statuses.append(s.status))
+
+     watcher.start()
+
+     deadline = time.time() + 2
+     while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
+         time.sleep(0.01)
+
+     watcher.stop()
+
+     assert events["done"] == 0
+     assert statuses[-1] == "cancelled"
+
+
+ def test_ws_watcher_propagates_authorization_header(monkeypatch):
+     # Ensure Authorization header is forwarded to websockets.connect
+     messages = [
+         {"type": "done", "data": {"status": "completed", "data": []}},
+     ]
+
+     ws = FakeWebSocket(messages)
+
+     captured_headers = {"headers": None}
+
+     def fake_connect(uri, *args, **kwargs):
+         captured_headers["headers"] = kwargs.get("additional_headers")
+         return FakeConnect(ws)
+
+     import websockets
+     monkeypatch.setattr(websockets, "connect", fake_connect)
+
+     client = DummyClient()
+     watcher = Watcher(client, job_id="jid", kind="crawl")
+
+     watcher.start()
+
+     deadline = time.time() + 2
+     while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
+         time.sleep(0.01)
+
+     watcher.stop()
+
+     assert captured_headers["headers"] is not None
+     # Expect an Authorization header with Bearer token
+     assert any(h[0] == "Authorization" and "Bearer" in h[1] for h in captured_headers["headers"])
+
+
+ @pytest.mark.parametrize("kind", ["crawl", "batch"])
+ def test_ws_watcher_normalizes_document_fields_in_snapshot(monkeypatch, kind):
+     # Status-only snapshot with camelCase fields should be normalized in emitted job
+     messages = [
+         {"data": {"status": "completed", "data": [
+             {"url": "https://example.com/x", "rawHtml": "<x>", "changeTracking": {"modes": ["git-diff"]}}
+         ]}},
+     ]
+
+     ws = FakeWebSocket(messages)
+
+     def fake_connect(uri, *args, **kwargs):
+         return FakeConnect(ws)
+
+     import websockets
+     monkeypatch.setattr(websockets, "connect", fake_connect)
+
+     client = DummyClient()
+     watcher = Watcher(client, job_id="jid", kind=kind)
+
+     jobs = []
+     watcher.add_listener(lambda j: jobs.append(j))
+
+     watcher.start()
+
+     deadline = time.time() + 2
+     while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
+         time.sleep(0.01)
+
+     watcher.stop()
+
+     assert jobs, "No job snapshots emitted"
+     last_job = jobs[-1]
+     assert last_job.status == "completed"
+     assert last_job.data and last_job.data[0].raw_html == "<x>"
+     assert last_job.data[0].change_tracking is not None
+
+
+ @pytest.mark.parametrize("kind", ["crawl", "batch"])
+ def test_ws_watcher_uses_correct_ws_uri(monkeypatch, kind):
+     # Verify WS URI uses the correct path per kind and http->ws scheme
+     messages = [
+         {"type": "done", "data": {"status": "completed", "data": []}},
+     ]
+
+     ws = FakeWebSocket(messages)
+
+     captured_uri = {"uri": None}
+
+     def fake_connect(uri, *args, **kwargs):
+         captured_uri["uri"] = uri
+         return FakeConnect(ws)
+
+     import websockets
+     monkeypatch.setattr(websockets, "connect", fake_connect)
+
+     client = DummyClient()
+     watcher = Watcher(client, job_id="jid", kind=kind)
+
+     watcher.start()
+
+     deadline = time.time() + 2
+     while watcher._thread and watcher._thread.is_alive() and time.time() < deadline:
+         time.sleep(0.01)
+
+     watcher.stop()
+
+     assert captured_uri["uri"] is not None
+     expected = "ws://localhost/v2/crawl/jid" if kind == "crawl" else "ws://localhost/v2/batch/scrape/jid"
+     assert captured_uri["uri"] == expected
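For context, these tests drive the synchronous `Watcher` against a faked `websockets.connect`: it derives a ws:// URL at `/v2/crawl/<job-id>` or `/v2/batch/scrape/<job-id>` from the client's API URL, forwards the API key as a Bearer `Authorization` header, normalizes camelCase document fields, and dispatches `document`, `done`, and `error` events plus status snapshots. A rough usage sketch built only from the calls exercised above (the stub client is a stand-in; the tests only require an object exposing `http_client.api_url` and `http_client.api_key`):

    from firecrawl.v2.watcher import Watcher

    class _StubHttp:
        api_url = "http://localhost"  # placeholder, mirroring DummyHttpClient in the tests
        api_key = "TEST"

    class _StubClient:
        http_client = _StubHttp()

    watcher = Watcher(_StubClient(), job_id="jid", kind="crawl")  # or kind="batch"
    watcher.add_event_listener("document", lambda doc: print("document:", doc))
    watcher.add_event_listener("done", lambda data: print("done"))
    watcher.add_listener(lambda snapshot: print("status:", snapshot.status))

    watcher.start()  # runs the WebSocket loop on a background thread (the tests poll watcher._thread)
    # ... wait for the job to reach a terminal status ...
    watcher.stop()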