firecrawl-4.12.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. firecrawl/__init__.py +87 -0
  2. firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
  10. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  11. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  13. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
  14. firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
  15. firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
  16. firecrawl/__tests__/e2e/v2/test_map.py +61 -0
  17. firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
  18. firecrawl/__tests__/e2e/v2/test_search.py +270 -0
  19. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  20. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  21. firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
  28. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  29. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  30. firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
  31. firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
  32. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  33. firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  34. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  35. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  36. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  37. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
  38. firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
  39. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
  40. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
  41. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  42. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  43. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  44. firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
  45. firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
  46. firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
  47. firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
  48. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  49. firecrawl/client.py +281 -0
  50. firecrawl/firecrawl.backup.py +4635 -0
  51. firecrawl/types.py +167 -0
  52. firecrawl/v1/__init__.py +14 -0
  53. firecrawl/v1/client.py +5164 -0
  54. firecrawl/v2/__init__.py +4 -0
  55. firecrawl/v2/client.py +967 -0
  56. firecrawl/v2/client_async.py +408 -0
  57. firecrawl/v2/methods/agent.py +144 -0
  58. firecrawl/v2/methods/aio/__init__.py +1 -0
  59. firecrawl/v2/methods/aio/agent.py +137 -0
  60. firecrawl/v2/methods/aio/batch.py +188 -0
  61. firecrawl/v2/methods/aio/crawl.py +351 -0
  62. firecrawl/v2/methods/aio/extract.py +133 -0
  63. firecrawl/v2/methods/aio/map.py +65 -0
  64. firecrawl/v2/methods/aio/scrape.py +33 -0
  65. firecrawl/v2/methods/aio/search.py +176 -0
  66. firecrawl/v2/methods/aio/usage.py +89 -0
  67. firecrawl/v2/methods/batch.py +499 -0
  68. firecrawl/v2/methods/crawl.py +592 -0
  69. firecrawl/v2/methods/extract.py +161 -0
  70. firecrawl/v2/methods/map.py +83 -0
  71. firecrawl/v2/methods/scrape.py +64 -0
  72. firecrawl/v2/methods/search.py +215 -0
  73. firecrawl/v2/methods/usage.py +84 -0
  74. firecrawl/v2/types.py +1143 -0
  75. firecrawl/v2/utils/__init__.py +9 -0
  76. firecrawl/v2/utils/error_handler.py +107 -0
  77. firecrawl/v2/utils/get_version.py +15 -0
  78. firecrawl/v2/utils/http_client.py +178 -0
  79. firecrawl/v2/utils/http_client_async.py +69 -0
  80. firecrawl/v2/utils/normalize.py +125 -0
  81. firecrawl/v2/utils/validation.py +692 -0
  82. firecrawl/v2/watcher.py +301 -0
  83. firecrawl/v2/watcher_async.py +243 -0
  84. firecrawl-4.12.0.dist-info/METADATA +234 -0
  85. firecrawl-4.12.0.dist-info/RECORD +92 -0
  86. firecrawl-4.12.0.dist-info/WHEEL +5 -0
  87. firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
  88. firecrawl-4.12.0.dist-info/top_level.txt +2 -0
  89. tests/test_agent_integration.py +277 -0
  90. tests/test_api_key_handling.py +44 -0
  91. tests/test_change_tracking.py +98 -0
  92. tests/test_timeout_conversion.py +117 -0
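
Each of the four e2e test modules reproduced below follows the same setup convention: credentials are loaded from a .env file and a fresh client is built per test. A minimal sketch of that pattern, assuming the same API_KEY and API_URL environment variables the tests read (the variable names are the tests' convention, not an SDK requirement):

    import os
    from dotenv import load_dotenv
    from firecrawl import Firecrawl

    load_dotenv()  # pulls API_KEY / API_URL from a local .env file
    client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
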
firecrawl/__tests__/e2e/v2/test_crawl.py
@@ -0,0 +1,278 @@
+ import pytest
+ import time
+ import os
+ from dotenv import load_dotenv
+ from firecrawl import Firecrawl
+ from firecrawl.v2.types import ScrapeOptions
+
+ load_dotenv()
+
+ if not os.getenv("API_KEY"):
+     raise ValueError("API_KEY is not set")
+
+ if not os.getenv("API_URL"):
+     raise ValueError("API_URL is not set")
+
+ class TestCrawlE2E:
+     """End-to-end tests for crawl functionality."""
+
+     def setup_method(self):
+         """Set up the test client."""
+         self.client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+     def test_start_crawl_minimal_request(self):
+         """Test starting a crawl with minimal parameters."""
+         crawl_job = self.client.start_crawl("https://docs.firecrawl.dev", limit=3)
+
+         assert crawl_job.id is not None
+         assert crawl_job.url is not None
+
+     def test_start_crawl_with_options(self):
+         """Test starting a crawl with options."""
+         crawl_job = self.client.start_crawl(
+             "https://docs.firecrawl.dev",
+             limit=5,
+             max_discovery_depth=2
+         )
+
+         assert crawl_job.id is not None
+         assert crawl_job.url is not None
+
+     def test_start_crawl_with_prompt(self):
+         """Test starting a crawl with a prompt."""
+         crawl_job = self.client.start_crawl(
+             "https://firecrawl.dev",
+             prompt="Extract all blog posts",
+             limit=3
+         )
+
+         assert crawl_job.id is not None
+         assert crawl_job.url is not None
+
+     def test_get_crawl_status(self):
+         """Test getting crawl status."""
+         # First start a crawl
+         start_job = self.client.start_crawl("https://docs.firecrawl.dev", limit=3)
+         assert start_job.id is not None
+
+         job_id = start_job.id
+
+         # Get status
+         status_job = self.client.get_crawl_status(job_id)
+
+         assert status_job.status in ["scraping", "completed", "failed"]
+         assert status_job.completed >= 0
+         assert status_job.expires_at is not None
+         assert status_job.next is None
+         assert isinstance(status_job.data, list)
+
+     def test_cancel_crawl(self):
+         """Test canceling a crawl."""
+         start_job = self.client.start_crawl("https://docs.firecrawl.dev", limit=3)
+         assert start_job.id is not None
+
+         job_id = start_job.id
+         cancel_job = self.client.cancel_crawl(job_id)
+
+         time.sleep(5)
+         assert cancel_job is True
+
+     def test_get_crawl_errors(self):
+         """Test getting crawl errors."""
+         # First start a crawl
+         start_job = self.client.start_crawl("https://docs.firecrawl.dev", limit=3)
+         assert start_job.id is not None
+
+         job_id = start_job.id
+
+         # Get errors (should work even if no errors exist)
+         errors_response = self.client.get_crawl_errors(job_id)
+
+         # Verify the response structure
+         assert hasattr(errors_response, 'errors')
+         assert hasattr(errors_response, 'robots_blocked')
+         assert isinstance(errors_response.errors, list)
+         assert isinstance(errors_response.robots_blocked, list)
+
+         # Errors list should contain dictionaries with the expected fields
+         for error in errors_response.errors:
+             assert isinstance(error, dict)
+             assert 'id' in error
+             assert 'timestamp' in error
+             assert 'url' in error
+             assert 'error' in error
+             assert isinstance(error['id'], str)
+             assert isinstance(error['timestamp'], str)
+             assert isinstance(error['url'], str)
+             assert isinstance(error['error'], str)
+
+         # Robots blocked should be a list of strings
+         for blocked_url in errors_response.robots_blocked:
+             assert isinstance(blocked_url, str)
+
+     def test_get_crawl_errors_with_invalid_job_id(self):
+         """Test getting crawl errors with an invalid job ID."""
+         with pytest.raises(Exception):
+             self.client.get_crawl_errors("invalid-job-id-12345")
+
+     def test_get_active_crawls(self):
+         """Test getting active crawls."""
+         # Get active crawls
+         active_crawls_response = self.client.active_crawls()
+
+         # Verify the response structure
+         assert hasattr(active_crawls_response, 'success')
+         assert hasattr(active_crawls_response, 'crawls')
+         assert isinstance(active_crawls_response.success, bool)
+         assert isinstance(active_crawls_response.crawls, list)
+
+         # Each crawl should have the required fields
+         for crawl in active_crawls_response.crawls:
+             assert hasattr(crawl, 'id')
+             assert hasattr(crawl, 'team_id')
+             assert hasattr(crawl, 'url')
+             assert isinstance(crawl.id, str)
+             assert isinstance(crawl.team_id, str)
+             assert isinstance(crawl.url, str)
+
+             # Options field is optional, but if present it should be a dict
+             if hasattr(crawl, 'options') and crawl.options is not None:
+                 assert isinstance(crawl.options, dict)
+
+     def test_get_active_crawls_with_running_crawl(self):
+         """Test getting active crawls when there's a running crawl."""
+         # Start a crawl
+         start_job = self.client.start_crawl("https://docs.firecrawl.dev", limit=5)
+         assert start_job.id is not None
+
+         # Get active crawls
+         active_crawls_response = self.client.active_crawls()
+
+         # Verify the response structure
+         assert hasattr(active_crawls_response, 'success')
+         assert hasattr(active_crawls_response, 'crawls')
+         assert isinstance(active_crawls_response.success, bool)
+         assert isinstance(active_crawls_response.crawls, list)
+
+         # The started crawl should be in the active crawls list
+         active_crawl_ids = [crawl.id for crawl in active_crawls_response.crawls]
+         assert start_job.id in active_crawl_ids
+
+         # Cancel the crawl to clean up
+         self.client.cancel_crawl(start_job.id)
+
+     def test_crawl_with_wait(self):
+         """Test crawl with wait for completion."""
+         crawl_job = self.client.crawl(
+             "docs.firecrawl.dev",
+             limit=3,
+             max_discovery_depth=2,
+             poll_interval=1,
+             timeout=120,
+             integration="_e2e-test",
+         )
+
+         assert crawl_job.status in ["completed", "failed"]
+         assert crawl_job.completed >= 0
+         assert crawl_job.total >= 0
+         assert isinstance(crawl_job.data, list)
+
+     def test_crawl_with_prompt_and_wait(self):
+         """Test crawl with a prompt and wait for completion."""
+         crawl_job = self.client.crawl(
+             "https://docs.firecrawl.dev",
+             prompt="Extract all blog posts",
+             limit=3,
+             poll_interval=1,
+             timeout=120
+         )
+
+         assert crawl_job.status in ["completed", "failed"]
+         assert crawl_job.completed >= 0
+         assert crawl_job.total >= 0
+         assert isinstance(crawl_job.data, list)
+
+     def test_crawl_with_scrape_options(self):
+         """Test crawl with scrape options."""
+         scrape_opts = ScrapeOptions(
+             formats=["markdown", "links"],
+             only_main_content=False,
+             mobile=True,
+         )
+
+         crawl_job = self.client.start_crawl(
+             "https://docs.firecrawl.dev",
+             limit=2,
+             scrape_options=scrape_opts
+         )
+
+         assert crawl_job.id is not None
+
+     def test_crawl_with_json_format_object(self):
+         """Crawl with scrape_options including a JSON format object (prompt + schema)."""
+         json_schema = {
+             "type": "object",
+             "properties": {
+                 "title": {"type": "string"}
+             },
+             "required": ["title"],
+         }
+         scrape_opts = ScrapeOptions(
+             formats=[{"type": "json", "prompt": "Extract page title", "schema": json_schema}]
+         )
+         crawl_job = self.client.start_crawl(
+             "https://docs.firecrawl.dev",
+             limit=2,
+             scrape_options=scrape_opts
+         )
+         assert crawl_job.id is not None
+
+     def test_crawl_all_parameters(self):
+         """Test crawl with all possible parameters."""
+         scrape_opts = ScrapeOptions(
+             formats=["markdown", "html"],
+             headers={"User-Agent": "Test Bot"},
+             include_tags=["h1", "h2"],
+             exclude_tags=["nav"],
+             only_main_content=False,
+             timeout=15000,
+             wait_for=2000,
+             mobile=True,
+             skip_tls_verification=True,
+             remove_base64_images=False
+         )
+
+         crawl_job = self.client.start_crawl(
+             "https://docs.firecrawl.dev",
+             prompt="Extract all blog posts and documentation",
+             include_paths=["/blog/*", "/docs/*"],
+             exclude_paths=["/admin/*"],
+             max_discovery_depth=3,
+             ignore_sitemap=False,
+             ignore_query_parameters=True,
+             limit=5,
+             crawl_entire_domain=True,
+             allow_external_links=False,
+             allow_subdomains=True,
+             delay=1,
+             max_concurrency=2,
+             webhook="https://example.com/hook",
+             scrape_options=scrape_opts,
+             zero_data_retention=False,
+             integration="_e2e-test",
+         )
+
+         assert crawl_job.id is not None
+
+     def test_crawl_params_preview(self):
+         """Test the crawl_params_preview function."""
+         params_data = self.client.crawl_params_preview(
+             "https://docs.firecrawl.dev",
+             "Extract all blog posts and documentation"
+         )
+
+         assert params_data is not None
+         assert params_data.limit is not None or params_data.include_paths is not None or params_data.max_discovery_depth is not None
+         assert 'blog/.*' in params_data.include_paths
+         assert 'docs/.*' in params_data.include_paths
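
The crawl tests above exercise two entry points: start_crawl, which returns immediately with a job id to poll via get_crawl_status, and crawl, which blocks until completion. A sketch of the waited variant, using only parameters that appear in the tests (poll_interval and timeout appear to be seconds, judging by the values the tests pass):

    import os
    from firecrawl import Firecrawl

    client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))

    # Blocks, polling every second, up to 120 seconds total
    crawl_job = client.crawl("https://docs.firecrawl.dev", limit=3, poll_interval=1, timeout=120)
    if crawl_job.status == "completed":
        for doc in crawl_job.data:  # list of scraped documents
            print(doc.metadata)
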
firecrawl/__tests__/e2e/v2/test_extract.py
@@ -0,0 +1,55 @@
+ import os
+ from dotenv import load_dotenv
+ from firecrawl import Firecrawl
+
+ load_dotenv()
+
+ if not os.getenv("API_KEY"):
+     raise ValueError("API_KEY is not set")
+
+ if not os.getenv("API_URL"):
+     raise ValueError("API_URL is not set")
+
+
+ class TestExtractE2E:
+     """E2E tests for v2 client extract (proxied to v1)."""
+
+     def setup_method(self):
+         self.client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+     def test_extract_minimal_with_prompt(self):
+         resp = self.client.extract(
+             urls=["https://docs.firecrawl.dev"],
+             prompt="Extract the main page title",
+         )
+
+         assert hasattr(resp, "success")
+         assert resp.success is True or resp.success is False
+         # data may be None if the backend omits it; presence depends on the implementation
+
+     def test_extract_with_schema(self):
+         schema = {
+             "type": "object",
+             "properties": {
+                 "title": {"type": "string"}
+             },
+             "required": ["title"],
+         }
+
+         resp = self.client.extract(
+             urls=["https://docs.firecrawl.dev"],
+             schema=schema,
+             prompt="Extract the main page title",
+             show_sources=True,
+             enable_web_search=False,
+             integration="_e2e-test",
+         )
+
+         assert hasattr(resp, "success")
+         # If the backend includes sources, ensure the structure is a dict (do not fail if omitted)
+         if hasattr(resp, "sources") and resp.sources is not None:
+             assert isinstance(resp.sources, dict)
+
+         # Check that resp.data conforms to the requested schema
+         assert isinstance(resp.data, dict)
+         assert resp.data["title"] is not None
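
For orientation, the extract surface these tests rely on takes a list of URLs, a prompt, and an optional JSON schema, and returns the structured result in resp.data. A sketch under the same client setup as above:

    import os
    from firecrawl import Firecrawl

    client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))

    schema = {"type": "object", "properties": {"title": {"type": "string"}}, "required": ["title"]}
    resp = client.extract(
        urls=["https://docs.firecrawl.dev"],
        prompt="Extract the main page title",
        schema=schema,
    )
    if resp.success and isinstance(resp.data, dict):
        print(resp.data["title"])  # field required by the schema above
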
firecrawl/__tests__/e2e/v2/test_map.py
@@ -0,0 +1,61 @@
+ import os
+ import pytest
+ from dotenv import load_dotenv
+ from firecrawl import Firecrawl
+
+
+ load_dotenv()
+
+ if not os.getenv("API_KEY"):
+     raise ValueError("API_KEY is not set")
+
+ if not os.getenv("API_URL"):
+     raise ValueError("API_URL is not set")
+
+
+ class TestMapE2E:
+     """End-to-end tests for map functionality (v2)."""
+
+     def setup_method(self):
+         self.client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+     def test_map_minimal_request(self):
+         resp = self.client.map("https://docs.firecrawl.dev")
+
+         assert hasattr(resp, "links") and resp.links is not None
+         assert isinstance(resp.links, list)
+
+         # Basic sanity checks on at least one link
+         if len(resp.links) > 0:
+             first = resp.links[0]
+             assert hasattr(first, "url")
+             assert isinstance(first.url, str) and first.url.startswith("http")
+
+     @pytest.mark.parametrize(
+         "sitemap",
+         [
+             "only",
+             "skip",
+             "include",
+         ],
+     )
+     def test_map_with_options(self, sitemap):
+         resp = self.client.map(
+             "https://docs.firecrawl.dev",
+             search="docs",
+             include_subdomains=True,
+             limit=10,
+             sitemap=sitemap,
+             timeout=15000,
+             integration="_e2e-test",
+         )
+
+         assert hasattr(resp, "links") and isinstance(resp.links, list)
+
+         # The limit should be respected (server-side)
+         assert len(resp.links) <= 10
+
+         for link in resp.links:
+             assert hasattr(link, "url")
+             assert isinstance(link.url, str) and link.url.startswith("http")
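
The map tests show that map returns a links list of objects with a url attribute, and that the sitemap parameter accepts "only", "skip", or "include". A sketch using the options the tests pass:

    import os
    from firecrawl import Firecrawl

    client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))

    resp = client.map(
        "https://docs.firecrawl.dev",
        search="docs",
        limit=10,           # enforced server-side, per the test's assertion
        sitemap="include",  # the tests also exercise "only" and "skip"
    )
    for link in resp.links:
        print(link.url)
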
firecrawl/__tests__/e2e/v2/test_scrape.py
@@ -0,0 +1,191 @@
+ import os
+ import json as _json
+ import pytest
+ from dotenv import load_dotenv
+ from firecrawl import Firecrawl
+ from firecrawl.v2.types import Viewport, ScreenshotAction, Document
+
+ load_dotenv()
+
+ if not os.getenv("API_KEY"):
+     raise ValueError("API_KEY is not set")
+
+ if not os.getenv("API_URL"):
+     raise ValueError("API_URL is not set")
+
+
+ class TestScrapeE2E:
+     """End-to-end tests for scrape functionality (v2)."""
+
+     def setup_method(self):
+         self.client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+     def _assert_valid_document(self, doc: Document):
+         assert isinstance(doc, Document)
+         # At least one main content field should be present
+         assert (
+             (doc.markdown is not None and len(doc.markdown) > 0)
+             or (doc.html is not None and len(doc.html) > 0)
+             or (doc.raw_html is not None and len(doc.raw_html) > 0)
+             or (doc.summary is not None and len(doc.summary) > 0)
+         )
+         # Metadata should exist with a source URL or title when available
+         assert doc.metadata is not None
+
+     def test_scrape_minimal(self):
+         """Scrape a URL with minimal parameters and return a document."""
+         doc = self.client.scrape("https://docs.firecrawl.dev")
+         self._assert_valid_document(doc)
+
+     def test_scrape_with_options_markdown(self):
+         """Scrape with the simple markdown format and options passed as kwargs."""
+         doc = self.client.scrape(
+             "https://docs.firecrawl.dev",
+             formats=["markdown"],
+             only_main_content=False,
+             mobile=False,
+         )
+         self._assert_valid_document(doc)
+
+     def test_scrape_with_screenshot_action_viewport(self):
+         """Scrape with a screenshot action including a viewport passed as kwargs."""
+         viewport = Viewport(width=800, height=600)
+         action = ScreenshotAction(full_page=False, quality=80, viewport=viewport)
+         doc = self.client.scrape(
+             "https://docs.firecrawl.dev",
+             formats=["markdown"],
+             actions=[action],
+         )
+         self._assert_valid_document(doc)
+
+     @pytest.mark.parametrize("fmt,expect_field", [
+         ("markdown", "markdown"),
+         ("html", "html"),
+         ("raw_html", "raw_html"),
+         ("links", "links"),
+         ("screenshot", "screenshot"),
+         ("summary", "summary"),
+     ])
+     def test_scrape_basic_formats(self, fmt, expect_field):
+         """Verify basic format requests succeed and the expected fields are present when applicable."""
+         doc = self.client.scrape(
+             "https://docs.firecrawl.dev",
+             formats=[fmt],
+         )
+         # For formats that are not main content (links/screenshot), skip the main-content assertion
+         if expect_field not in {"links", "screenshot"}:
+             self._assert_valid_document(doc)
+         if expect_field == "markdown":
+             assert doc.markdown is not None
+         elif expect_field == "html":
+             assert doc.html is not None
+         elif expect_field == "raw_html":
+             assert doc.raw_html is not None
+         elif expect_field == "screenshot":
+             assert doc.screenshot is not None
+         elif expect_field == "links":
+             assert isinstance(doc.links, list)
+             assert len(doc.links) > 0
+
+     def test_scrape_with_json_format_object(self):
+         """Scrape with a JSON format object (requires prompt and schema)."""
+         json_schema = {
+             "type": "object",
+             "properties": {
+                 "title": {"type": "string"}
+             },
+             "required": ["title"],
+         }
+         doc = self.client.scrape(
+             "https://docs.firecrawl.dev",
+             formats=[{"type": "json", "prompt": "Extract page title", "schema": json_schema}],
+             only_main_content=True,
+         )
+         # JSON format may not include main content fields; ensure the request succeeded
+         assert isinstance(doc, Document)
+         # If the backend returns extracted JSON content, it should be present under `json`
+         # (do not fail if the backend omits it; existence depends on the implementation)
+         # if hasattr(doc, 'json'):
+         #     assert doc.json is not None
+
+     def test_scrape_invalid_url(self):
+         """Scrape should fail with empty or invalid URLs."""
+         with pytest.raises(ValueError, match="URL cannot be empty"):
+             self.client.scrape("")
+
+         with pytest.raises(ValueError, match="URL cannot be empty"):
+             self.client.scrape(" ")
+
+     def test_scrape_with_all_params(self):
+         """Comprehensive scrape using multiple formats and options."""
+         json_schema = {
+             "type": "object",
+             "properties": {"title": {"type": "string"}},
+             "required": ["title"],
+         }
+         doc = self.client.scrape(
+             "https://docs.firecrawl.dev",
+             formats=[
+                 "markdown",
+                 "raw_html",
+                 {"type": "screenshot", "full_page": False, "quality": 70},
+                 {"type": "json", "prompt": "Extract title", "schema": json_schema},
+                 {"type": "summary"},
+             ],
+             headers={"User-Agent": "E2E"},
+             include_tags=["main"],
+             exclude_tags=["nav"],
+             only_main_content=True,
+             timeout=20000,
+             wait_for=500,
+             mobile=False,
+             parsers=["pdf"],
+             actions=[],
+             skip_tls_verification=False,
+             remove_base64_images=False,
+             fast_mode=False,
+             use_mock=None,
+             block_ads=False,
+             proxy="basic",
+             max_age=0,
+             store_in_cache=False,
+             integration="_e2e-test",
+         )
+         assert isinstance(doc, Document)
+
+     def test_scrape_images_format(self):
+         """Test images format extraction."""
+         doc = self.client.scrape(
+             "https://firecrawl.dev",
+             formats=["images"]
+         )
+         assert isinstance(doc, Document)
+         assert doc.images is not None
+         assert isinstance(doc.images, list)
+         assert len(doc.images) > 0
+         # Should find Firecrawl logo/branding images
+         assert any("firecrawl" in img.lower() or "logo" in img.lower() for img in doc.images)
+
+     def test_scrape_images_with_multiple_formats(self):
+         """Test that the images format works alongside other formats."""
+         doc = self.client.scrape(
+             "https://github.com",
+             formats=["markdown", "links", "images"]
+         )
+         assert isinstance(doc, Document)
+         assert doc.markdown is not None
+         assert doc.links is not None
+         assert doc.images is not None
+         assert isinstance(doc.images, list)
+         assert len(doc.images) > 0
+
+         # The images format should find content not surfaced by the links format
+         image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico']
+         link_images = [
+             link for link in (doc.links or [])
+             if any(ext in link.lower() for ext in image_extensions)
+         ]
+
+         # Should discover additional images beyond those with obvious extensions
+         assert len(doc.images) >= len(link_images)
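
Finally, the scrape tests demonstrate that formats mixes plain strings ("markdown", "links", "images", ...) with option objects such as the JSON format (prompt plus schema). A closing sketch combining the two styles, as in test_scrape_with_json_format_object above:

    import os
    from firecrawl import Firecrawl

    client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))

    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}, "required": ["title"]}
    doc = client.scrape(
        "https://docs.firecrawl.dev",
        formats=["markdown", {"type": "json", "prompt": "Extract page title", "schema": json_schema}],
    )
    print(doc.markdown)  # structured output lands on the document's `json` field if the backend returns it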