firecrawl-py 3.1.1.tar.gz → 3.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of firecrawl-py might be problematic.

Files changed (87)
  1. {firecrawl_py-3.1.1/firecrawl_py.egg-info → firecrawl_py-3.2.1}/PKG-INFO +2 -2
  2. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__init__.py +1 -1
  3. firecrawl_py-3.2.1/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +248 -0
  4. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_search.py +10 -6
  5. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/types.py +6 -2
  6. firecrawl_py-3.2.1/firecrawl/v2/methods/aio/search.py +172 -0
  7. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/search.py +52 -43
  8. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/types.py +30 -6
  9. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/utils/http_client_async.py +1 -0
  10. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1/firecrawl_py.egg-info}/PKG-INFO +2 -2
  11. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl_py.egg-info/requires.txt +1 -1
  12. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/pyproject.toml +1 -1
  13. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/setup.py +1 -1
  14. firecrawl_py-3.1.1/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -183
  15. firecrawl_py-3.1.1/firecrawl/v2/methods/aio/search.py +0 -55
  16. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/LICENSE +0 -0
  17. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/README.md +0 -0
  18. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -0
  19. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -0
  20. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -0
  21. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -0
  22. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -0
  23. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -0
  24. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -0
  25. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/conftest.py +0 -0
  26. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_async.py +0 -0
  27. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +0 -0
  28. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_crawl.py +0 -0
  29. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_extract.py +0 -0
  30. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_map.py +0 -0
  31. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -0
  32. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_usage.py +0 -0
  33. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -0
  34. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -0
  35. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -0
  36. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -0
  37. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -0
  38. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -0
  39. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -0
  40. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -0
  41. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -0
  42. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -0
  43. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -0
  44. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -0
  45. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -0
  46. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -0
  47. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -0
  48. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -0
  49. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -0
  50. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -0
  51. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -0
  52. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -0
  53. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -0
  54. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/client.py +0 -0
  55. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/firecrawl.backup.py +0 -0
  56. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v1/__init__.py +0 -0
  57. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v1/client.py +0 -0
  58. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/__init__.py +0 -0
  59. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/client.py +0 -0
  60. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/client_async.py +0 -0
  61. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/aio/__init__.py +0 -0
  62. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/aio/batch.py +0 -0
  63. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/aio/crawl.py +0 -0
  64. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/aio/extract.py +0 -0
  65. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/aio/map.py +0 -0
  66. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/aio/scrape.py +0 -0
  67. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/aio/usage.py +0 -0
  68. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/batch.py +0 -0
  69. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/crawl.py +0 -0
  70. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/extract.py +0 -0
  71. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/map.py +0 -0
  72. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/scrape.py +0 -0
  73. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/usage.py +0 -0
  74. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/utils/__init__.py +0 -0
  75. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/utils/error_handler.py +0 -0
  76. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/utils/get_version.py +0 -0
  77. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/utils/http_client.py +0 -0
  78. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/utils/normalize.py +0 -0
  79. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/utils/validation.py +0 -0
  80. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/watcher.py +0 -0
  81. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/watcher_async.py +0 -0
  82. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl_py.egg-info/SOURCES.txt +0 -0
  83. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl_py.egg-info/dependency_links.txt +0 -0
  84. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl_py.egg-info/top_level.txt +0 -0
  85. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/setup.cfg +0 -0
  86. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/tests/test_change_tracking.py +0 -0
  87. {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/tests/test_timeout_conversion.py +0 -0
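
Taken together, the search-related changes in this release drop the single SearchResult type in favor of per-source result types (SearchResultWeb, SearchResultNews, SearchResultImages) and let callers request several sources in one call. A minimal sketch of the 3.2.1 usage implied by the updated e2e tests below (the env-var handling is illustrative, not part of the diff):

    import os
    from firecrawl import Firecrawl
    from firecrawl.types import SearchData, SearchResultWeb

    # Client setup mirroring the test fixtures in this diff (API_KEY is assumed to be set).
    firecrawl = Firecrawl(api_key=os.getenv("API_KEY"))

    results = firecrawl.search(query="firecrawl", sources=["web", "news", "images"], limit=3)
    assert isinstance(results, SearchData)
    for result in results.web or []:
        # Plain (non-scraped) entries are now typed per source.
        assert isinstance(result, SearchResultWeb)
        print(result.url, result.title)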
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: firecrawl-py
-Version: 3.1.1
+Version: 3.2.1
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/firecrawl/firecrawl
 Author: Mendable.ai
@@ -38,7 +38,7 @@ Requires-Dist: httpx
 Requires-Dist: python-dotenv
 Requires-Dist: websockets
 Requires-Dist: nest-asyncio
-Requires-Dist: pydantic
+Requires-Dist: pydantic>=2.0
 Requires-Dist: aiohttp
 Dynamic: author
 Dynamic: home-page
firecrawl/__init__.py
@@ -17,7 +17,7 @@ from .v1 import (
     V1ChangeTrackingOptions,
 )
 
-__version__ = "3.1.1"
+__version__ = "3.2.1"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
firecrawl/__tests__/e2e/v2/aio/test_aio_search.py (new file)
@@ -0,0 +1,248 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawl
+from firecrawl.types import (
+    SearchData,
+    Document,
+    ScrapeOptions,
+    ScrapeFormats,
+    SearchResultWeb,
+    SearchResultNews,
+    SearchResultImages,
+)
+
+load_dotenv()
+
+firecrawl = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+def _collect_texts(entries):
+    texts = []
+    for r in entries or []:
+        title = getattr(r, 'title', None) if hasattr(r, 'title') else None
+        desc = getattr(r, 'description', None) if hasattr(r, 'description') else None
+        if title:
+            texts.append(str(title).lower())
+        if desc:
+            texts.append(str(desc).lower())
+    return texts
+
+def _is_document(entry) -> bool:
+    try:
+        from firecrawl.v2.types import Document
+        return isinstance(entry, Document) or \
+            hasattr(entry, 'markdown') or \
+            hasattr(entry, 'html') or \
+            hasattr(entry, 'raw_html') or \
+            hasattr(entry, 'json') or \
+            hasattr(entry, 'screenshot') or \
+            hasattr(entry, 'change_tracking') or \
+            hasattr(entry, 'summary')
+    except Exception:
+        return hasattr(entry, 'markdown') or \
+            hasattr(entry, 'html') or \
+            hasattr(entry, 'raw_html') or \
+            hasattr(entry, 'json') or \
+            hasattr(entry, 'screenshot') or \
+            hasattr(entry, 'change_tracking') or \
+            hasattr(entry, 'summary')
+
+@pytest.mark.asyncio
+async def test_async_search_minimal_request():
+    results = await firecrawl.search(
+        query="What is the capital of France?"
+    )
+    assert isinstance(results, SearchData)
+    assert hasattr(results, 'web')
+    assert results.web is not None
+    assert len(results.web) > 0
+    assert hasattr(results, 'news')
+    assert results.news is None
+    assert hasattr(results, 'images')
+    assert results.images is None
+
+    for result in results.web:
+        assert isinstance(result, SearchResultWeb)
+        assert hasattr(result, 'url')
+        assert hasattr(result, 'title')
+        assert hasattr(result, 'description')
+        assert result.url.startswith('http')
+        assert result.title is not None
+        assert result.description is not None
+
+    all_text = ' '.join(_collect_texts(results.web))
+    assert 'paris' in all_text
+
+    assert results.news is None
+    assert results.images is None
+
+@pytest.mark.asyncio
+async def test_async_search_with_sources():
+    results = await firecrawl.search(
+        query="firecrawl",
+        sources=["web", "news", "images"],
+        limit=3
+    )
+    assert isinstance(results, SearchData)
+    assert results.web is not None
+    assert len(results.web) <= 3
+    assert isinstance(results.web[0], SearchResultWeb)
+
+    if results.news is not None:
+        assert len(results.news) <= 3
+        assert isinstance(results.news[0], SearchResultNews)
+
+    if results.images is not None:
+        assert len(results.images) <= 3
+        assert isinstance(results.images[0], SearchResultImages)
+
+    web_titles = [result.title.lower() for result in results.web]
+    web_descriptions = [result.description.lower() for result in results.web]
+    all_web_text = ' '.join(web_titles + web_descriptions)
+    assert 'firecrawl' in all_web_text
+
+@pytest.mark.asyncio
+async def test_async_search_result_structure():
+    results = await firecrawl.search(
+        query="test query",
+        limit=1
+    )
+    if results.web and len(results.web) > 0:
+        result = results.web[0]
+        assert hasattr(result, 'url')
+        assert hasattr(result, 'title')
+        assert hasattr(result, 'description')
+        assert isinstance(result.url, str)
+        assert isinstance(result.title, str) or result.title is None
+        assert isinstance(result.description, str) or result.description is None
+        assert result.url.startswith('http')
+
+@pytest.mark.asyncio
+async def test_async_search_all_parameters():
+    from firecrawl.types import ScrapeOptions, Location, WaitAction
+    schema = {
+        "type": "object",
+        "properties": {
+            "title": {"type": "string"},
+            "description": {"type": "string"},
+            "url": {"type": "string"}
+        },
+        "required": ["title", "description"]
+    }
+    results = await firecrawl.search(
+        query="artificial intelligence",
+        sources=[
+            {"type": "web"},
+            {"type": "news"}
+        ],
+        limit=3,
+        tbs="qdr:m",
+        location="US",
+        ignore_invalid_urls=True,
+        timeout=60000,
+        scrape_options=ScrapeOptions(
+            formats=[
+                "markdown",
+                "html",
+                {
+                    "type": "json",
+                    "prompt": "Extract the title and description from the page",
+                    "schema": schema
+                },
+                {"type": "summary"}
+            ],
+            headers={"User-Agent": "Firecrawl-Test/1.0"},
+            include_tags=["h1", "h2", "p"],
+            exclude_tags=["nav", "footer"],
+            only_main_content=True,
+            wait_for=2000,
+            mobile=False,
+            skip_tls_verification=False,
+            remove_base64_images=True,
+            block_ads=True,
+            proxy="basic",
+            max_age=3600000,
+            store_in_cache=True,
+            location=Location(
+                country="US",
+                languages=["en"]
+            ),
+            actions=[
+                WaitAction(milliseconds=1000)
+            ]
+        )
+    )
+    assert isinstance(results, SearchData)
+    assert hasattr(results, 'web')
+    assert hasattr(results, 'news')
+    assert hasattr(results, 'images')
+    assert results.web is not None
+    assert len(results.web) <= 3
+
+    non_doc_entries = [r for r in (results.web or []) if not _is_document(r)]
+    if non_doc_entries:
+        all_web_text = ' '.join(_collect_texts(non_doc_entries))
+        ai_terms = ['artificial', 'intelligence', 'ai', 'machine', 'learning']
+        assert any(term in all_web_text for term in ai_terms)
+
+    for result in results.web:
+        assert isinstance(result, (SearchResultWeb, Document))
+        if isinstance(result, Document):
+            assert (result.markdown is not None) or (result.html is not None)
+        else:
+            assert hasattr(result, 'url')
+            assert isinstance(result.url, str) and result.url.startswith('http')
+
+    if results.news is not None:
+        assert len(results.news) <= 3
+        for result in results.news:
+            assert isinstance(result, (SearchResultNews, Document))
+            if isinstance(result, Document):
+                assert (result.markdown is not None) or (result.html is not None)
+            else:
+                assert hasattr(result, 'url')
+                assert isinstance(result.url, str) and result.url.startswith('http')
+
+    assert results.images is None
+
+@pytest.mark.asyncio
+async def test_async_search_formats_flexibility():
+    # Test with list format
+    results1 = await firecrawl.search(
+        query="python programming",
+        limit=1,
+        scrape_options=ScrapeOptions(
+            formats=["markdown"]
+        )
+    )
+    # Test with ScrapeFormats object
+    results2 = await firecrawl.search(
+        query="python programming",
+        limit=1,
+        scrape_options=ScrapeOptions(
+            formats=ScrapeFormats(markdown=True)
+        )
+    )
+    assert isinstance(results1, SearchData)
+    assert isinstance(results2, SearchData)
+    assert results1.web is not None
+    assert results2.web is not None
+
+@pytest.mark.asyncio
+async def test_async_search_with_json_format_object():
+    json_schema = {
+        "type": "object",
+        "properties": {
+            "title": {"type": "string"}
+        },
+        "required": ["title"],
+    }
+    results = await firecrawl.search(
+        query="site:docs.firecrawl.dev",
+        limit=1,
+        scrape_options=ScrapeOptions(
+            formats=[{"type": "json", "prompt": "Extract page title", "schema": json_schema}]
+        ),
+    )
+    assert isinstance(results, SearchData)
+    assert results.web is not None and len(results.web) >= 0
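
For reference, the async client exercised by this new test module can be driven standalone roughly as follows (a sketch assuming API_KEY and API_URL are set in the environment, as in the tests above; not part of the diff):

    import asyncio
    import os
    from firecrawl import AsyncFirecrawl

    async def main():
        # Same constructor and search signature as the e2e tests in this release.
        client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
        results = await client.search(query="firecrawl", sources=["web", "news", "images"], limit=3)
        for item in results.web or []:
            print(item.url, item.title)

    asyncio.run(main())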
firecrawl/__tests__/e2e/v2/test_search.py
@@ -1,7 +1,7 @@
 from firecrawl import Firecrawl
 import os
 from dotenv import load_dotenv
-from firecrawl.types import SearchData, SearchResult, Document, ScrapeFormats, ScrapeOptions
+from firecrawl.types import SearchData, Document, ScrapeOptions, SearchResultWeb, SearchResultNews, SearchResultImages
 
 load_dotenv()
 
@@ -53,7 +53,7 @@ def test_search_minimal_request():
     assert results.images is None
 
     for result in results.web:
-        assert isinstance(result, SearchResult)
+        assert isinstance(result, SearchResultWeb)
         assert hasattr(result, 'url')
         assert hasattr(result, 'title')
         assert hasattr(result, 'description')
@@ -73,7 +73,7 @@ def test_search_with_sources():
     """Test search with specific sources."""
     results = firecrawl.search(
        query="firecrawl",
-        sources=["web", "news"],
+        sources=["web", "news", "images"],
        limit=3
     )
 
@@ -81,11 +81,15 @@ def test_search_with_sources():
 
     assert results.web is not None
     assert len(results.web) <= 3
+    assert isinstance(results.web[0], SearchResultWeb)
 
     if results.news is not None:
         assert len(results.news) <= 3
+        assert isinstance(results.news[0], SearchResultNews)
 
-    assert results.images is None
+    if results.images is not None:
+        assert len(results.images) <= 3
+        assert isinstance(results.images[0], SearchResultImages)
 
     web_titles = [result.title.lower() for result in results.web]
     web_descriptions = [result.description.lower() for result in results.web]
@@ -193,7 +197,7 @@ def test_search_all_parameters():
 
     # Test that each result has proper structure
     for result in results.web:
-        assert isinstance(result, (SearchResult, Document))
+        assert isinstance(result, (SearchResultWeb, Document))
         if isinstance(result, Document):
             # Document path: ensure content present
             assert (result.markdown is not None) or (result.html is not None)
@@ -206,7 +210,7 @@ def test_search_all_parameters():
     if results.news is not None:
         assert len(results.news) <= 3
         for result in results.news:
-            assert isinstance(result, (SearchResult, Document))
+            assert isinstance(result, (SearchResultNews, Document))
             if isinstance(result, Document):
                 assert (result.markdown is not None) or (result.html is not None)
             else:
firecrawl/types.py
@@ -48,7 +48,9 @@ from .v2.types (
     JsonFormat,
     FormatOption,
     SearchRequest,
-    SearchResult,
+    SearchResultWeb,
+    SearchResultNews,
+    SearchResultImages,
     SearchData,
     SearchResponse,
 
@@ -124,7 +126,9 @@ __all__ = [
     'JsonFormat',
     'FormatOption',
     'SearchRequest',
-    'SearchResult',
+    'SearchResultWeb',
+    'SearchResultNews',
+    'SearchResultImages',
     'SearchData',
     'SearchResponse',
 
firecrawl/v2/methods/aio/search.py (new file)
@@ -0,0 +1,172 @@
+import re
+from typing import Dict, Any, Union, List, TypeVar, Type
+from ...types import (
+    SearchRequest,
+    SearchData,
+    Document,
+    SearchResultWeb,
+    SearchResultNews,
+    SearchResultImages,
+)
+from ...utils.http_client_async import AsyncHttpClient
+from ...utils.error_handler import handle_response_error
+from ...utils.validation import validate_scrape_options, prepare_scrape_options
+
+T = TypeVar("T")
+
+async def search(
+    client: AsyncHttpClient,
+    request: SearchRequest
+) -> SearchData:
+    """
+    Async search for documents.
+
+    Args:
+        client: Async HTTP client instance
+        request: Search request
+
+    Returns:
+        SearchData with search results grouped by source type
+
+    Raises:
+        FirecrawlError: If the search operation fails
+    """
+    request_data = _prepare_search_request(request)
+    try:
+        response = await client.post("/v2/search", request_data)
+        if response.status_code != 200:
+            handle_response_error(response, "search")
+        response_data = response.json()
+        if not response_data.get("success"):
+            handle_response_error(response, "search")
+        data = response_data.get("data", {}) or {}
+        out = SearchData()
+        if "web" in data:
+            out.web = _transform_array(data["web"], SearchResultWeb)
+        if "news" in data:
+            out.news = _transform_array(data["news"], SearchResultNews)
+        if "images" in data:
+            out.images = _transform_array(data["images"], SearchResultImages)
+        return out
+    except Exception as err:
+        if hasattr(err, "response"):
+            handle_response_error(getattr(err, "response"), "search")
+        raise err
+
+def _transform_array(arr: List[Any], result_type: Type[T]) -> List[Union[T, Document]]:
+    """
+    Transforms an array of items into a list of result_type or Document.
+    If the item dict contains any of the special keys, it is treated as a Document.
+    Otherwise, it is treated as result_type.
+    If the item is not a dict, it is wrapped as result_type with url=item.
+    """
+    results: List[Union[T, Document]] = []
+    for item in arr:
+        if item and isinstance(item, dict):
+            if (
+                "markdown" in item or
+                "html" in item or
+                "rawHtml" in item or
+                "links" in item or
+                "screenshot" in item or
+                "changeTracking" in item or
+                "summary" in item or
+                "json" in item
+            ):
+                results.append(Document(**item))
+            else:
+                results.append(result_type(**item))
+        else:
+            results.append(result_type(url=item))
+    return results
+
+def _validate_search_request(request: SearchRequest) -> SearchRequest:
+    """
+    Validate and normalize search request.
+
+    Args:
+        request: Search request to validate
+
+    Returns:
+        Validated request
+
+    Raises:
+        ValueError: If request is invalid
+    """
+    if not request.query or not request.query.strip():
+        raise ValueError("Query cannot be empty")
+
+    if request.limit is not None:
+        if request.limit <= 0:
+            raise ValueError("Limit must be positive")
+        if request.limit > 100:
+            raise ValueError("Limit cannot exceed 100")
+
+    if request.timeout is not None:
+        if request.timeout <= 0:
+            raise ValueError("Timeout must be positive")
+        if request.timeout > 300000:
+            raise ValueError("Timeout cannot exceed 300000ms (5 minutes)")
+
+    if request.sources is not None:
+        valid_sources = {"web", "news", "images"}
+        for source in request.sources:
+            if isinstance(source, str):
+                if source not in valid_sources:
+                    raise ValueError(f"Invalid source type: {source}. Valid types: {valid_sources}")
+            elif hasattr(source, 'type'):
+                if source.type not in valid_sources:
+                    raise ValueError(f"Invalid source type: {source.type}. Valid types: {valid_sources}")
+
+    if request.location is not None:
+        if not isinstance(request.location, str) or len(request.location.strip()) == 0:
+            raise ValueError("Location must be a non-empty string")
+
+    if request.tbs is not None:
+        valid_tbs_values = {
+            "qdr:h", "qdr:d", "qdr:w", "qdr:m", "qdr:y",
+            "d", "w", "m", "y"
+        }
+        if request.tbs in valid_tbs_values:
+            pass
+        elif request.tbs.startswith("cdr:"):
+            custom_date_pattern = r"^cdr:1,cd_min:\d{1,2}/\d{1,2}/\d{4},cd_max:\d{1,2}/\d{1,2}/\d{4}$"
+            if not re.match(custom_date_pattern, request.tbs):
+                raise ValueError(f"Invalid custom date range format: {request.tbs}. Expected format: cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY")
+        else:
+            raise ValueError(f"Invalid tbs value: {request.tbs}. Valid values: {valid_tbs_values} or custom date range format: cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY")
+
+    if request.scrape_options is not None:
+        validate_scrape_options(request.scrape_options)
+
+    return request
+
+def _prepare_search_request(request: SearchRequest) -> Dict[str, Any]:
+    """
+    Prepare a search request payload.
+
+    Args:
+        request: Search request
+
+    Returns:
+        Request payload dictionary
+    """
+    validated_request = _validate_search_request(request)
+    data = validated_request.model_dump(exclude_none=True, by_alias=True)
+
+    if "limit" not in data and validated_request.limit is not None:
+        data["limit"] = validated_request.limit
+    if "timeout" not in data and validated_request.timeout is not None:
+        data["timeout"] = validated_request.timeout
+
+    if validated_request.ignore_invalid_urls is not None:
+        data["ignoreInvalidURLs"] = validated_request.ignore_invalid_urls
+        data.pop("ignore_invalid_urls", None)
+
+    if validated_request.scrape_options is not None:
+        scrape_data = prepare_scrape_options(validated_request.scrape_options)
+        if scrape_data:
+            data["scrapeOptions"] = scrape_data
+        data.pop("scrape_options", None)
+
+    return data
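
The dispatch rule in _transform_array above reads: dicts carrying scrape output keys (markdown, html, rawHtml, links, screenshot, changeTracking, summary, json) become Document, any other dict becomes the per-source result type, and bare strings are wrapped as a URL. An illustrative call with made-up data (assuming the import paths introduced by this diff):

    from firecrawl.v2.methods.aio.search import _transform_array
    from firecrawl.v2.types import Document, SearchResultWeb

    items = [
        {"url": "https://example.com", "title": "Example", "description": "..."},  # plain result dict
        {"markdown": "# Example"},                                                  # carries a scrape key
        "https://example.com/blog",                                                 # bare URL string
    ]
    out = _transform_array(items, SearchResultWeb)
    # out[0] -> SearchResultWeb, out[1] -> Document, out[2] -> SearchResultWeb(url=...)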
firecrawl/v2/methods/search.py
@@ -3,11 +3,12 @@ Search functionality for Firecrawl v2 API.
 """
 
 import re
-from typing import Optional, Dict, Any, Union
-from ..types import SearchRequest, SearchData, SearchResult, Document
+from typing import Dict, Any, Union, List, TypeVar, Type
+from ..types import SearchRequest, SearchData, Document, SearchResultWeb, SearchResultNews, SearchResultImages
 from ..utils.normalize import normalize_document_input
 from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
 
+T = TypeVar("T")
 
 def search(
     client: HttpClient,
@@ -27,48 +28,56 @@ def search(
         FirecrawlError: If the search operation fails
     """
     request_data = _prepare_search_request(request)
-
-    response = client.post("/v2/search", request_data)
-
-    if not response.ok:
-        handle_response_error(response, "search")
-
-    response_data = response.json()
-
-    if not response_data.get("success"):
-        # Handle error case
-        error_msg = response_data.get("error", "Unknown error occurred")
-        raise Exception(f"Search failed: {error_msg}")
-
-    data = response_data.get("data", {})
-    search_data = SearchData()
-
-    for source_type, source_documents in data.items():
-        if isinstance(source_documents, list):
-            results = []
-            for doc_data in source_documents:
-                if isinstance(doc_data, dict):
-                    # If page scraping options were provided, API returns full Document objects
-                    if request.scrape_options is not None and any(
-                        key in doc_data for key in ['markdown', 'html', 'rawHtml', 'links', 'summary', 'screenshot', 'changeTracking']
-                    ):
-                        normalized = normalize_document_input(doc_data)
-                        results.append(Document(**normalized))
-                    else:
-                        # Minimal search result shape
-                        results.append(SearchResult(
-                            url=doc_data.get('url', ''),
-                            title=doc_data.get('title'),
-                            description=doc_data.get('description')
-                        ))
-                elif isinstance(doc_data, str):
-                    results.append(SearchResult(url=doc_data))
-
-            if hasattr(search_data, source_type):
-                setattr(search_data, source_type, results)
-
-    return search_data
+    try:
+        response = client.post("/v2/search", request_data)
+        if response.status_code != 200:
+            handle_response_error(response, "search")
+        response_data = response.json()
+        if not response_data.get("success"):
+            handle_response_error(response, "search")
+        data = response_data.get("data", {}) or {}
+        out = SearchData()
+        if "web" in data:
+            out.web = _transform_array(data["web"], SearchResultWeb)
+        if "news" in data:
+            out.news = _transform_array(data["news"], SearchResultNews)
+        if "images" in data:
+            out.images = _transform_array(data["images"], SearchResultImages)
+        return out
+    except Exception as err:
+        # If the error is an HTTP error from requests, handle it
+        # (simulate isAxiosError by checking for requests' HTTPError or Response)
+        if hasattr(err, "response"):
+            handle_response_error(getattr(err, "response"), "search")
+        raise err
 
+def _transform_array(arr: List[Any], result_type: Type[T]) -> List[Union[T, 'Document']]:
+    """
+    Transforms an array of items into a list of result_type or Document.
+    If the item dict contains any of the special keys, it is treated as a Document.
+    Otherwise, it is treated as result_type.
+    If the item is not a dict, it is wrapped as result_type with url=item.
+    """
+    results: List[Union[T, 'Document']] = []
+    for item in arr:
+        if item and isinstance(item, dict):
+            if (
+                "markdown" in item or
+                "html" in item or
+                "rawHtml" in item or
+                "links" in item or
+                "screenshot" in item or
+                "changeTracking" in item or
+                "summary" in item or
+                "json" in item
+            ):
+                results.append(Document(**item))
+            else:
+                results.append(result_type(**item))
+        else:
+            # For non-dict items, assume it's a URL and wrap in result_type
+            results.append(result_type(url=item))
+    return results
 
 def _validate_search_request(request: SearchRequest) -> SearchRequest:
     """