firecrawl 3.1.0__py3-none-any.whl → 3.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl might be problematic.
- firecrawl/__init__.py +2 -2
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +190 -125
- firecrawl/__tests__/e2e/v2/test_search.py +10 -6
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +47 -17
- firecrawl/types.py +6 -2
- firecrawl/v2/methods/aio/search.py +162 -45
- firecrawl/v2/methods/search.py +64 -47
- firecrawl/v2/types.py +30 -6
- firecrawl/v2/utils/http_client_async.py +1 -0
- {firecrawl-3.1.0.dist-info → firecrawl-3.2.1.dist-info}/METADATA +7 -3
- {firecrawl-3.1.0.dist-info → firecrawl-3.2.1.dist-info}/RECORD +14 -14
- {firecrawl-3.1.0.dist-info → firecrawl-3.2.1.dist-info}/WHEEL +1 -1
- {firecrawl-3.1.0.dist-info → firecrawl-3.2.1.dist-info/licenses}/LICENSE +0 -0
- {firecrawl-3.1.0.dist-info → firecrawl-3.2.1.dist-info}/top_level.txt +0 -0
firecrawl/__init__.py
CHANGED
@@ -17,7 +17,7 @@ from .v1 import (
     V1ChangeTrackingOptions,
 )
 
-__version__ = "3.1.0"
+__version__ = "3.2.1"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -84,4 +84,4 @@ __all__ = [
     'V1JsonConfig',
     'V1ScrapeOptions',
     'V1ChangeTrackingOptions',
-]
+]
firecrawl/__tests__/e2e/v2/aio/test_aio_search.py
CHANGED
@@ -2,8 +2,19 @@ import os
 import pytest
 from dotenv import load_dotenv
 from firecrawl import AsyncFirecrawl
-from firecrawl.
+from firecrawl.types import (
+    SearchData,
+    Document,
+    ScrapeOptions,
+    ScrapeFormats,
+    SearchResultWeb,
+    SearchResultNews,
+    SearchResultImages,
+)
 
+load_dotenv()
+
+firecrawl = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
 
 def _collect_texts(entries):
     texts = []
@@ -36,148 +47,202 @@ def _is_document(entry) -> bool:
         hasattr(entry, 'change_tracking') or \
         hasattr(entry, 'summary')
 
+@pytest.mark.asyncio
+async def test_async_search_minimal_request():
+    results = await firecrawl.search(
+        query="What is the capital of France?"
+    )
+    assert isinstance(results, SearchData)
+    assert hasattr(results, 'web')
+    assert results.web is not None
+    assert len(results.web) > 0
+    assert hasattr(results, 'news')
+    assert results.news is None
+    assert hasattr(results, 'images')
+    assert results.images is None
+
+    for result in results.web:
+        assert isinstance(result, SearchResultWeb)
+        assert hasattr(result, 'url')
+        assert hasattr(result, 'title')
+        assert hasattr(result, 'description')
+        assert result.url.startswith('http')
+        assert result.title is not None
+        assert result.description is not None
+
+    all_text = ' '.join(_collect_texts(results.web))
+    assert 'paris' in all_text
+
+    assert results.news is None
+    assert results.images is None
 
- …
+@pytest.mark.asyncio
+async def test_async_search_with_sources():
+    results = await firecrawl.search(
+        query="firecrawl",
+        sources=["web", "news", "images"],
+        limit=3
+    )
+    assert isinstance(results, SearchData)
+    assert results.web is not None
+    assert len(results.web) <= 3
+    assert isinstance(results.web[0], SearchResultWeb)
 
+    if results.news is not None:
+        assert len(results.news) <= 3
+        assert isinstance(results.news[0], SearchResultNews)
 
- …
-    data = await client.search("What is the capital of France?")
-    # Assert sections like sync tests
-    assert hasattr(data, "web")
-    assert hasattr(data, "news")
-    assert hasattr(data, "images")
-    assert data.web is not None
-    assert len(data.web) > 0
-    titles = [getattr(r, "title", None) for r in data.web]
-    descs = [getattr(r, "description", None) for r in data.web]
-    all_text = " ".join([t.lower() for t in titles if t] + [d.lower() for d in descs if d])
-    assert "paris" in all_text
-    assert data.news is None
-    assert data.images is None
+    if results.images is not None:
+        assert len(results.images) <= 3
+        assert isinstance(results.images[0], SearchResultImages)
 
+    web_titles = [result.title.lower() for result in results.web]
+    web_descriptions = [result.description.lower() for result in results.web]
+    all_web_text = ' '.join(web_titles + web_descriptions)
+    assert 'firecrawl' in all_web_text
 
 @pytest.mark.asyncio
-async def
- …
-    assert
- …
-    assert
- …
+async def test_async_search_result_structure():
+    results = await firecrawl.search(
+        query="test query",
+        limit=1
+    )
+    if results.web and len(results.web) > 0:
+        result = results.web[0]
+        assert hasattr(result, 'url')
+        assert hasattr(result, 'title')
+        assert hasattr(result, 'description')
+        assert isinstance(result.url, str)
+        assert isinstance(result.title, str) or result.title is None
+        assert isinstance(result.description, str) or result.description is None
+        assert result.url.startswith('http')
 
 @pytest.mark.asyncio
-async def
- …
-        "
- …
+async def test_async_search_all_parameters():
+    from firecrawl.types import ScrapeOptions, Location, WaitAction
+    schema = {
+        "type": "object",
+        "properties": {
+            "title": {"type": "string"},
+            "description": {"type": "string"},
+            "url": {"type": "string"}
+        },
+        "required": ["title", "description"]
+    }
+    results = await firecrawl.search(
+        query="artificial intelligence",
+        sources=[
+            {"type": "web"},
+            {"type": "news"}
+        ],
         limit=3,
-        tbs="qdr:
+        tbs="qdr:m",
         location="US",
-        ignore_invalid_urls=
-        timeout=
-        scrape_options=
- …
+        ignore_invalid_urls=True,
+        timeout=60000,
+        scrape_options=ScrapeOptions(
+            formats=[
+                "markdown",
+                "html",
+                {
+                    "type": "json",
+                    "prompt": "Extract the title and description from the page",
+                    "schema": schema
+                },
+                {"type": "summary"}
+            ],
+            headers={"User-Agent": "Firecrawl-Test/1.0"},
+            include_tags=["h1", "h2", "p"],
+            exclude_tags=["nav", "footer"],
+            only_main_content=True,
+            wait_for=2000,
+            mobile=False,
+            skip_tls_verification=False,
+            remove_base64_images=True,
+            block_ads=True,
+            proxy="basic",
+            max_age=3600000,
+            store_in_cache=True,
+            location=Location(
+                country="US",
+                languages=["en"]
+            ),
+            actions=[
+                WaitAction(milliseconds=1000)
+            ]
+        )
     )
- …
-    assert
-    assert hasattr(
-    assert
-    assert
- …
-    assert
- …
+    assert isinstance(results, SearchData)
+    assert hasattr(results, 'web')
+    assert hasattr(results, 'news')
+    assert hasattr(results, 'images')
+    assert results.web is not None
+    assert len(results.web) <= 3
+
+    non_doc_entries = [r for r in (results.web or []) if not _is_document(r)]
+    if non_doc_entries:
+        all_web_text = ' '.join(_collect_texts(non_doc_entries))
+        ai_terms = ['artificial', 'intelligence', 'ai', 'machine', 'learning']
+        assert any(term in all_web_text for term in ai_terms)
+
+    for result in results.web:
+        assert isinstance(result, (SearchResultWeb, Document))
+        if isinstance(result, Document):
+            assert (result.markdown is not None) or (result.html is not None)
         else:
-            assert hasattr(
-            assert isinstance(
- …
+            assert hasattr(result, 'url')
+            assert isinstance(result.url, str) and result.url.startswith('http')
+
+    if results.news is not None:
+        assert len(results.news) <= 3
+        for result in results.news:
+            assert isinstance(result, (SearchResultNews, Document))
+            if isinstance(result, Document):
+                assert (result.markdown is not None) or (result.html is not None)
             else:
-                assert
- …
-@pytest.mark.asyncio
-async def test_async_search_minimal_content_check():
-    """Stronger assertion similar to sync: content check on a known query."""
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    data = await client.search("What is the capital of France?")
-    assert hasattr(data, "web") and data.web is not None
-    non_doc = [r for r in (data.web or []) if not _is_document(r)]
-    if non_doc:
-        combined = " ".join(_collect_texts(non_doc))
-        assert "paris" in combined
-
-
-@pytest.mark.asyncio
-async def test_async_search_result_structure():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    data = await client.search("test query", limit=1)
-    if data.web and len(data.web) > 0:
-        result = data.web[0]
-        assert hasattr(result, "url")
-        assert hasattr(result, "title")
-        assert hasattr(result, "description")
-        assert isinstance(result.url, str) and result.url.startswith("http")
-        assert isinstance(getattr(result, "title", None), (str, type(None)))
-        assert isinstance(getattr(result, "description", None), (str, type(None)))
+            assert hasattr(result, 'url')
+            assert isinstance(result.url, str) and result.url.startswith('http')
 
+    assert results.images is None
 
 @pytest.mark.asyncio
 async def test_async_search_formats_flexibility():
- …
+    # Test with list format
+    results1 = await firecrawl.search(
+        query="python programming",
+        limit=1,
+        scrape_options=ScrapeOptions(
+            formats=["markdown"]
+        )
+    )
+    # Test with ScrapeFormats object
+    results2 = await firecrawl.search(
+        query="python programming",
+        limit=1,
+        scrape_options=ScrapeOptions(
+            formats=ScrapeFormats(markdown=True)
+        )
+    )
+    assert isinstance(results1, SearchData)
+    assert isinstance(results2, SearchData)
+    assert results1.web is not None
+    assert results2.web is not None
 
 @pytest.mark.asyncio
-async def
- …
+async def test_async_search_with_json_format_object():
+    json_schema = {
+        "type": "object",
+        "properties": {
+            "title": {"type": "string"}
+        },
+        "required": ["title"],
+    }
+    results = await firecrawl.search(
+        query="site:docs.firecrawl.dev",
         limit=1,
-        scrape_options=
+        scrape_options=ScrapeOptions(
+            formats=[{"type": "json", "prompt": "Extract page title", "schema": json_schema}]
+        ),
     )
-    assert
- …
+    assert isinstance(results, SearchData)
+    assert results.web is not None and len(results.web) >= 0
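Both rewritten search test suites build a module-level client from environment variables loaded via python-dotenv and assert against the new typed result models. A minimal standalone sketch of the same pattern outside pytest (the API_KEY/API_URL variable names come from the tests above; the query and printed fields are illustrative):

```python
import asyncio
import os

from dotenv import load_dotenv
from firecrawl import AsyncFirecrawl
from firecrawl.types import SearchResultWeb

load_dotenv()  # expects API_KEY (and optionally API_URL) in the environment, as in the tests

async def main() -> None:
    firecrawl = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    results = await firecrawl.search(query="What is the capital of France?", limit=3)
    # Without scrape_options, web entries are plain typed results rather than Documents
    for result in results.web or []:
        if isinstance(result, SearchResultWeb):
            print(result.url, result.title)

asyncio.run(main())
```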
firecrawl/__tests__/e2e/v2/test_search.py
CHANGED
@@ -1,7 +1,7 @@
 from firecrawl import Firecrawl
 import os
 from dotenv import load_dotenv
-from firecrawl.types import SearchData,
+from firecrawl.types import SearchData, Document, ScrapeOptions, SearchResultWeb, SearchResultNews, SearchResultImages
 
 load_dotenv()
 
@@ -53,7 +53,7 @@ def test_search_minimal_request():
     assert results.images is None
 
     for result in results.web:
-        assert isinstance(result,
+        assert isinstance(result, SearchResultWeb)
         assert hasattr(result, 'url')
         assert hasattr(result, 'title')
         assert hasattr(result, 'description')
@@ -73,7 +73,7 @@ def test_search_with_sources():
     """Test search with specific sources."""
     results = firecrawl.search(
         query="firecrawl",
-        sources=["web", "news"],
+        sources=["web", "news", "images"],
         limit=3
     )
 
@@ -81,11 +81,15 @@ def test_search_with_sources():
 
     assert results.web is not None
     assert len(results.web) <= 3
+    assert isinstance(results.web[0], SearchResultWeb)
 
     if results.news is not None:
         assert len(results.news) <= 3
+        assert isinstance(results.news[0], SearchResultNews)
 
- …
+    if results.images is not None:
+        assert len(results.images) <= 3
+        assert isinstance(results.images[0], SearchResultImages)
 
     web_titles = [result.title.lower() for result in results.web]
     web_descriptions = [result.description.lower() for result in results.web]
@@ -193,7 +197,7 @@ def test_search_all_parameters():
 
     # Test that each result has proper structure
     for result in results.web:
-        assert isinstance(result, (
+        assert isinstance(result, (SearchResultWeb, Document))
         if isinstance(result, Document):
             # Document path: ensure content present
             assert (result.markdown is not None) or (result.html is not None)
@@ -206,7 +210,7 @@ def test_search_all_parameters():
     if results.news is not None:
         assert len(results.news) <= 3
         for result in results.news:
-            assert isinstance(result, (
+            assert isinstance(result, (SearchResultNews, Document))
             if isinstance(result, Document):
                 assert (result.markdown is not None) or (result.html is not None)
             else:
firecrawl/__tests__/unit/v2/methods/test_search_validation.py
CHANGED
@@ -11,7 +11,7 @@ class TestSearchValidation:
         request = SearchRequest(query="")
         with pytest.raises(ValueError, match="Query cannot be empty"):
             _validate_search_request(request)
-
+
         request = SearchRequest(query="   ")
         with pytest.raises(ValueError, match="Query cannot be empty"):
             _validate_search_request(request)
@@ -22,12 +22,12 @@ class TestSearchValidation:
         request = SearchRequest(query="test", limit=0)
         with pytest.raises(ValueError, match="Limit must be positive"):
             _validate_search_request(request)
-
+
         # Negative limit
         request = SearchRequest(query="test", limit=-1)
         with pytest.raises(ValueError, match="Limit must be positive"):
             _validate_search_request(request)
-
+
         # Too high limit
         request = SearchRequest(query="test", limit=101)
         with pytest.raises(ValueError, match="Limit cannot exceed 100"):
@@ -39,12 +39,12 @@ class TestSearchValidation:
         request = SearchRequest(query="test", timeout=0)
         with pytest.raises(ValueError, match="Timeout must be positive"):
             _validate_search_request(request)
-
+
         # Negative timeout
         request = SearchRequest(query="test", timeout=-1000)
         with pytest.raises(ValueError, match="Timeout must be positive"):
             _validate_search_request(request)
-
+
         # Too high timeout
         request = SearchRequest(query="test", timeout=300001)
         with pytest.raises(ValueError, match="Timeout cannot exceed 300000ms"):
@@ -56,12 +56,12 @@ class TestSearchValidation:
         request = SearchRequest(query="test", sources=["invalid_source"])
         with pytest.raises(ValueError, match="Invalid source type"):
             _validate_search_request(request)
-
+
         # Invalid object source
         request = SearchRequest(query="test", sources=[Source(type="invalid_source")])
         with pytest.raises(ValueError, match="Invalid source type"):
             _validate_search_request(request)
-
+
         # Mixed valid/invalid sources
         request = SearchRequest(query="test", sources=["web", "invalid_source"])
         with pytest.raises(ValueError, match="Invalid source type"):
@@ -73,7 +73,7 @@ class TestSearchValidation:
         request = SearchRequest(query="test", location="")
         with pytest.raises(ValueError, match="Location must be a non-empty string"):
             _validate_search_request(request)
-
+
         # Whitespace location
         request = SearchRequest(query="test", location="   ")
         with pytest.raises(ValueError, match="Location must be a non-empty string"):
@@ -82,19 +82,49 @@ class TestSearchValidation:
     def test_validate_invalid_tbs(self):
         """Test validation of invalid tbs values."""
         invalid_tbs_values = ["invalid", "qdr:x", "yesterday", "last_week"]
-
+
         for invalid_tbs in invalid_tbs_values:
             request = SearchRequest(query="test", tbs=invalid_tbs)
             with pytest.raises(ValueError, match="Invalid tbs value"):
                 _validate_search_request(request)
 
+    def test_validate_custom_date_ranges(self):
+        """Test validation of custom date range formats."""
+        valid_custom_ranges = [
+            "cdr:1,cd_min:1/1/2024,cd_max:12/31/2024",
+            "cdr:1,cd_min:12/1/2024,cd_max:12/31/2024",
+            "cdr:1,cd_min:2/28/2023,cd_max:3/1/2023",
+            "cdr:1,cd_min:10/15/2023,cd_max:11/15/2023"
+        ]
+
+        for valid_range in valid_custom_ranges:
+            request = SearchRequest(query="test", tbs=valid_range)
+            validated = _validate_search_request(request)
+            assert validated == request
+
+    def test_validate_invalid_custom_date_ranges(self):
+        """Test validation of invalid custom date range formats."""
+        # Invalid custom date ranges
+        invalid_custom_ranges = [
+            "cdr:1,cd_min:2/28/2023",  # Missing cd_max
+            "cdr:1,cd_max:2/28/2023",  # Missing cd_min
+            "cdr:2,cd_min:1/1/2024,cd_max:12/31/2024",  # Wrong cdr value
+            "cdr:cd_min:1/1/2024,cd_max:12/31/2024",  # Missing :1
+            "custom:1,cd_min:1/1/2024,cd_max:12/31/2024"  # Wrong prefix
+        ]
+
+        for invalid_range in invalid_custom_ranges:
+            request = SearchRequest(query="test", tbs=invalid_range)
+            with pytest.raises(ValueError, match="Invalid"):
+                _validate_search_request(request)
+
     def test_validate_valid_requests(self):
         """Test that valid requests pass validation."""
         # Minimal valid request
         request = SearchRequest(query="test")
         validated = _validate_search_request(request)
         assert validated == request
-
+
         # Request with all optional parameters
         request = SearchRequest(
             query="test query",
@@ -107,7 +137,7 @@ class TestSearchValidation:
         )
         validated = _validate_search_request(request)
         assert validated == request
-
+
         # Request with object sources
         request = SearchRequest(
             query="test",
@@ -122,17 +152,17 @@ class TestSearchValidation:
         request = SearchRequest(query="test", limit=100)
         validated = _validate_search_request(request)
         assert validated == request
-
+
         # Maximum valid timeout
         request = SearchRequest(query="test", timeout=300000)
         validated = _validate_search_request(request)
         assert validated == request
-
+
         # Minimum valid limit
         request = SearchRequest(query="test", limit=1)
         validated = _validate_search_request(request)
         assert validated == request
-
+
         # Minimum valid timeout
         request = SearchRequest(query="test", timeout=1)
         validated = _validate_search_request(request)
@@ -191,16 +221,16 @@ class TestSearchRequestModel:
         data1 = request1.model_dump(by_alias=True)
         assert "ignore_invalid_urls" in data1  # No alias, uses snake_case
         assert data1["ignore_invalid_urls"] is None
-
+
         # Test with explicit False value
         request2 = SearchRequest(
             query="test",
             ignore_invalid_urls=False,
             scrape_options=ScrapeOptions(formats=["markdown"])
         )
-
+
         # Check that aliases are used in model_dump with by_alias=True
         data2 = request2.model_dump(by_alias=True)
         assert "ignore_invalid_urls" in data2  # No alias, uses snake_case
         assert "scrape_options" in data2  # No alias, uses snake_case
-        assert data2["ignore_invalid_urls"] is False
+        assert data2["ignore_invalid_urls"] is False
firecrawl/types.py
CHANGED
@@ -48,7 +48,9 @@ from .v2.types import (
     JsonFormat,
     FormatOption,
     SearchRequest,
- …
+    SearchResultWeb,
+    SearchResultNews,
+    SearchResultImages,
     SearchData,
     SearchResponse,
 
@@ -124,7 +126,9 @@ __all__ = [
     'JsonFormat',
     'FormatOption',
     'SearchRequest',
-    '
+    'SearchResultWeb',
+    'SearchResultNews',
+    'SearchResultImages',
     'SearchData',
     'SearchResponse',
 
firecrawl/v2/methods/aio/search.py
CHANGED
@@ -1,55 +1,172 @@
- …
-from
-from ...
+import re
+from typing import Dict, Any, Union, List, TypeVar, Type
+from ...types import (
+    SearchRequest,
+    SearchData,
+    Document,
+    SearchResultWeb,
+    SearchResultNews,
+    SearchResultImages,
+)
 from ...utils.http_client_async import AsyncHttpClient
 from ...utils.error_handler import handle_response_error
-from ...utils.validation import
+from ...utils.validation import validate_scrape_options, prepare_scrape_options
 
+T = TypeVar("T")
+
+async def search(
+    client: AsyncHttpClient,
+    request: SearchRequest
+) -> SearchData:
+    """
+    Async search for documents.
+
+    Args:
+        client: Async HTTP client instance
+        request: Search request
+
+    Returns:
+        SearchData with search results grouped by source type
+
+    Raises:
+        FirecrawlError: If the search operation fails
+    """
+    request_data = _prepare_search_request(request)
+    try:
+        response = await client.post("/v2/search", request_data)
+        if response.status_code != 200:
+            handle_response_error(response, "search")
+        response_data = response.json()
+        if not response_data.get("success"):
+            handle_response_error(response, "search")
+        data = response_data.get("data", {}) or {}
+        out = SearchData()
+        if "web" in data:
+            out.web = _transform_array(data["web"], SearchResultWeb)
+        if "news" in data:
+            out.news = _transform_array(data["news"], SearchResultNews)
+        if "images" in data:
+            out.images = _transform_array(data["images"], SearchResultImages)
+        return out
+    except Exception as err:
+        if hasattr(err, "response"):
+            handle_response_error(getattr(err, "response"), "search")
+        raise err
+
+def _transform_array(arr: List[Any], result_type: Type[T]) -> List[Union[T, Document]]:
+    """
+    Transforms an array of items into a list of result_type or Document.
+    If the item dict contains any of the special keys, it is treated as a Document.
+    Otherwise, it is treated as result_type.
+    If the item is not a dict, it is wrapped as result_type with url=item.
+    """
+    results: List[Union[T, Document]] = []
+    for item in arr:
+        if item and isinstance(item, dict):
+            if (
+                "markdown" in item or
+                "html" in item or
+                "rawHtml" in item or
+                "links" in item or
+                "screenshot" in item or
+                "changeTracking" in item or
+                "summary" in item or
+                "json" in item
+            ):
+                results.append(Document(**item))
+            else:
+                results.append(result_type(**item))
+        else:
+            results.append(result_type(url=item))
+    return results
+
+def _validate_search_request(request: SearchRequest) -> SearchRequest:
+    """
+    Validate and normalize search request.
+
+    Args:
+        request: Search request to validate
+
+    Returns:
+        Validated request
+
+    Raises:
+        ValueError: If request is invalid
+    """
+    if not request.query or not request.query.strip():
+        raise ValueError("Query cannot be empty")
+
+    if request.limit is not None:
+        if request.limit <= 0:
+            raise ValueError("Limit must be positive")
+        if request.limit > 100:
+            raise ValueError("Limit cannot exceed 100")
+
+    if request.timeout is not None:
+        if request.timeout <= 0:
+            raise ValueError("Timeout must be positive")
+        if request.timeout > 300000:
+            raise ValueError("Timeout cannot exceed 300000ms (5 minutes)")
+
+    if request.sources is not None:
+        valid_sources = {"web", "news", "images"}
+        for source in request.sources:
+            if isinstance(source, str):
+                if source not in valid_sources:
+                    raise ValueError(f"Invalid source type: {source}. Valid types: {valid_sources}")
+            elif hasattr(source, 'type'):
+                if source.type not in valid_sources:
+                    raise ValueError(f"Invalid source type: {source.type}. Valid types: {valid_sources}")
+
+    if request.location is not None:
+        if not isinstance(request.location, str) or len(request.location.strip()) == 0:
+            raise ValueError("Location must be a non-empty string")
+
+    if request.tbs is not None:
+        valid_tbs_values = {
+            "qdr:h", "qdr:d", "qdr:w", "qdr:m", "qdr:y",
+            "d", "w", "m", "y"
+        }
+        if request.tbs in valid_tbs_values:
+            pass
+        elif request.tbs.startswith("cdr:"):
+            custom_date_pattern = r"^cdr:1,cd_min:\d{1,2}/\d{1,2}/\d{4},cd_max:\d{1,2}/\d{1,2}/\d{4}$"
+            if not re.match(custom_date_pattern, request.tbs):
+                raise ValueError(f"Invalid custom date range format: {request.tbs}. Expected format: cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY")
+        else:
+            raise ValueError(f"Invalid tbs value: {request.tbs}. Valid values: {valid_tbs_values} or custom date range format: cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY")
 
-def _prepare_search_request(request: SearchRequest) -> Dict[str, Any]:
-    data = request.model_dump(exclude_none=True)
-    if request.ignore_invalid_urls is not None:
-        data["ignoreInvalidURLs"] = request.ignore_invalid_urls
-    data.pop("ignore_invalid_urls", None)
     if request.scrape_options is not None:
         validate_scrape_options(request.scrape_options)
-
+
+    return request
+
+def _prepare_search_request(request: SearchRequest) -> Dict[str, Any]:
+    """
+    Prepare a search request payload.
+
+    Args:
+        request: Search request
+
+    Returns:
+        Request payload dictionary
+    """
+    validated_request = _validate_search_request(request)
+    data = validated_request.model_dump(exclude_none=True, by_alias=True)
+
+    if "limit" not in data and validated_request.limit is not None:
+        data["limit"] = validated_request.limit
+    if "timeout" not in data and validated_request.timeout is not None:
+        data["timeout"] = validated_request.timeout
+
+    if validated_request.ignore_invalid_urls is not None:
+        data["ignoreInvalidURLs"] = validated_request.ignore_invalid_urls
+        data.pop("ignore_invalid_urls", None)
+
+    if validated_request.scrape_options is not None:
+        scrape_data = prepare_scrape_options(validated_request.scrape_options)
         if scrape_data:
             data["scrapeOptions"] = scrape_data
         data.pop("scrape_options", None)
-    return data
-
-
-async def search(client: AsyncHttpClient, request: SearchRequest) -> SearchData:
-    payload = _prepare_search_request(request)
-    response = await client.post("/v2/search", payload)
-    if response.status_code >= 400:
-        handle_response_error(response, "search")
-    body = response.json()
-    if not body.get("success"):
-        raise Exception(body.get("error", "Unknown error occurred"))
-
-    data = body.get("data", {})
-    search_data = SearchData()
-    for source_type, source_documents in data.items():
-        if isinstance(source_documents, list):
-            results = []
-            for doc_data in source_documents:
-                if isinstance(doc_data, dict):
-                    if request.scrape_options is not None and any(
-                        key in doc_data for key in ['markdown', 'html', 'rawHtml', 'links', 'summary', 'screenshot', 'changeTracking']
-                    ):
-                        normalized = normalize_document_input(doc_data)
-                        results.append(Document(**normalized))
-                    else:
-                        results.append(SearchResult(
-                            url=doc_data.get('url', ''),
-                            title=doc_data.get('title'),
-                            description=doc_data.get('description')
-                        ))
-                elif isinstance(doc_data, str):
-                    results.append(SearchResult(url=doc_data))
-        if hasattr(search_data, source_type):
-            setattr(search_data, source_type, results)
-    return search_data
 
+    return data
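The `_transform_array` helper added in both the sync and async modules decides, per item, whether to build a full `Document` (when a document key such as `markdown` or `html` is present) or the plain typed result. A small illustrative call, assuming the module path shown in this diff and using made-up sample items:

```python
from firecrawl.v2.methods.aio.search import _transform_array  # module path as in this diff
from firecrawl.v2.types import Document, SearchResultWeb

items = [
    {"url": "https://example.com", "title": "Example", "description": "plain search hit"},
    {"markdown": "# Docs", "html": "<h1>Docs</h1>"},  # document keys -> becomes a Document
    "https://example.com/bare-url",  # non-dict items are wrapped as result_type(url=item)
]

out = _transform_array(items, SearchResultWeb)
assert isinstance(out[0], SearchResultWeb)
assert isinstance(out[1], Document)
assert isinstance(out[2], SearchResultWeb) and out[2].url == "https://example.com/bare-url"
```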
|
firecrawl/v2/methods/search.py
CHANGED
|
@@ -2,11 +2,13 @@
|
|
|
2
2
|
Search functionality for Firecrawl v2 API.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
from
|
|
5
|
+
import re
|
|
6
|
+
from typing import Dict, Any, Union, List, TypeVar, Type
|
|
7
|
+
from ..types import SearchRequest, SearchData, Document, SearchResultWeb, SearchResultNews, SearchResultImages
|
|
7
8
|
from ..utils.normalize import normalize_document_input
|
|
8
9
|
from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
|
|
9
10
|
|
|
11
|
+
T = TypeVar("T")
|
|
10
12
|
|
|
11
13
|
def search(
|
|
12
14
|
client: HttpClient,
|
|
@@ -26,48 +28,56 @@ def search(
|
|
|
26
28
|
FirecrawlError: If the search operation fails
|
|
27
29
|
"""
|
|
28
30
|
request_data = _prepare_search_request(request)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
if request.scrape_options is not None and any(
|
|
52
|
-
key in doc_data for key in ['markdown', 'html', 'rawHtml', 'links', 'summary', 'screenshot', 'changeTracking']
|
|
53
|
-
):
|
|
54
|
-
normalized = normalize_document_input(doc_data)
|
|
55
|
-
results.append(Document(**normalized))
|
|
56
|
-
else:
|
|
57
|
-
# Minimal search result shape
|
|
58
|
-
results.append(SearchResult(
|
|
59
|
-
url=doc_data.get('url', ''),
|
|
60
|
-
title=doc_data.get('title'),
|
|
61
|
-
description=doc_data.get('description')
|
|
62
|
-
))
|
|
63
|
-
elif isinstance(doc_data, str):
|
|
64
|
-
results.append(SearchResult(url=doc_data))
|
|
65
|
-
|
|
66
|
-
if hasattr(search_data, source_type):
|
|
67
|
-
setattr(search_data, source_type, results)
|
|
68
|
-
|
|
69
|
-
return search_data
|
|
31
|
+
try:
|
|
32
|
+
response = client.post("/v2/search", request_data)
|
|
33
|
+
if response.status_code != 200:
|
|
34
|
+
handle_response_error(response, "search")
|
|
35
|
+
response_data = response.json()
|
|
36
|
+
if not response_data.get("success"):
|
|
37
|
+
handle_response_error(response, "search")
|
|
38
|
+
data = response_data.get("data", {}) or {}
|
|
39
|
+
out = SearchData()
|
|
40
|
+
if "web" in data:
|
|
41
|
+
out.web = _transform_array(data["web"], SearchResultWeb)
|
|
42
|
+
if "news" in data:
|
|
43
|
+
out.news = _transform_array(data["news"], SearchResultNews)
|
|
44
|
+
if "images" in data:
|
|
45
|
+
out.images = _transform_array(data["images"], SearchResultImages)
|
|
46
|
+
return out
|
|
47
|
+
except Exception as err:
|
|
48
|
+
# If the error is an HTTP error from requests, handle it
|
|
49
|
+
# (simulate isAxiosError by checking for requests' HTTPError or Response)
|
|
50
|
+
if hasattr(err, "response"):
|
|
51
|
+
handle_response_error(getattr(err, "response"), "search")
|
|
52
|
+
raise err
|
|
70
53
|
|
|
54
|
+
def _transform_array(arr: List[Any], result_type: Type[T]) -> List[Union[T, 'Document']]:
|
|
55
|
+
"""
|
|
56
|
+
Transforms an array of items into a list of result_type or Document.
|
|
57
|
+
If the item dict contains any of the special keys, it is treated as a Document.
|
|
58
|
+
Otherwise, it is treated as result_type.
|
|
59
|
+
If the item is not a dict, it is wrapped as result_type with url=item.
|
|
60
|
+
"""
|
|
61
|
+
results: List[Union[T, 'Document']] = []
|
|
62
|
+
for item in arr:
|
|
63
|
+
if item and isinstance(item, dict):
|
|
64
|
+
if (
|
|
65
|
+
"markdown" in item or
|
|
66
|
+
"html" in item or
|
|
67
|
+
"rawHtml" in item or
|
|
68
|
+
"links" in item or
|
|
69
|
+
"screenshot" in item or
|
|
70
|
+
"changeTracking" in item or
|
|
71
|
+
"summary" in item or
|
|
72
|
+
"json" in item
|
|
73
|
+
):
|
|
74
|
+
results.append(Document(**item))
|
|
75
|
+
else:
|
|
76
|
+
results.append(result_type(**item))
|
|
77
|
+
else:
|
|
78
|
+
# For non-dict items, assume it's a URL and wrap in result_type
|
|
79
|
+
results.append(result_type(url=item))
|
|
80
|
+
return results
|
|
71
81
|
|
|
72
82
|
def _validate_search_request(request: SearchRequest) -> SearchRequest:
|
|
73
83
|
"""
|
|
@@ -119,11 +129,18 @@ def _validate_search_request(request: SearchRequest) -> SearchRequest:
|
|
|
119
129
|
# Validate tbs (time-based search, if provided)
|
|
120
130
|
if request.tbs is not None:
|
|
121
131
|
valid_tbs_values = {
|
|
122
|
-
"qdr:d", "qdr:w", "qdr:m", "qdr:y", # Google time filters
|
|
132
|
+
"qdr:h", "qdr:d", "qdr:w", "qdr:m", "qdr:y", # Google time filters
|
|
123
133
|
"d", "w", "m", "y" # Short forms
|
|
124
134
|
}
|
|
125
|
-
|
|
126
|
-
|
|
135
|
+
|
|
136
|
+
if request.tbs in valid_tbs_values:
|
|
137
|
+
pass # Valid predefined value
|
|
138
|
+
elif request.tbs.startswith("cdr:"):
|
|
139
|
+
custom_date_pattern = r"^cdr:1,cd_min:\d{1,2}/\d{1,2}/\d{4},cd_max:\d{1,2}/\d{1,2}/\d{4}$"
|
|
140
|
+
if not re.match(custom_date_pattern, request.tbs):
|
|
141
|
+
raise ValueError(f"Invalid custom date range format: {request.tbs}. Expected format: cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY")
|
|
142
|
+
else:
|
|
143
|
+
raise ValueError(f"Invalid tbs value: {request.tbs}. Valid values: {valid_tbs_values} or custom date range format: cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY")
|
|
127
144
|
|
|
128
145
|
# Validate scrape_options (if provided)
|
|
129
146
|
if request.scrape_options is not None:
|
|
@@ -166,4 +183,4 @@ def _prepare_search_request(request: SearchRequest) -> Dict[str, Any]:
|
|
|
166
183
|
data["scrapeOptions"] = scrape_data
|
|
167
184
|
data.pop("scrape_options", None)
|
|
168
185
|
|
|
169
|
-
return data
|
|
186
|
+
return data
|
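For reference, the widened `tbs` handling above accepts the hourly filter and custom date ranges. A sketch of values that now pass or fail validation, assuming the private validator is imported the same way the unit tests use it:

```python
from firecrawl.v2.types import SearchRequest
from firecrawl.v2.methods.search import _validate_search_request  # private helper, as exercised by the unit tests

# Predefined Google time filters, now including the hourly filter "qdr:h"
_validate_search_request(SearchRequest(query="firecrawl", tbs="qdr:h"))

# Custom ranges must match cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY
_validate_search_request(
    SearchRequest(query="firecrawl", tbs="cdr:1,cd_min:1/1/2024,cd_max:12/31/2024")
)

try:
    _validate_search_request(SearchRequest(query="firecrawl", tbs="cdr:1,cd_min:2/28/2023"))
except ValueError as exc:  # missing cd_max -> "Invalid custom date range format: ..."
    print(exc)
```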
firecrawl/v2/types.py
CHANGED
@@ -327,11 +327,35 @@ class CrawlJob(BaseModel):
     next: Optional[str] = None
     data: List[Document] = []
 
-class
-    """A
+class SearchResultWeb(BaseModel):
+    """A web search result with URL, title, and description."""
     url: str
     title: Optional[str] = None
-    description: Optional[str] = None
+    description: Optional[str] = None
+
+class SearchResultNews(BaseModel):
+    """A news search result with URL, title, snippet, date, image URL, and position."""
+    title: Optional[str] = None
+    url: Optional[str] = None
+    snippet: Optional[str] = None
+    date: Optional[str] = None
+    image_url: Optional[str] = None
+    position: Optional[int] = None
+
+class SearchResultImages(BaseModel):
+    """An image search result with URL, title, image URL, image width, image height, and position."""
+    title: Optional[str] = None
+    image_url: Optional[str] = None
+    image_width: Optional[int] = None
+    image_height: Optional[int] = None
+    url: Optional[str] = None
+    position: Optional[int] = None
+
+class SearchData(BaseModel):
+    """Search results grouped by source type."""
+    web: Optional[List[Union[SearchResultWeb, Document]]] = None
+    news: Optional[List[Union[SearchResultNews, Document]]] = None
+    images: Optional[List[Union[SearchResultImages, Document]]] = None
 
 class MapDocument(Document):
     """A document from a map operation with URL and description."""
@@ -535,9 +559,9 @@ SearchResult = LinkResult
 
 class SearchData(BaseModel):
     """Search results grouped by source type."""
-    web: Optional[List[Union[
-    news: Optional[List[Union[
-    images: Optional[List[Union[
+    web: Optional[List[Union[SearchResultWeb, Document]]] = None
+    news: Optional[List[Union[SearchResultNews, Document]]] = None
+    images: Optional[List[Union[SearchResultImages, Document]]] = None
 
 class SearchResponse(BaseResponse[SearchData]):
     """Response from search operation."""
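With these grouped models, each source list can mix typed results and scraped `Document` objects. A minimal consumption sketch (field and parameter names as defined in this diff; the API key and query are placeholders):

```python
from firecrawl import Firecrawl
from firecrawl.types import Document, SearchResultNews, SearchResultWeb

firecrawl = Firecrawl(api_key="fc-...")  # placeholder key
results = firecrawl.search(query="firecrawl", sources=["web", "news"], limit=3)

for item in results.web or []:
    if isinstance(item, Document):        # possible when scrape_options is set
        print(item.markdown)
    elif isinstance(item, SearchResultWeb):
        print(item.url, item.title, item.description)

for item in results.news or []:
    if isinstance(item, SearchResultNews):
        print(item.date, item.snippet)
```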
{firecrawl-3.1.0.dist-info → firecrawl-3.2.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: firecrawl
-Version: 3.1.0
+Version: 3.2.1
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/firecrawl/firecrawl
 Author: Mendable.ai
@@ -38,8 +38,12 @@ Requires-Dist: httpx
 Requires-Dist: python-dotenv
 Requires-Dist: websockets
 Requires-Dist: nest-asyncio
-Requires-Dist: pydantic
+Requires-Dist: pydantic>=2.0
 Requires-Dist: aiohttp
+Dynamic: author
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: requires-python
 
 # Firecrawl Python SDK
 
{firecrawl-3.1.0.dist-info → firecrawl-3.2.1.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
-firecrawl/__init__.py,sha256=
+firecrawl/__init__.py,sha256=SoaA5_UyKZxxrqkDlOfvoYOdHgPCsagI1bQZRameUNw,2192
 firecrawl/client.py,sha256=2BGIRTiW2eR6q3wu_g2s3VTQtrHYauoDeNF1YklQpHo,11089
 firecrawl/firecrawl.backup.py,sha256=v1FEN3jR4g5Aupg4xp6SLkuFvYMQuUKND2YELbYjE6c,200430
-firecrawl/types.py,sha256=
+firecrawl/types.py,sha256=W9N2pqQuevEIIjYHN9rbDf31E-nwdCECqIn11Foz2T8,2836
 firecrawl/__tests__/e2e/v2/conftest.py,sha256=I28TUpN5j0-9gM79NlbrDS8Jlsheao657od2f-2xK0Y,2587
 firecrawl/__tests__/e2e/v2/test_async.py,sha256=ZXpf1FVOJgNclITglrxIyFwP4cOiqzWLicGaxIm70BQ,2526
 firecrawl/__tests__/e2e/v2/test_batch_scrape.py,sha256=H9GtuwHIFdOQ958SOVThi_kvDDxcXAK_ECRh95ogonQ,3265
@@ -9,7 +9,7 @@ firecrawl/__tests__/e2e/v2/test_crawl.py,sha256=cOssZvIwtghAtLiM1QdNLhPEwAxZ9j9u
 firecrawl/__tests__/e2e/v2/test_extract.py,sha256=HgvGiDlyWtFygiPo5EP44Dem1oWrwgRF-hfc1LfeVSU,1670
 firecrawl/__tests__/e2e/v2/test_map.py,sha256=9sT-Yq8V_8c9esl_bv5hnTA9WXb2Dg81kj6M-s0484c,1618
 firecrawl/__tests__/e2e/v2/test_scrape.py,sha256=psW2nfcA_hMFpZ4msL_VJWJTMa3Sidp11ubhftbm52g,5759
-firecrawl/__tests__/e2e/v2/test_search.py,sha256=
+firecrawl/__tests__/e2e/v2/test_search.py,sha256=tvU9_eg_3H5em0fhIwPPjuYe9BRAQ5St-BLM0l_FfVs,9079
 firecrawl/__tests__/e2e/v2/test_usage.py,sha256=JlBkYblhThua5qF2crRjsPpq4Ja0cBsdzxZ5zxXnQ_Y,805
 firecrawl/__tests__/e2e/v2/test_watcher.py,sha256=OPTKLhVAKWqXl2Tieo6zCN1xpEwZDsz-B977CVJgLMA,1932
 firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py,sha256=gJv_mLzzoAYftETB2TLkrpSfB5c04kaYgkD4hQTYsIg,2639
@@ -17,7 +17,7 @@ firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py,sha256=X-nk5tkYUYIkM6kTYl7GDjvx
 firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py,sha256=3CNRIFzgBMcOYOLhnKcK1k5a3Gy--u08EGDkL31uieM,1199
 firecrawl/__tests__/e2e/v2/aio/test_aio_map.py,sha256=nckl1kbiEaaTdu5lm__tOoTDG-txTYwwSH3KZEvyKzc,1199
 firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py,sha256=b17A7advBEjxrjdait2w8GHztZeKy_P3zZ3ixm5H7xw,4453
-firecrawl/__tests__/e2e/v2/aio/test_aio_search.py,sha256=
+firecrawl/__tests__/e2e/v2/aio/test_aio_search.py,sha256=ehV0Ai_hknAkaoE551j2lbktV4bi_J0h3FKzC7G15Iw,8246
 firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py,sha256=Dh9BVo48NKSZOKgLbO7n8fpMjvYmeMXDFzbIhnCTMhE,1014
 firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py,sha256=hwES4Nu5c0hniZ9heIPDfvh_2JmJ2wPoX9ULTZ0Asjs,1471
 firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py,sha256=HeOxN-sPYSssytcIRAEicJSZsFt_Oa5qGXAtdumR54c,4040
@@ -27,7 +27,7 @@ firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py,sha256=kErOmHSD01eM
 firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py,sha256=toVcgnMp_cFeYsIUuyKGEWZGp0nAAkzaeFGUbY0zY0o,1868
 firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py,sha256=wDOslsA5BN4kyezlaT5GeMv_Ifn8f461EaA7i5ujnaQ,3482
 firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py,sha256=14lUgFpQsiosgMKjDustBRVE0zXnHujBI76F8BC5PZ4,6072
-firecrawl/__tests__/unit/v2/methods/test_search_validation.py,sha256=
+firecrawl/__tests__/unit/v2/methods/test_search_validation.py,sha256=7UGcNHpQzCpZbAPYjthfdPFWmAPcoApY-ED-khtuANs,9498
 firecrawl/__tests__/unit/v2/methods/test_usage_types.py,sha256=cCHHfa6agSjD0brQ9rcAcw2kaI9riUH5C0dXV-fqktg,591
 firecrawl/__tests__/unit/v2/methods/test_webhook.py,sha256=AvvW-bKpUA--Lvtif2bmUIp-AxiaMJ29ie1i9dk8WbI,4586
 firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py,sha256=9azJxVvDOBqUevLp-wBF9gF7Ptj-7nN6LOkPQncFX2M,456
@@ -45,7 +45,7 @@ firecrawl/v1/client.py,sha256=sydurfEFTsXyowyaGryA1lkPxN_r9Nf6iQpM43OwJyM,201672
 firecrawl/v2/__init__.py,sha256=Jc6a8tBjYG5OPkjDM5pl-notyys-7DEj7PLEfepv3fc,137
 firecrawl/v2/client.py,sha256=P6WAzwYGLLIANTrqAM-K4EUdGWQoFsi-zCjBibbxKQw,30507
 firecrawl/v2/client_async.py,sha256=zwxHis1bSh0tSF1480ze-4XDQEDJ5yDur1ZqtL94dwc,10127
-firecrawl/v2/types.py,sha256=
+firecrawl/v2/types.py,sha256=bbHXPWJp6Kvjx9rKkTPyWZwdqVTErS4VYZKfHsb7ZQc,21137
 firecrawl/v2/watcher.py,sha256=FOU71tqSKxgeuGycu4ye0SLc2dw7clIcoQjPsi-4Csc,14229
 firecrawl/v2/watcher_async.py,sha256=AVjW2mgABniolSsauK4u0FW8ya6WzRUdyEg2R-8vGCw,10278
 firecrawl/v2/methods/batch.py,sha256=us7zUGl7u9ZDIEk2J3rNqj87bkaNjXU27SMFW_fdcg8,11932
@@ -53,7 +53,7 @@ firecrawl/v2/methods/crawl.py,sha256=4ZUmanHNuNtq9wbKMAZ3lenuPcNdOaV0kYXqMI5XJJ8
 firecrawl/v2/methods/extract.py,sha256=-Jr4BtraU3b7hd3JIY73V-S69rUclxyXyUpoQb6DCQk,4274
 firecrawl/v2/methods/map.py,sha256=4SADb0-lkbdOWDmO6k8_TzK0yRti5xsN40N45nUl9uA,2592
 firecrawl/v2/methods/scrape.py,sha256=CSHBwC-P91UfrW3zHirjNAs2h899FKcWvd1DY_4fJdo,1921
-firecrawl/v2/methods/search.py,sha256=
+firecrawl/v2/methods/search.py,sha256=c6tkDQGYZeLsPABPVfzhjalsasnhlien3w80aoe89t0,7077
 firecrawl/v2/methods/usage.py,sha256=OJlkxwaB-AAtgO3WLr9QiqBRmjdh6GVhroCgleegupQ,1460
 firecrawl/v2/methods/aio/__init__.py,sha256=RocMJnGwnLIvGu3G8ZvY8INkipC7WHZiu2bE31eSyJs,35
 firecrawl/v2/methods/aio/batch.py,sha256=GS_xsd_Uib1fxFITBK1sH88VGzFMrIcqJVQqOvMQ540,3735
@@ -61,19 +61,19 @@ firecrawl/v2/methods/aio/crawl.py,sha256=pC6bHVk30Hj1EJdAChxpMOg0Xx_GVqq4tIlvU2e
 firecrawl/v2/methods/aio/extract.py,sha256=IfNr2ETqt4dR73JFzrEYI4kk5vpKnJOG0BmPEjGEoO4,4217
 firecrawl/v2/methods/aio/map.py,sha256=EuT-5A0cQr_e5SBfEZ6pnl8u0JUwEEvSwhyT2N-QoKU,2326
 firecrawl/v2/methods/aio/scrape.py,sha256=ilA9qco8YGwCFpE0PN1XBQUyuHPQwH2QioZ-xsfxhgU,1386
-firecrawl/v2/methods/aio/search.py,sha256=
+firecrawl/v2/methods/aio/search.py,sha256=_TqTFGQLlOCCLNdWcOvakTqPGD2r9AOlBg8RasOgmvw,6177
 firecrawl/v2/methods/aio/usage.py,sha256=OtBi6X-aT09MMR2dpm3vBCm9JrJZIJLCQ8jJ3L7vie4,1606
 firecrawl/v2/utils/__init__.py,sha256=i1GgxySmqEXpWSBQCu3iZBPIJG7fXj0QXCDWGwerWNs,338
 firecrawl/v2/utils/error_handler.py,sha256=Iuf916dHphDY8ObNNlWy75628DFeJ0Rv8ljRp4LttLE,4199
 firecrawl/v2/utils/get_version.py,sha256=0CxW_41q2hlzIxEWOivUCaYw3GFiSIH32RPUMcIgwAY,492
 firecrawl/v2/utils/http_client.py,sha256=_n8mp4xi6GGihg662Lsv6TSlvw9zykyADwEk0fg8mYA,4873
-firecrawl/v2/utils/http_client_async.py,sha256=
+firecrawl/v2/utils/http_client_async.py,sha256=iy89_bk2HS3afSRHZ8016eMCa9Fk-5MFTntcOHfbPgE,1936
 firecrawl/v2/utils/normalize.py,sha256=nlTU6QRghT1YKZzNZlIQj4STSRuSUGrS9cCErZIcY5w,3636
 firecrawl/v2/utils/validation.py,sha256=L8by7z-t6GuMGIYkK7il1BM8d-4_-sAdG9hDMF_LeG4,14518
+firecrawl-3.2.1.dist-info/licenses/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
 tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
 tests/test_timeout_conversion.py,sha256=PWlIEMASQNhu4cp1OW_ebklnE9NCiigPnEFCtI5N3w0,3996
-firecrawl-3.1.
-firecrawl-3.1.
-firecrawl-3.1.
-firecrawl-3.1.
-firecrawl-3.1.0.dist-info/RECORD,,
+firecrawl-3.2.1.dist-info/METADATA,sha256=k1UNlt3XP09k-9i3oAkm4ElYkZOflYHkFIWuOHD4YsU,7392
+firecrawl-3.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+firecrawl-3.2.1.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
+firecrawl-3.2.1.dist-info/RECORD,,
{firecrawl-3.1.0.dist-info → firecrawl-3.2.1.dist-info/licenses}/LICENSE
File without changes
{firecrawl-3.1.0.dist-info → firecrawl-3.2.1.dist-info}/top_level.txt
File without changes