firecrawl 1.7.1__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl might be problematic.
- firecrawl/__init__.py +1 -1
- firecrawl/__tests__/v1/e2e_withAuth/test.py +66 -0
- firecrawl/firecrawl.py +167 -23
- {firecrawl-1.7.1.dist-info → firecrawl-1.9.0.dist-info}/METADATA +1 -1
- firecrawl-1.9.0.dist-info/RECORD +11 -0
- firecrawl-1.7.1.dist-info/RECORD +0 -11
- {firecrawl-1.7.1.dist-info → firecrawl-1.9.0.dist-info}/LICENSE +0 -0
- {firecrawl-1.7.1.dist-info → firecrawl-1.9.0.dist-info}/WHEEL +0 -0
- {firecrawl-1.7.1.dist-info → firecrawl-1.9.0.dist-info}/top_level.txt +0 -0
firecrawl/__tests__/v1/e2e_withAuth/test.py
CHANGED
@@ -371,4 +371,70 @@ def test_search_e2e():
     # assert isinstance(llm_extraction['supports_sso'], bool)
     # assert isinstance(llm_extraction['is_open_source'], bool)
 
+def test_search_with_string_query():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.search("firecrawl")
+    assert response["success"] is True
+    assert len(response["data"]) > 0
+    assert response["data"][0]["markdown"] is not None
+    assert response["data"][0]["metadata"] is not None
+    assert response["data"][0]["metadata"]["title"] is not None
+    assert response["data"][0]["metadata"]["description"] is not None
+
+def test_search_with_params_dict():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.search("firecrawl", {
+        "limit": 3,
+        "lang": "en",
+        "country": "us",
+        "scrapeOptions": {
+            "formats": ["markdown", "html", "links"],
+            "onlyMainContent": True
+        }
+    })
+    assert response["success"] is True
+    assert len(response["data"]) <= 3
+    for doc in response["data"]:
+        assert doc["markdown"] is not None
+        assert doc["html"] is not None
+        assert doc["links"] is not None
+        assert doc["metadata"] is not None
+        assert doc["metadata"]["title"] is not None
+        assert doc["metadata"]["description"] is not None
+
+def test_search_with_params_object():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    params = SearchParams(
+        query="firecrawl",
+        limit=3,
+        lang="en",
+        country="us",
+        scrapeOptions={
+            "formats": ["markdown", "html", "links"],
+            "onlyMainContent": True
+        }
+    )
+    response = app.search(params.query, params)
+    assert response["success"] is True
+    assert len(response["data"]) <= 3
+    for doc in response["data"]:
+        assert doc["markdown"] is not None
+        assert doc["html"] is not None
+        assert doc["links"] is not None
+        assert doc["metadata"] is not None
+        assert doc["metadata"]["title"] is not None
+        assert doc["metadata"]["description"] is not None
+
+def test_search_invalid_api_key():
+    app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+    with pytest.raises(Exception) as e:
+        app.search("test query")
+    assert "404" in str(e.value)
+
+def test_search_with_invalid_params():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    with pytest.raises(Exception) as e:
+        app.search("test query", {"invalid_param": "value"})
+    assert "ValidationError" in str(e.value)
+
 
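The new tests reference pytest, FirecrawlApp, SearchParams, API_URL, and TEST_API_KEY, but the file header is not part of this diff. A plausible preamble, mirroring the existing e2e test files; the dotenv loading and the default URL are assumptions, not shown in the diff:

import os
import pytest
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from firecrawl.firecrawl import SearchParams

load_dotenv()
API_URL = os.getenv('API_URL', 'http://localhost:3002')  # assumed default
TEST_API_KEY = os.getenv('TEST_API_KEY')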
firecrawl/firecrawl.py
CHANGED
@@ -21,7 +21,28 @@ import websockets
 
 logger : logging.Logger = logging.getLogger("firecrawl")
 
+class SearchParams(pydantic.BaseModel):
+    query: str
+    limit: Optional[int] = 5
+    tbs: Optional[str] = None
+    filter: Optional[str] = None
+    lang: Optional[str] = "en"
+    country: Optional[str] = "us"
+    location: Optional[str] = None
+    origin: Optional[str] = "api"
+    timeout: Optional[int] = 60000
+    scrapeOptions: Optional[Dict[str, Any]] = None
+
 class FirecrawlApp:
+    class SearchResponse(pydantic.BaseModel):
+        """
+        Response from the search operation.
+        """
+        success: bool
+        data: List[Dict[str, Any]]
+        warning: Optional[str] = None
+        error: Optional[str] = None
+
     class ExtractParams(pydantic.BaseModel):
         """
         Parameters for the extract operation.
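A note on the model introduced above: the new search method (next hunk) serializes SearchParams with .dict(exclude_none=True), a pydantic v1-style call, so fields left at None are dropped while non-None defaults always go out on the wire. A minimal, self-contained sketch of that behavior (field list abridged):

from typing import Any, Dict, Optional
import pydantic

class SearchParams(pydantic.BaseModel):
    query: str
    limit: Optional[int] = 5
    tbs: Optional[str] = None
    lang: Optional[str] = "en"
    country: Optional[str] = "us"
    origin: Optional[str] = "api"
    timeout: Optional[int] = 60000
    scrapeOptions: Optional[Dict[str, Any]] = None

# tbs and scrapeOptions (None) are dropped; limit, lang, country, origin, timeout are kept
print(SearchParams(query="firecrawl", limit=3).dict(exclude_none=True))
# {'query': 'firecrawl', 'limit': 3, 'lang': 'en', 'country': 'us', 'origin': 'api', 'timeout': 60000}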
@@ -109,22 +130,36 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'scrape URL')
 
-    def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
+    def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]:
         """
-
+        Search for content using the Firecrawl API.
 
         Args:
-            query (str): The search query.
-            params (Optional[Dict[str, Any]]): Additional
+            query (str): The search query string.
+            params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters.
 
         Returns:
-            Any: The search
-
-        Raises:
-            NotImplementedError: If the search request is attempted on API version v1.
-            Exception: If the search request fails.
+            Dict[str, Any]: The search response containing success status and search results.
         """
-        raise NotImplementedError("Search is not supported in v1.")
+        if params is None:
+            params = {}
+
+        if isinstance(params, dict):
+            search_params = SearchParams(query=query, **params)
+        else:
+            search_params = params
+            search_params.query = query
+
+        response = requests.post(
+            f"{self.api_url}/v1/search",
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            json=search_params.dict(exclude_none=True)
+        )
+
+        if response.status_code != 200:
+            raise Exception(f"Request failed with status code {response.status_code}")
+
+        return response.json()
 
     def crawl_url(self, url: str,
                   params: Optional[Dict[str, Any]] = None,
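In practice the reworked method accepts either a plain dict or a SearchParams instance, and both paths go through SearchParams validation before the POST to /v1/search. A hedged usage sketch; the API key is a placeholder, and the import paths assume the definitions shown in this diff:

from firecrawl import FirecrawlApp
from firecrawl.firecrawl import SearchParams

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

# dict-style params are folded into SearchParams(query=..., **params)
results = app.search("firecrawl", {"limit": 3, "country": "us"})
for doc in results.get("data", []):
    print(doc["metadata"]["title"])

# a SearchParams instance also works; search() overwrites its query field
params = SearchParams(query="ignored", limit=3)
results = app.search("firecrawl", params)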
@@ -215,6 +250,8 @@ class FirecrawlApp:
                 if 'data' in status_data:
                     data = status_data['data']
                     while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            break
                         next_url = status_data.get('next')
                         if not next_url:
                             logger.warning("Expected 'next' URL is missing.")
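The two added lines guard the pagination loop: previously, a response that kept advertising a 'next' URL while returning an empty 'data' page would spin forever. The pattern in isolation, as a runnable sketch (drain and fetch are illustrative names, not SDK code):

def drain(first_page, fetch):
    # fetch: callable that takes a 'next' URL and returns the following page dict
    data = list(first_page.get("data", []))
    page = first_page
    while "next" in page:
        if len(page["data"]) == 0:
            break  # empty page: stop instead of chasing 'next' forever
        page = fetch(page["next"])
        data.extend(page.get("data", []))
    return data

# a page that always advertises 'next' but carries no data no longer loops
print(drain({"data": [], "next": "http://example.invalid/page2"},
            fetch=lambda url: {"data": [], "next": url}))  # -> []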
@@ -231,17 +268,25 @@ class FirecrawlApp:
                             logger.error(f"Error during pagination request: {e}")
                             break
                     status_data['data'] = data
-
-            return {
-                'success': True,
+
+            response = {
                 'status': status_data.get('status'),
                 'total': status_data.get('total'),
                 'completed': status_data.get('completed'),
                 'creditsUsed': status_data.get('creditsUsed'),
                 'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
             }
         else:
             self._handle_error(response, 'check crawl status')
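The effect of this hunk: check_crawl_status no longer hard-codes 'success': True, and it attaches 'error' and 'next' only when the server sent them. A sketch of consuming the new shape (require_complete is an illustrative helper, not SDK code; key names are as in the diff):

def require_complete(app, crawl_id):
    # crawl_id would come from a prior async_crawl_url call
    status = app.check_crawl_status(crawl_id)
    if not status["success"]:
        raise RuntimeError(status.get("error", "crawl failed"))
    if "next" in status:
        # pagination stopped early; more pages remain server-side
        print("partial:", len(status["data"]), "of", status["total"], "documents")
    return status["data"]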
@@ -424,6 +469,8 @@ class FirecrawlApp:
                 if 'data' in status_data:
                     data = status_data['data']
                     while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            break
                         next_url = status_data.get('next')
                         if not next_url:
                             logger.warning("Expected 'next' URL is missing.")
@@ -441,16 +488,24 @@ class FirecrawlApp:
                             break
                     status_data['data'] = data
 
-            return {
-                'success': True,
+            response = {
                 'status': status_data.get('status'),
                 'total': status_data.get('total'),
                 'completed': status_data.get('completed'),
                 'creditsUsed': status_data.get('creditsUsed'),
                 'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
             }
         else:
             self._handle_error(response, 'check batch scrape status')
@@ -483,10 +538,12 @@ class FirecrawlApp:
         request_data = {
             **jsonData,
             'allowExternalLinks': params.get('allow_external_links', False),
-            'schema': schema
+            'schema': schema,
+            'origin': 'api-sdk'
         }
 
         try:
+            # Send the initial extract request
            response = self._post_request(
                 f'{self.api_url}/v1/extract',
                 request_data,
@@ -495,7 +552,29 @@ class FirecrawlApp:
             if response.status_code == 200:
                 data = response.json()
                 if data['success']:
-                    return data
+                    job_id = data.get('id')
+                    if not job_id:
+                        raise Exception('Job ID not returned from extract request.')
+
+                    # Poll for the extract status
+                    while True:
+                        status_response = self._get_request(
+                            f'{self.api_url}/v1/extract/{job_id}',
+                            headers
+                        )
+                        if status_response.status_code == 200:
+                            status_data = status_response.json()
+                            if status_data['status'] == 'completed':
+                                if status_data['success']:
+                                    return status_data
+                                else:
+                                    raise Exception(f'Failed to extract. Error: {status_data["error"]}')
+                            elif status_data['status'] in ['failed', 'cancelled']:
+                                raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
+                        else:
+                            self._handle_error(status_response, "extract-status")
+
+                        time.sleep(2)  # Polling interval
             else:
                 raise Exception(f'Failed to extract. Error: {data["error"]}')
         else:
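With this hunk, extract() becomes blocking: it submits the job, then polls /v1/extract/{job_id} every 2 seconds until the job completes, fails, or is cancelled. No timeout parameter appears in the code shown, so a caller who needs one must add it externally. A hedged sketch of one way; the wrapper is not part of the SDK, and on timeout the worker thread keeps polling in the background, only the caller is unblocked:

import concurrent.futures

def extract_with_timeout(app, urls, params=None, timeout_s=120):
    # run the blocking extract() in a worker thread and bound the wait
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = pool.submit(app.extract, urls, params)
        return future.result(timeout=timeout_s)  # raises TimeoutError if exceeded
    finally:
        pool.shutdown(wait=False)  # don't block on the still-polling worker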
@@ -504,6 +583,69 @@ class FirecrawlApp:
             raise ValueError(str(e), 500)
 
         return {'success': False, 'error': "Internal server error."}
+
+    def get_extract_status(self, job_id: str) -> Dict[str, Any]:
+        """
+        Retrieve the status of an extract job.
+
+        Args:
+            job_id (str): The ID of the extract job.
+
+        Returns:
+            Dict[str, Any]: The status of the extract job.
+
+        Raises:
+            ValueError: If there is an error retrieving the status.
+        """
+        headers = self._prepare_headers()
+        try:
+            response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
+            if response.status_code == 200:
+                return response.json()
+            else:
+                self._handle_error(response, "get extract status")
+        except Exception as e:
+            raise ValueError(str(e), 500)
+
+    def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Initiate an asynchronous extract job.
+
+        Args:
+            urls (List[str]): The URLs to extract data from.
+            params (Optional[Dict[str, Any]]): Additional parameters for the extract request.
+            idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
+
+        Returns:
+            Dict[str, Any]: The response from the extract operation.
+
+        Raises:
+            ValueError: If there is an error initiating the extract job.
+        """
+        headers = self._prepare_headers(idempotency_key)
+
+        schema = params.get('schema') if params else None
+        if schema:
+            if hasattr(schema, 'model_json_schema'):
+                # Convert Pydantic model to JSON schema
+                schema = schema.model_json_schema()
+            # Otherwise assume it's already a JSON schema dict
+
+        jsonData = {'urls': urls, **(params or {})}
+        request_data = {
+            **jsonData,
+            'allowExternalLinks': params.get('allow_external_links', False) if params else False,
+            'schema': schema
+        }
+
+        try:
+            response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
+            if response.status_code == 200:
+                return response.json()
+            else:
+                self._handle_error(response, "async extract")
+        except Exception as e:
+            raise ValueError(str(e), 500)
 
     def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
         """
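The two new methods split the extract workflow into fire-and-forget plus manual polling. A hedged sketch of driving them directly; extract_async is an illustrative driver, the 'id' key mirrors the job ID the blocking extract() polls on, and the params dict is only an example:

import time

def extract_async(app, urls, params=None):
    # illustrative driver, not SDK code
    job = app.async_extract(urls, params)
    job_id = job.get("id")
    while True:
        status = app.get_extract_status(job_id)
        if status["status"] in ("completed", "failed", "cancelled"):
            return status
        time.sleep(2)  # same interval the blocking extract() uses

# e.g. extract_async(app, ["https://docs.firecrawl.dev"], {"prompt": "Summarize the page."})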
@@ -634,6 +776,8 @@ class FirecrawlApp:
                 if 'data' in status_data:
                     data = status_data['data']
                     while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            break
                         status_response = self._get_request(status_data['next'], headers)
                         status_data = status_response.json()
                         data.extend(status_data.get('data', []))
firecrawl-1.9.0.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+firecrawl/__init__.py,sha256=5ubhPauv4SGtK_XBudjfb2AgdfGzSMetytrO2nb9QII,2543
+firecrawl/firecrawl.py,sha256=VuSKgvzxF3G-1MWK7INR1NBae3jYx6kES-kDyqkqD40,35962
+firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/e2e_withAuth/test.py,sha256=6OawnVF4IPeGyXg_Izi3t8U7MyT90roaJBJIG5UfllM,7935
+firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=tL5kJJ4el37Wc-Z2TRSuSWwWG2M40h3VPxHYuWijD00,19888
+firecrawl-1.9.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl-1.9.0.dist-info/METADATA,sha256=Hz7bNsZqTKMLpZ-wP3myJXRSM1MOUbCoouQy9DIk78c,10631
+firecrawl-1.9.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl-1.9.0.dist-info/top_level.txt,sha256=jTvz79zWhiyAezfmmHe4FQ-hR60C59UU5FrjMjijLu8,10
+firecrawl-1.9.0.dist-info/RECORD,,
firecrawl-1.7.1.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
-firecrawl/__init__.py,sha256=tZdRrVCtEZ7K0rU3ivzNX_EfEElo3y5zBhk9Had3W3o,2543
-firecrawl/firecrawl.py,sha256=MFbF6gxXFQXe2kKeEE-PoQZnAbR00Ip163GD9MZh9ZM,30578
-firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-firecrawl/__tests__/e2e_withAuth/test.py,sha256=6OawnVF4IPeGyXg_Izi3t8U7MyT90roaJBJIG5UfllM,7935
-firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=Qad0xRPboRdlH6Q5o2985b4xjpjw2jr9LCik-GbXaZ0,17470
-firecrawl-1.7.1.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
-firecrawl-1.7.1.dist-info/METADATA,sha256=AleNQvsesEq0Uzt1R51p6mqWc43O5JQi3B_YYG6xr84,10631
-firecrawl-1.7.1.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
-firecrawl-1.7.1.dist-info/top_level.txt,sha256=jTvz79zWhiyAezfmmHe4FQ-hR60C59UU5FrjMjijLu8,10
-firecrawl-1.7.1.dist-info/RECORD,,
{firecrawl-1.7.1.dist-info → firecrawl-1.9.0.dist-info}/LICENSE
File without changes
{firecrawl-1.7.1.dist-info → firecrawl-1.9.0.dist-info}/WHEEL
File without changes
{firecrawl-1.7.1.dist-info → firecrawl-1.9.0.dist-info}/top_level.txt
File without changes