firecrawl 1.8.0__tar.gz → 1.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl might be problematic. Click here for more details.
- {firecrawl-1.8.0 → firecrawl-1.9.0}/PKG-INFO +1 -1
- {firecrawl-1.8.0 → firecrawl-1.9.0}/firecrawl/__init__.py +1 -1
- {firecrawl-1.8.0 → firecrawl-1.9.0}/firecrawl/firecrawl.py +122 -13
- {firecrawl-1.8.0 → firecrawl-1.9.0}/firecrawl.egg-info/PKG-INFO +1 -1
- {firecrawl-1.8.0 → firecrawl-1.9.0}/LICENSE +0 -0
- {firecrawl-1.8.0 → firecrawl-1.9.0}/README.md +0 -0
- {firecrawl-1.8.0 → firecrawl-1.9.0}/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- {firecrawl-1.8.0 → firecrawl-1.9.0}/firecrawl/__tests__/e2e_withAuth/test.py +0 -0
- {firecrawl-1.8.0 → firecrawl-1.9.0}/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- {firecrawl-1.8.0 → firecrawl-1.9.0}/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -0
- {firecrawl-1.8.0 → firecrawl-1.9.0}/firecrawl.egg-info/SOURCES.txt +0 -0
- {firecrawl-1.8.0 → firecrawl-1.9.0}/firecrawl.egg-info/dependency_links.txt +0 -0
- {firecrawl-1.8.0 → firecrawl-1.9.0}/firecrawl.egg-info/requires.txt +0 -0
- {firecrawl-1.8.0 → firecrawl-1.9.0}/firecrawl.egg-info/top_level.txt +0 -0
- {firecrawl-1.8.0 → firecrawl-1.9.0}/pyproject.toml +0 -0
- {firecrawl-1.8.0 → firecrawl-1.9.0}/setup.cfg +0 -0
- {firecrawl-1.8.0 → firecrawl-1.9.0}/setup.py +0 -0
|
@@ -250,6 +250,8 @@ class FirecrawlApp:
|
|
|
250
250
|
if 'data' in status_data:
|
|
251
251
|
data = status_data['data']
|
|
252
252
|
while 'next' in status_data:
|
|
253
|
+
if len(status_data['data']) == 0:
|
|
254
|
+
break
|
|
253
255
|
next_url = status_data.get('next')
|
|
254
256
|
if not next_url:
|
|
255
257
|
logger.warning("Expected 'next' URL is missing.")
|
|
@@ -266,17 +268,25 @@ class FirecrawlApp:
|
|
|
266
268
|
logger.error(f"Error during pagination request: {e}")
|
|
267
269
|
break
|
|
268
270
|
status_data['data'] = data
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
'success': True,
|
|
271
|
+
|
|
272
|
+
response = {
|
|
272
273
|
'status': status_data.get('status'),
|
|
273
274
|
'total': status_data.get('total'),
|
|
274
275
|
'completed': status_data.get('completed'),
|
|
275
276
|
'creditsUsed': status_data.get('creditsUsed'),
|
|
276
277
|
'expiresAt': status_data.get('expiresAt'),
|
|
277
|
-
'data': status_data.get('data')
|
|
278
|
-
|
|
279
|
-
|
|
278
|
+
'data': status_data.get('data')
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
if 'error' in status_data:
|
|
282
|
+
response['error'] = status_data['error']
|
|
283
|
+
|
|
284
|
+
if 'next' in status_data:
|
|
285
|
+
response['next'] = status_data['next']
|
|
286
|
+
|
|
287
|
+
return {
|
|
288
|
+
'success': False if 'error' in status_data else True,
|
|
289
|
+
**response
|
|
280
290
|
}
|
|
281
291
|
else:
|
|
282
292
|
self._handle_error(response, 'check crawl status')
|
|
@@ -459,6 +469,8 @@ class FirecrawlApp:
|
|
|
459
469
|
if 'data' in status_data:
|
|
460
470
|
data = status_data['data']
|
|
461
471
|
while 'next' in status_data:
|
|
472
|
+
if len(status_data['data']) == 0:
|
|
473
|
+
break
|
|
462
474
|
next_url = status_data.get('next')
|
|
463
475
|
if not next_url:
|
|
464
476
|
logger.warning("Expected 'next' URL is missing.")
|
|
@@ -476,16 +488,24 @@ class FirecrawlApp:
|
|
|
476
488
|
break
|
|
477
489
|
status_data['data'] = data
|
|
478
490
|
|
|
479
|
-
|
|
480
|
-
'success': True,
|
|
491
|
+
response = {
|
|
481
492
|
'status': status_data.get('status'),
|
|
482
493
|
'total': status_data.get('total'),
|
|
483
494
|
'completed': status_data.get('completed'),
|
|
484
495
|
'creditsUsed': status_data.get('creditsUsed'),
|
|
485
496
|
'expiresAt': status_data.get('expiresAt'),
|
|
486
|
-
'data': status_data.get('data')
|
|
487
|
-
|
|
488
|
-
|
|
497
|
+
'data': status_data.get('data')
|
|
498
|
+
}
|
|
499
|
+
|
|
500
|
+
if 'error' in status_data:
|
|
501
|
+
response['error'] = status_data['error']
|
|
502
|
+
|
|
503
|
+
if 'next' in status_data:
|
|
504
|
+
response['next'] = status_data['next']
|
|
505
|
+
|
|
506
|
+
return {
|
|
507
|
+
'success': False if 'error' in status_data else True,
|
|
508
|
+
**response
|
|
489
509
|
}
|
|
490
510
|
else:
|
|
491
511
|
self._handle_error(response, 'check batch scrape status')
|
|
@@ -518,10 +538,12 @@ class FirecrawlApp:
|
|
|
518
538
|
request_data = {
|
|
519
539
|
**jsonData,
|
|
520
540
|
'allowExternalLinks': params.get('allow_external_links', False),
|
|
521
|
-
'schema': schema
|
|
541
|
+
'schema': schema,
|
|
542
|
+
'origin': 'api-sdk'
|
|
522
543
|
}
|
|
523
544
|
|
|
524
545
|
try:
|
|
546
|
+
# Send the initial extract request
|
|
525
547
|
response = self._post_request(
|
|
526
548
|
f'{self.api_url}/v1/extract',
|
|
527
549
|
request_data,
|
|
@@ -530,7 +552,29 @@ class FirecrawlApp:
|
|
|
530
552
|
if response.status_code == 200:
|
|
531
553
|
data = response.json()
|
|
532
554
|
if data['success']:
|
|
533
|
-
|
|
555
|
+
job_id = data.get('id')
|
|
556
|
+
if not job_id:
|
|
557
|
+
raise Exception('Job ID not returned from extract request.')
|
|
558
|
+
|
|
559
|
+
# Poll for the extract status
|
|
560
|
+
while True:
|
|
561
|
+
status_response = self._get_request(
|
|
562
|
+
f'{self.api_url}/v1/extract/{job_id}',
|
|
563
|
+
headers
|
|
564
|
+
)
|
|
565
|
+
if status_response.status_code == 200:
|
|
566
|
+
status_data = status_response.json()
|
|
567
|
+
if status_data['status'] == 'completed':
|
|
568
|
+
if status_data['success']:
|
|
569
|
+
return status_data
|
|
570
|
+
else:
|
|
571
|
+
raise Exception(f'Failed to extract. Error: {status_data["error"]}')
|
|
572
|
+
elif status_data['status'] in ['failed', 'cancelled']:
|
|
573
|
+
raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
|
|
574
|
+
else:
|
|
575
|
+
self._handle_error(status_response, "extract-status")
|
|
576
|
+
|
|
577
|
+
time.sleep(2) # Polling interval
|
|
534
578
|
else:
|
|
535
579
|
raise Exception(f'Failed to extract. Error: {data["error"]}')
|
|
536
580
|
else:
|
|
@@ -539,6 +583,69 @@ class FirecrawlApp:
|
|
|
539
583
|
raise ValueError(str(e), 500)
|
|
540
584
|
|
|
541
585
|
return {'success': False, 'error': "Internal server error."}
|
|
586
|
+
|
|
587
|
+
def get_extract_status(self, job_id: str) -> Dict[str, Any]:
    """
    Retrieve the status of an extract job.

    Args:
        job_id (str): The ID of the extract job.

    Returns:
        Dict[str, Any]: The decoded JSON body of the status response.

    Raises:
        ValueError: If job_id is empty, or if there is an error retrieving the status.
    """
    if not job_id:
        # An empty ID would build '<api_url>/v1/extract/', which addresses no job;
        # fail before any request is sent.
        raise ValueError('Job ID is required to get the extract status.')

    headers = self._prepare_headers()
    try:
        response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
        if response.status_code == 200:
            return response.json()
        # NOTE(review): _handle_error is defined outside this block; it presumably
        # raises for non-200 responses. If it returns, this method returns None.
        self._handle_error(response, "get extract status")
    except Exception as e:
        # Every failure is surfaced as ValueError(message, 500); chain the cause
        # so the original traceback is kept.
        raise ValueError(str(e), 500) from e
|
|
609
|
+
|
|
610
|
+
def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Initiate an asynchronous extract job.

    Args:
        urls (List[str]): The URLs to extract data from.
        params (Optional[Dict[str, Any]]): Additional parameters for the extract request.
            A 'schema' entry may be a JSON schema dict or an object exposing
            model_json_schema() (e.g. a Pydantic model).
        idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.

    Returns:
        Dict[str, Any]: The decoded JSON body of the response from the extract operation.

    Raises:
        ValueError: If there is an error initiating the extract job.
    """
    headers = self._prepare_headers(idempotency_key)
    options = params or {}

    schema = options.get('schema')
    if schema and hasattr(schema, 'model_json_schema'):
        # Convert Pydantic model to JSON schema
        schema = schema.model_json_schema()
    # Otherwise assume it's already a JSON schema dict

    request_data = {
        'urls': urls,
        **options,
        'allowExternalLinks': options.get('allow_external_links', False),
        'schema': schema,
        # Same origin tag the synchronous extract request sends to this endpoint
        'origin': 'api-sdk',
    }

    try:
        response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
        if response.status_code == 200:
            return response.json()
        # NOTE(review): _handle_error is defined outside this block; it presumably
        # raises for non-200 responses. If it returns, this method returns None.
        self._handle_error(response, "async extract")
    except Exception as e:
        # Every failure is surfaced as ValueError(message, 500); chain the cause
        # so the original traceback is kept.
        raise ValueError(str(e), 500) from e
|
|
542
649
|
|
|
543
650
|
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
|
|
544
651
|
"""
|
|
@@ -669,6 +776,8 @@ class FirecrawlApp:
|
|
|
669
776
|
if 'data' in status_data:
|
|
670
777
|
data = status_data['data']
|
|
671
778
|
while 'next' in status_data:
|
|
779
|
+
if len(status_data['data']) == 0:
|
|
780
|
+
break
|
|
672
781
|
status_response = self._get_request(status_data['next'], headers)
|
|
673
782
|
status_data = status_response.json()
|
|
674
783
|
data.extend(status_data.get('data', []))
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|