firecrawl 2.4.3__py3-none-any.whl → 2.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl might be problematic.
- build/lib/build/lib/build/lib/firecrawl/__init__.py +79 -0
- build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +170 -0
- build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +440 -0
- build/lib/build/lib/build/lib/firecrawl/firecrawl.py +4439 -0
- build/lib/build/lib/build/lib/tests/test_change_tracking.py +98 -0
- build/lib/build/lib/firecrawl/__init__.py +79 -0
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +170 -0
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +440 -0
- build/lib/build/lib/firecrawl/firecrawl.py +4439 -0
- build/lib/build/lib/tests/test_change_tracking.py +98 -0
- build/lib/firecrawl/__init__.py +79 -0
- build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/e2e_withAuth/test.py +170 -0
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +440 -0
- build/lib/firecrawl/firecrawl.py +4439 -0
- build/lib/tests/test_change_tracking.py +98 -0
- firecrawl/__init__.py +2 -2
- firecrawl/firecrawl.py +132 -65
- {firecrawl-2.4.3.dist-info → firecrawl-2.5.1.dist-info}/METADATA +1 -1
- firecrawl-2.5.1.dist-info/RECORD +33 -0
- {firecrawl-2.4.3.dist-info → firecrawl-2.5.1.dist-info}/top_level.txt +2 -0
- firecrawl-2.4.3.dist-info/RECORD +0 -12
- {firecrawl-2.4.3.dist-info → firecrawl-2.5.1.dist-info}/LICENSE +0 -0
- {firecrawl-2.4.3.dist-info → firecrawl-2.5.1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,98 @@
+import unittest
+from unittest.mock import patch, MagicMock
+import json
+import os
+from firecrawl import FirecrawlApp
+
+class TestChangeTracking(unittest.TestCase):
+    @patch('requests.post')
+    def test_change_tracking_format(self, mock_post):
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {
+            'success': True,
+            'data': {
+                'markdown': 'Test markdown content',
+                'changeTracking': {
+                    'previousScrapeAt': '2023-01-01T00:00:00Z',
+                    'changeStatus': 'changed',
+                    'visibility': 'visible'
+                }
+            }
+        }
+        mock_post.return_value = mock_response
+
+        app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))
+        result = app.scrape_url('https://example.com', {
+            'formats': ['markdown', 'changeTracking']
+        })
+
+        args, kwargs = mock_post.call_args
+        self.assertEqual(kwargs['json']['formats'], ['markdown', 'changeTracking'])
+
+        self.assertEqual(result['changeTracking']['previousScrapeAt'], '2023-01-01T00:00:00Z')
+        self.assertEqual(result['changeTracking']['changeStatus'], 'changed')
+        self.assertEqual(result['changeTracking']['visibility'], 'visible')
+
+    @patch('requests.post')
+    def test_change_tracking_options(self, mock_post):
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {
+            'success': True,
+            'data': {
+                'markdown': 'Test markdown content',
+                'changeTracking': {
+                    'previousScrapeAt': '2023-01-01T00:00:00Z',
+                    'changeStatus': 'changed',
+                    'visibility': 'visible',
+                    'diff': {
+                        'text': '@@ -1,1 +1,1 @@\n-old content\n+new content',
+                        'json': {
+                            'files': [{
+                                'from': None,
+                                'to': None,
+                                'chunks': [{
+                                    'content': '@@ -1,1 +1,1 @@',
+                                    'changes': [{
+                                        'type': 'del',
+                                        'content': '-old content',
+                                        'del': True,
+                                        'ln': 1
+                                    }, {
+                                        'type': 'add',
+                                        'content': '+new content',
+                                        'add': True,
+                                        'ln': 1
+                                    }]
+                                }]
+                            }]
+                        }
+                    },
+                    'json': {
+                        'title': {
+                            'previous': 'Old Title',
+                            'current': 'New Title'
+                        }
+                    }
+                }
+            }
+        }
+        mock_post.return_value = mock_response
+
+        app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))
+        result = app.scrape_url('https://example.com', {
+            'formats': ['markdown', 'changeTracking'],
+            'changeTrackingOptions': {
+                'modes': ['git-diff', 'json'],
+                'schema': {'type': 'object', 'properties': {'title': {'type': 'string'}}}
+            }
+        })
+
+        args, kwargs = mock_post.call_args
+        self.assertEqual(kwargs['json']['formats'], ['markdown', 'changeTracking'])
+        self.assertEqual(kwargs['json']['changeTrackingOptions']['modes'], ['git-diff', 'json'])
+
+        self.assertEqual(result['changeTracking']['diff']['text'], '@@ -1,1 +1,1 @@\n-old content\n+new content')
+        self.assertEqual(result['changeTracking']['json']['title']['previous'], 'Old Title')
+        self.assertEqual(result['changeTracking']['json']['title']['current'], 'New Title')
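The added test file mocks `requests.post`, so it never touches the network: it checks that the SDK forwards `formats` and `changeTrackingOptions` in the request body and surfaces the `changeTracking` payload from the response. For orientation, a minimal sketch of the same call against the live API, mirroring the dict-style parameters and dict-style result access the test uses with its mock (`FIRECRAWL_API_KEY` is an illustrative env var name for this sketch, not one the SDK reads itself):

```python
# Sketch: the live equivalent of the mocked call in the test above.
import os
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])  # assumed env var
result = app.scrape_url('https://example.com', {
    'formats': ['markdown', 'changeTracking']
})
# On a repeat scrape of the same URL, changeTracking reports what moved
# (field names as asserted in the test above):
print(result['changeTracking']['changeStatus'])      # e.g. 'changed'
print(result['changeTracking']['previousScrapeAt'])  # ISO timestamp
```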
firecrawl/__init__.py CHANGED
@@ -11,9 +11,9 @@ For more information visit https://github.com/firecrawl/
 import logging
 import os
 
-from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions # noqa
+from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
 
-__version__ = "2.4.3"
+__version__ = "2.5.1"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
firecrawl/firecrawl.py CHANGED
@@ -29,7 +29,7 @@ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDoc
 warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
 warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
 warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
-
+warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
 
 def get_version():
     try:
@@ -135,6 +135,12 @@ class WebhookConfig(pydantic.BaseModel):
     metadata: Optional[Dict[str, str]] = None
     events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
 
+class ChangeTrackingOptions(pydantic.BaseModel):
+    """Configuration for change tracking."""
+    modes: Optional[List[Literal["git-diff", "json"]]] = None
+    schema: Optional[Any] = None
+    prompt: Optional[str] = None
+
 class ScrapeOptions(pydantic.BaseModel):
     """Parameters for scraping operations."""
     formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
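The new model mirrors the `changeTrackingOptions` object of the scrape API; the warning filter added in the first hunk exists because its `schema` field shadows `BaseModel.schema`. A small sketch of constructing it, using only the fields defined above (`ChangeTrackingOptions` is importable from the package root per the `__init__.py` change):

```python
# Sketch: constructing the options model added in this release.
from firecrawl import ChangeTrackingOptions

opts = ChangeTrackingOptions(
    modes=["git-diff", "json"],  # which change outputs to compute
    schema={"type": "object",
            "properties": {"title": {"type": "string"}}},  # drives the 'json' mode
    prompt="Compare the page title between scrapes",  # optional guidance
)
# Serialized the same way the SDK does before sending:
print(opts.dict(exclude_none=True))
```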
@@ -150,6 +156,7 @@ class ScrapeOptions(pydantic.BaseModel):
     removeBase64Images: Optional[bool] = None
     blockAds: Optional[bool] = None
     proxy: Optional[Literal["basic", "stealth"]] = None
+    changeTrackingOptions: Optional[ChangeTrackingOptions] = None
 
 class WaitAction(pydantic.BaseModel):
     """Wait action to perform during scraping."""
@@ -454,6 +461,7 @@ class FirecrawlApp:
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        change_tracking_options: Optional[ChangeTrackingOptions] = None,
         **kwargs) -> ScrapeResponse[Any]:
         """
         Scrape and extract content from a URL.
@@ -475,6 +483,7 @@ class FirecrawlApp:
             extract (Optional[JsonConfig]): Content extraction settings
             json_options (Optional[JsonConfig]): JSON extraction settings
             actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
+            change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
 
 
         Returns:
@@ -520,18 +529,28 @@ class FirecrawlApp:
             scrape_params['blockAds'] = block_ads
         if proxy:
             scrape_params['proxy'] = proxy
-        if extract:
-
-
-
-
-
-
-
+        if extract is not None:
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
+        if json_options is not None:
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
         if actions:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
+        if change_tracking_options:
+            scrape_params['changeTrackingOptions'] = change_tracking_options.dict(exclude_none=True)
+
         scrape_params.update(kwargs)
 
+        if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
+            scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
+        if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
+            scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
+
         # Make request
         response = requests.post(
             f'{self.api_url}/v1/scrape',
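As the hunk above shows, `scrape_url` now takes the model through a dedicated keyword and serializes it with `dict(exclude_none=True)` into the `changeTrackingOptions` key of the POST body. A hedged usage sketch (the API key is a placeholder; `formats` is among the parameters `scrape_url` accepts per the `method_params` table later in this diff):

```python
# Sketch: the new change_tracking_options keyword on scrape_url.
from firecrawl import FirecrawlApp, ChangeTrackingOptions

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
doc = app.scrape_url(
    "https://example.com",
    formats=["markdown", "changeTracking"],  # changeTracking must be requested
    change_tracking_options=ChangeTrackingOptions(modes=["git-diff"]),
)
```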
@@ -1240,13 +1259,15 @@
         if proxy is not None:
             scrape_params['proxy'] = proxy
         if extract is not None:
-
-
-
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
         if json_options is not None:
-
-
-
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
         if actions is not None:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
@@ -1261,6 +1282,11 @@
         params_dict['urls'] = urls
         params_dict['origin'] = f"python-sdk@{version}"
 
+        if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
+            params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
+        if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
+            params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
+
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
@@ -1366,13 +1392,15 @@
         if proxy is not None:
             scrape_params['proxy'] = proxy
         if extract is not None:
-
-
-
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
         if json_options is not None:
-
-
-
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
         if actions is not None:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
@@ -1387,6 +1415,11 @@
         params_dict['urls'] = urls
         params_dict['origin'] = f"python-sdk@{version}"
 
+        if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
+            params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
+        if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
+            params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
+
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
@@ -1487,13 +1520,15 @@
         if proxy is not None:
             scrape_params['proxy'] = proxy
         if extract is not None:
-
-
-
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
         if json_options is not None:
-
-
-
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
         if actions is not None:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
@@ -1508,6 +1543,11 @@
         params_dict['urls'] = urls
         params_dict['origin'] = f"python-sdk@{version}"
 
+        if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
+            params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
+        if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
+            params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
+
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
@@ -1594,7 +1634,7 @@
             id (str): The ID of the crawl job.
 
         Returns:
-            CrawlErrorsResponse
+            CrawlErrorsResponse containing:
             * errors (List[Dict[str, str]]): List of errors with fields:
                 * id (str): Error ID
                 * timestamp (str): When the error occurred
@@ -1657,10 +1697,7 @@
             raise ValueError("Either urls or prompt is required")
 
         if schema:
-
-            # Convert Pydantic model to JSON schema
-            schema = schema.model_json_schema()
-            # Otherwise assume it's already a JSON schema dict
+            schema = self._ensure_schema_dict(schema)
 
         request_data = {
             'urls': urls or [],
@@ -1789,10 +1826,7 @@
 
         schema = schema
         if schema:
-
-            # Convert Pydantic model to JSON schema
-            schema = schema.model_json_schema()
-            # Otherwise assume it's already a JSON schema dict
+            schema = self._ensure_schema_dict(schema)
 
         request_data = {
             'urls': urls,
@@ -2424,7 +2458,7 @@
         method_params = {
             "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
                            "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
-                           "block_ads", "proxy", "extract", "json_options", "actions"},
+                           "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options"},
             "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options"},
             "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
                           "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
@@ -2455,6 +2489,24 @@
         # Additional type validation can be added here if needed
         # For now, we rely on Pydantic models for detailed type validation
 
+    def _ensure_schema_dict(self, schema):
+        """
+        Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
+        """
+        if schema is None:
+            return schema
+        if isinstance(schema, type):
+            # Pydantic v1/v2 model class
+            if hasattr(schema, 'model_json_schema'):
+                return schema.model_json_schema()
+            elif hasattr(schema, 'schema'):
+                return schema.schema()
+        if isinstance(schema, dict):
+            return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
+        if isinstance(schema, (list, tuple)):
+            return [self._ensure_schema_dict(v) for v in schema]
+        return schema
+
 class CrawlWatcher:
     """
     A class to watch and handle crawl job events via WebSocket connection.
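The new `_ensure_schema_dict` helper is what lets every `schema` argument in the SDK be either a plain JSON-schema dict or a Pydantic model class: classes are converted via `model_json_schema()` (Pydantic v2) or `schema()` (v1), and dicts and lists are walked recursively so nested model classes are converted too. A sketch of the conversion it performs, with a hypothetical user-defined model:

```python
# Sketch: what _ensure_schema_dict does with a Pydantic v2 model class.
import pydantic

class PageTitle(pydantic.BaseModel):  # hypothetical schema for extraction
    title: str

# Passing the class itself (not an instance) anywhere a schema is accepted
# now works; the helper returns PageTitle.model_json_schema(), a plain dict:
# {'properties': {'title': {'title': 'Title', 'type': 'string'}},
#  'required': ['title'], 'title': 'PageTitle', 'type': 'object'}
print(PageTitle.model_json_schema())
```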
@@ -2861,19 +2913,24 @@ class AsyncFirecrawlApp(FirecrawlApp):
             scrape_params['blockAds'] = block_ads
         if proxy:
             scrape_params['proxy'] = proxy
-        if extract:
-
-            if
-
-            scrape_params['extract'] =
-        if json_options:
-
-            if
-
-            scrape_params['jsonOptions'] =
+        if extract is not None:
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
+        if json_options is not None:
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
         if actions:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
 
+        if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
+            scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
+        if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
+            scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
+
         # Make async request
         endpoint = f'/v1/scrape'
         response = await self._async_post_request(
@@ -2984,13 +3041,15 @@
         if proxy is not None:
             scrape_params['proxy'] = proxy
         if extract is not None:
-
-
-
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
         if json_options is not None:
-
-
-
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
         if actions is not None:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
@@ -3005,6 +3064,11 @@
         params_dict['urls'] = urls
         params_dict['origin'] = f"python-sdk@{version}"
 
+        if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
+            params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
+        if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
+            params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
+
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = await self._async_post_request(
@@ -3115,13 +3179,15 @@
         if proxy is not None:
             scrape_params['proxy'] = proxy
         if extract is not None:
-
-
-
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
         if json_options is not None:
-
-
-
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
         if actions is not None:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
@@ -3136,6 +3202,11 @@
         params_dict['urls'] = urls
         params_dict['origin'] = f"python-sdk@{version}"
 
+        if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
+            params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
+        if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
+            params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
+
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = await self._async_post_request(
@@ -3593,10 +3664,7 @@
             raise ValueError("Either urls or prompt is required")
 
         if schema:
-
-            # Convert Pydantic model to JSON schema
-            schema = schema.model_json_schema()
-            # Otherwise assume it's already a JSON schema dict
+            schema = self._ensure_schema_dict(schema)
 
         request_data = {
             'urls': urls or [],
@@ -3850,8 +3918,7 @@
             raise ValueError("Either urls or prompt is required")
 
         if schema:
-
-            schema = schema.model_json_schema()
+            schema = self._ensure_schema_dict(schema)
 
         request_data = ExtractResponse(
             urls=urls or [],
firecrawl-2.5.1.dist-info/RECORD ADDED

@@ -0,0 +1,33 @@
+build/lib/build/lib/build/lib/firecrawl/__init__.py,sha256=J1HgnaGyIrbk_5clFRTAb3XWfy9m3at1i8RFYzn5O0Q,2593
+build/lib/build/lib/build/lib/firecrawl/firecrawl.py,sha256=RyUiKke08spOP6iSUgJ9_dz6l-D_dkGB4aA6UDPWiXI,188709
+build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
+build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
+build/lib/build/lib/build/lib/tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
+build/lib/build/lib/firecrawl/__init__.py,sha256=J1HgnaGyIrbk_5clFRTAb3XWfy9m3at1i8RFYzn5O0Q,2593
+build/lib/build/lib/firecrawl/firecrawl.py,sha256=RyUiKke08spOP6iSUgJ9_dz6l-D_dkGB4aA6UDPWiXI,188709
+build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
+build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
+build/lib/build/lib/tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
+build/lib/firecrawl/__init__.py,sha256=J1HgnaGyIrbk_5clFRTAb3XWfy9m3at1i8RFYzn5O0Q,2593
+build/lib/firecrawl/firecrawl.py,sha256=RyUiKke08spOP6iSUgJ9_dz6l-D_dkGB4aA6UDPWiXI,188709
+build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+build/lib/firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
+build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
+build/lib/tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
+firecrawl/__init__.py,sha256=J1HgnaGyIrbk_5clFRTAb3XWfy9m3at1i8RFYzn5O0Q,2593
+firecrawl/firecrawl.py,sha256=RyUiKke08spOP6iSUgJ9_dz6l-D_dkGB4aA6UDPWiXI,188709
+firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
+firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
+tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
+firecrawl-2.5.1.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl-2.5.1.dist-info/METADATA,sha256=jBHAE4mNK7Yq2NA2pPSN5_Rg_aVFztHCO1DLN5PaaQ0,7165
+firecrawl-2.5.1.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl-2.5.1.dist-info/top_level.txt,sha256=ytN_R30g2U2qZYFyIm710Z8QeK9FO1Uwa-WPGHXyqjE,27
+firecrawl-2.5.1.dist-info/RECORD,,
firecrawl-2.4.3.dist-info/RECORD DELETED
@@ -1,12 +0,0 @@
-firecrawl/__init__.py,sha256=VW7ON6xBoqPVlchUkJMjD92NZ_tNC-XsqG-M2sIvbc8,2570
-firecrawl/firecrawl.py,sha256=Q1opxN1JxjbWLEDsSS3P5aEm4f9LEJrZyhd8UdsMVFw,182769
-firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
-firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
-tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
-firecrawl-2.4.3.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
-firecrawl-2.4.3.dist-info/METADATA,sha256=XFy99h7X3oruTzlbEWMx3uvxfk-JiQ5FZxZolp5bzSw,7165
-firecrawl-2.4.3.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
-firecrawl-2.4.3.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
-firecrawl-2.4.3.dist-info/RECORD,,

{firecrawl-2.4.3.dist-info → firecrawl-2.5.1.dist-info}/LICENSE: file without changes
{firecrawl-2.4.3.dist-info → firecrawl-2.5.1.dist-info}/WHEEL: file without changes