firecrawl 3.0.3__tar.gz → 3.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl might be problematic.

Files changed (85)
  1. {firecrawl-3.0.3 → firecrawl-3.1.0}/PKG-INFO +1 -1
  2. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__init__.py +1 -1
  3. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -1
  4. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/client.py +1 -0
  5. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/crawl.py +2 -5
  6. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/scrape.py +2 -5
  7. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/search.py +2 -5
  8. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/batch.py +2 -5
  9. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/crawl.py +2 -1
  10. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/scrape.py +2 -6
  11. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/search.py +2 -6
  12. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/types.py +68 -2
  13. firecrawl-3.1.0/firecrawl/v2/utils/normalize.py +107 -0
  14. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/watcher.py +4 -15
  15. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/watcher_async.py +2 -5
  16. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl.egg-info/PKG-INFO +1 -1
  17. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl.egg-info/SOURCES.txt +1 -0
  18. {firecrawl-3.0.3 → firecrawl-3.1.0}/LICENSE +0 -0
  19. {firecrawl-3.0.3 → firecrawl-3.1.0}/README.md +0 -0
  20. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -0
  21. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -0
  22. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -0
  23. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -0
  24. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -0
  25. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -0
  26. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -0
  27. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/conftest.py +0 -0
  28. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_async.py +0 -0
  29. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +0 -0
  30. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_crawl.py +0 -0
  31. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_extract.py +0 -0
  32. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_map.py +0 -0
  33. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -0
  34. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_search.py +0 -0
  35. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_usage.py +0 -0
  36. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -0
  37. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -0
  38. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -0
  39. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -0
  40. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -0
  41. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -0
  42. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -0
  43. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -0
  44. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -0
  45. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -0
  46. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -0
  47. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -0
  48. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -0
  49. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -0
  50. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -0
  51. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -0
  52. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -0
  53. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -0
  54. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -0
  55. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -0
  56. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -0
  57. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/firecrawl.backup.py +0 -0
  58. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/types.py +0 -0
  59. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v1/__init__.py +0 -0
  60. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v1/client.py +0 -0
  61. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/__init__.py +0 -0
  62. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/client.py +0 -0
  63. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/client_async.py +0 -0
  64. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/__init__.py +0 -0
  65. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/batch.py +0 -0
  66. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/extract.py +0 -0
  67. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/map.py +0 -0
  68. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/usage.py +0 -0
  69. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/extract.py +0 -0
  70. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/map.py +0 -0
  71. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/usage.py +0 -0
  72. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/utils/__init__.py +0 -0
  73. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/utils/error_handler.py +0 -0
  74. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/utils/get_version.py +0 -0
  75. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/utils/http_client.py +0 -0
  76. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/utils/http_client_async.py +0 -0
  77. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/utils/validation.py +0 -0
  78. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl.egg-info/dependency_links.txt +0 -0
  79. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl.egg-info/requires.txt +0 -0
  80. {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl.egg-info/top_level.txt +0 -0
  81. {firecrawl-3.0.3 → firecrawl-3.1.0}/pyproject.toml +0 -0
  82. {firecrawl-3.0.3 → firecrawl-3.1.0}/setup.cfg +0 -0
  83. {firecrawl-3.0.3 → firecrawl-3.1.0}/setup.py +0 -0
  84. {firecrawl-3.0.3 → firecrawl-3.1.0}/tests/test_change_tracking.py +0 -0
  85. {firecrawl-3.0.3 → firecrawl-3.1.0}/tests/test_timeout_conversion.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: firecrawl
-Version: 3.0.3
+Version: 3.1.0
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/firecrawl/firecrawl
 Author: Mendable.ai

firecrawl/__init__.py
@@ -17,7 +17,7 @@ from .v1 import (
     V1ChangeTrackingOptions,
 )
 
-__version__ = "3.0.3"
+__version__ = "3.1.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py
@@ -96,7 +96,6 @@ async def test_async_get_crawl_status_shape():
     assert status.status in ("scraping", "completed", "failed")
     assert status.completed >= 0
     assert status.expires_at is not None
-    assert status.next is not None
     assert isinstance(status.data, list)
 
 

firecrawl/client.py
@@ -25,6 +25,7 @@ import logging
 from .v1 import V1FirecrawlApp, AsyncV1FirecrawlApp
 from .v2 import FirecrawlClient as V2FirecrawlClient
 from .v2.client_async import AsyncFirecrawlClient
+from .v2.types import Document
 
 logger = logging.getLogger("firecrawl")
 
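With Document re-exported in firecrawl/client.py, callers can import the v2 document model next to the client classes for type annotations. A minimal sketch (the collect_links helper is illustrative, not part of the SDK, and relies only on Document fields implied elsewhere in this diff):

from firecrawl.client import Document  # re-export added in this release

def collect_links(doc: Document) -> list:
    # Document carries scraped fields such as links, raw_html and metadata
    return list(doc.links or [])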

firecrawl/v2/methods/aio/crawl.py
@@ -14,6 +14,7 @@ from ...types import (
 from ...utils.error_handler import handle_response_error
 from ...utils.validation import prepare_scrape_options
 from ...utils.http_client_async import AsyncHttpClient
+from ...utils.normalize import normalize_document_input
 
 
 def _prepare_crawl_request(request: CrawlRequest) -> dict:
@@ -76,11 +77,7 @@ async def get_crawl_status(client: AsyncHttpClient, job_id: str) -> CrawlJob:
     documents = []
     for doc_data in body.get("data", []):
         if isinstance(doc_data, dict):
-            normalized = dict(doc_data)
-            if 'rawHtml' in normalized and 'raw_html' not in normalized:
-                normalized['raw_html'] = normalized.pop('rawHtml')
-            if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-                normalized['change_tracking'] = normalized.pop('changeTracking')
+            normalized = normalize_document_input(doc_data)
             documents.append(Document(**normalized))
     return CrawlJob(
         status=body.get("status"),

firecrawl/v2/methods/aio/scrape.py
@@ -1,5 +1,6 @@
 from typing import Optional, Dict, Any
 from ...types import ScrapeOptions, Document
+from ...utils.normalize import normalize_document_input
 from ...utils.error_handler import handle_response_error
 from ...utils.validation import prepare_scrape_options, validate_scrape_options
 from ...utils.http_client_async import AsyncHttpClient
@@ -27,10 +28,6 @@ async def scrape(client: AsyncHttpClient, url: str, options: Optional[ScrapeOpti
     if not body.get("success"):
         raise Exception(body.get("error", "Unknown error occurred"))
     document_data = body.get("data", {})
-    normalized = dict(document_data)
-    if 'rawHtml' in normalized and 'raw_html' not in normalized:
-        normalized['raw_html'] = normalized.pop('rawHtml')
-    if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-        normalized['change_tracking'] = normalized.pop('changeTracking')
+    normalized = normalize_document_input(document_data)
     return Document(**normalized)
 

firecrawl/v2/methods/aio/search.py
@@ -1,5 +1,6 @@
 from typing import Dict, Any
 from ...types import SearchRequest, SearchData, SearchResult, Document
+from ...utils.normalize import normalize_document_input
 from ...utils.http_client_async import AsyncHttpClient
 from ...utils.error_handler import handle_response_error
 from ...utils.validation import prepare_scrape_options, validate_scrape_options
@@ -38,11 +39,7 @@ async def search(client: AsyncHttpClient, request: SearchRequest) -> SearchData:
         if request.scrape_options is not None and any(
             key in doc_data for key in ['markdown', 'html', 'rawHtml', 'links', 'summary', 'screenshot', 'changeTracking']
         ):
-            normalized = dict(doc_data)
-            if 'rawHtml' in normalized and 'raw_html' not in normalized:
-                normalized['raw_html'] = normalized.pop('rawHtml')
-            if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-                normalized['change_tracking'] = normalized.pop('changeTracking')
+            normalized = normalize_document_input(doc_data)
             results.append(Document(**normalized))
         else:
             results.append(SearchResult(

firecrawl/v2/methods/batch.py
@@ -13,6 +13,7 @@ from ..types import (
     WebhookConfig,
 )
 from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
+from ..utils.normalize import normalize_document_input
 from ..types import CrawlErrorsResponse
 
 
@@ -107,11 +108,7 @@ def get_batch_scrape_status(
     documents: List[Document] = []
     for doc in body.get("data", []) or []:
         if isinstance(doc, dict):
-            normalized = dict(doc)
-            if 'rawHtml' in normalized and 'raw_html' not in normalized:
-                normalized['raw_html'] = normalized.pop('rawHtml')
-            if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-                normalized['change_tracking'] = normalized.pop('changeTracking')
+            normalized = normalize_document_input(doc)
             documents.append(Document(**normalized))
 
     return BatchScrapeJob(

firecrawl/v2/methods/crawl.py
@@ -11,6 +11,7 @@ from ..types import (
     WebhookConfig, CrawlErrorsResponse, ActiveCrawlsResponse, ActiveCrawl
 )
 from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
+from ..utils.normalize import normalize_document_input
 
 
 def _validate_crawl_request(request: CrawlRequest) -> None:
@@ -173,7 +174,7 @@ def get_crawl_status(client: HttpClient, job_id: str) -> CrawlJob:
                 # but we'll handle it gracefully
                 continue
             else:
-                documents.append(Document(**doc_data))
+                documents.append(Document(**normalize_document_input(doc_data)))
 
     # Create CrawlJob with current status and data
     return CrawlJob(

firecrawl/v2/methods/scrape.py
@@ -4,6 +4,7 @@ Scraping functionality for Firecrawl v2 API.
 
 from typing import Optional, Dict, Any
 from ..types import ScrapeOptions, Document
+from ..utils.normalize import normalize_document_input
 from ..utils import HttpClient, handle_response_error, prepare_scrape_options, validate_scrape_options
 
 
@@ -59,10 +60,5 @@ def scrape(client: HttpClient, url: str, options: Optional[ScrapeOptions] = None
         raise Exception(body.get("error", "Unknown error occurred"))
 
     document_data = body.get("data", {})
-    # Normalize keys for Document (no Pydantic aliases)
-    normalized = dict(document_data)
-    if 'rawHtml' in normalized and 'raw_html' not in normalized:
-        normalized['raw_html'] = normalized.pop('rawHtml')
-    if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-        normalized['change_tracking'] = normalized.pop('changeTracking')
+    normalized = normalize_document_input(document_data)
     return Document(**normalized)

firecrawl/v2/methods/search.py
@@ -4,6 +4,7 @@ Search functionality for Firecrawl v2 API.
 
 from typing import Optional, Dict, Any, Union
 from ..types import SearchRequest, SearchData, SearchResult, Document
+from ..utils.normalize import normalize_document_input
 from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
 
 
@@ -50,12 +51,7 @@ def search(
         if request.scrape_options is not None and any(
             key in doc_data for key in ['markdown', 'html', 'rawHtml', 'links', 'summary', 'screenshot', 'changeTracking']
         ):
-            # Normalize keys for Document (no Pydantic aliases)
-            normalized = dict(doc_data)
-            if 'rawHtml' in normalized and 'raw_html' not in normalized:
-                normalized['raw_html'] = normalized.pop('rawHtml')
-            if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-                normalized['change_tracking'] = normalized.pop('changeTracking')
+            normalized = normalize_document_input(doc_data)
             results.append(Document(**normalized))
         else:
             # Minimal search result shape
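Because search entries become full Document objects only when scrape options were supplied (and plain SearchResult objects otherwise), consumers of a mixed result list can branch on the type. A hedged sketch; the results list below stands in for whatever container SearchData exposes, which this diff does not show, and the SearchResult attribute access is guarded because its fields are not shown either:

from firecrawl.v2.types import Document, SearchResult

def summarize(results: list) -> list:
    out = []
    for item in results:
        if isinstance(item, Document):
            out.append(item.markdown)              # scraped page content
        else:                                      # plain SearchResult hit
            out.append(getattr(item, "url", None))
    return out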

firecrawl/v2/types.py
@@ -7,7 +7,8 @@ This module contains clean, modern type definitions for the v2 API.
 import warnings
 from datetime import datetime
 from typing import Any, Dict, Generic, List, Literal, Optional, TypeVar, Union
-from pydantic import BaseModel, Field, field_validator
+import logging
+from pydantic import BaseModel, Field, field_validator, ValidationError
 
 # Suppress pydantic warnings about schema field shadowing
 # Tested using schema_field alias="schema" but it doesn't work.
@@ -19,6 +20,9 @@ warnings.filterwarnings("ignore", message="Field name \"json\" in \"Document\" s
 
 T = TypeVar('T')
 
+# Module logger
+logger = logging.getLogger("firecrawl")
+
 # Base response types
 class BaseResponse(BaseModel, Generic[T]):
     """Base response structure for all API responses."""
@@ -29,18 +33,57 @@ class BaseResponse(BaseModel, Generic[T]):
 
 # Document and content types
 class DocumentMetadata(BaseModel):
-    """Metadata for scraped documents."""
+    """Metadata for scraped documents (snake_case only; API camelCase normalized in code)."""
+    # Common metadata fields
     title: Optional[str] = None
     description: Optional[str] = None
+    url: Optional[str] = None
     language: Optional[str] = None
     keywords: Optional[Union[str, List[str]]] = None
     robots: Optional[str] = None
+
+    # OpenGraph and social metadata
     og_title: Optional[str] = None
     og_description: Optional[str] = None
     og_url: Optional[str] = None
     og_image: Optional[str] = None
+    og_audio: Optional[str] = None
+    og_determiner: Optional[str] = None
+    og_locale: Optional[str] = None
+    og_locale_alternate: Optional[List[str]] = None
+    og_site_name: Optional[str] = None
+    og_video: Optional[str] = None
+
+    # Dublin Core and other site metadata
+    favicon: Optional[str] = None
+    dc_terms_created: Optional[str] = None
+    dc_date_created: Optional[str] = None
+    dc_date: Optional[str] = None
+    dc_terms_type: Optional[str] = None
+    dc_type: Optional[str] = None
+    dc_terms_audience: Optional[str] = None
+    dc_terms_subject: Optional[str] = None
+    dc_subject: Optional[str] = None
+    dc_description: Optional[str] = None
+    dc_terms_keywords: Optional[str] = None
+
+    modified_time: Optional[str] = None
+    published_time: Optional[str] = None
+    article_tag: Optional[str] = None
+    article_section: Optional[str] = None
+
+    # Response-level metadata
     source_url: Optional[str] = None
    status_code: Optional[int] = None
+    scrape_id: Optional[str] = None
+    num_pages: Optional[int] = None
+    content_type: Optional[str] = None
+    proxy_used: Optional[Literal["basic", "stealth"]] = None
+    cache_state: Optional[Literal["hit", "miss"]] = None
+    cached_at: Optional[str] = None
+    credits_used: Optional[int] = None
+
+    # Error information
     error: Optional[str] = None
 
     @staticmethod
@@ -85,6 +128,29 @@ class Document(BaseModel):
     warning: Optional[str] = None
     change_tracking: Optional[Dict[str, Any]] = None
 
+    @property
+    def metadata_typed(self) -> DocumentMetadata:
+        """Always returns a DocumentMetadata instance for LSP-friendly access."""
+        md = self.metadata
+        if isinstance(md, DocumentMetadata):
+            return md
+        if isinstance(md, dict):
+            try:
+                return DocumentMetadata(**md)
+            except (ValidationError, TypeError) as exc:
+                logger.debug("Failed to construct DocumentMetadata from dict: %s", exc)
+        return DocumentMetadata()
+
+    @property
+    def metadata_dict(self) -> Dict[str, Any]:
+        """Returns metadata as a plain dict (exclude None)."""
+        md = self.metadata
+        if isinstance(md, DocumentMetadata):
+            return md.model_dump(exclude_none=True)
+        if isinstance(md, dict):
+            return {k: v for k, v in md.items() if v is not None}
+        return {}
+
 # Webhook types
 class WebhookConfig(BaseModel):
     """Configuration for webhooks."""

firecrawl/v2/utils/normalize.py (new file)
@@ -0,0 +1,107 @@
+"""
+Normalization helpers for v2 API payloads to avoid relying on Pydantic aliases.
+"""
+
+from typing import Any, Dict, List
+from ..types import DocumentMetadata
+
+
+def _map_metadata_keys(md: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Convert API v2 camelCase metadata keys to snake_case expected by DocumentMetadata.
+    Leaves unknown keys as-is.
+    """
+    mapping = {
+        # OpenGraph
+        "ogTitle": "og_title",
+        "ogDescription": "og_description",
+        "ogUrl": "og_url",
+        "ogImage": "og_image",
+        "ogAudio": "og_audio",
+        "ogDeterminer": "og_determiner",
+        "ogLocale": "og_locale",
+        "ogLocaleAlternate": "og_locale_alternate",
+        "ogSiteName": "og_site_name",
+        "ogVideo": "og_video",
+        # Dublin Core and misc
+        "dcTermsCreated": "dc_terms_created",
+        "dcDateCreated": "dc_date_created",
+        "dcDate": "dc_date",
+        "dcTermsType": "dc_terms_type",
+        "dcType": "dc_type",
+        "dcTermsAudience": "dc_terms_audience",
+        "dcTermsSubject": "dc_terms_subject",
+        "dcSubject": "dc_subject",
+        "dcDescription": "dc_description",
+        "dcTermsKeywords": "dc_terms_keywords",
+        "modifiedTime": "modified_time",
+        "publishedTime": "published_time",
+        "articleTag": "article_tag",
+        "articleSection": "article_section",
+        # Response-level
+        "sourceURL": "source_url",
+        "statusCode": "status_code",
+        "scrapeId": "scrape_id",
+        "numPages": "num_pages",
+        "contentType": "content_type",
+        "proxyUsed": "proxy_used",
+        "cacheState": "cache_state",
+        "cachedAt": "cached_at",
+        "creditsUsed": "credits_used",
+    }
+
+    out: Dict[str, Any] = {}
+    for k, v in md.items():
+        snake = mapping.get(k, k)
+        out[snake] = v
+
+    # Light coercions where server may send strings/lists
+    if isinstance(out.get("status_code"), str):
+        try:
+            out["status_code"] = int(out["status_code"])  # type: ignore
+        except ValueError:
+            pass
+
+    # Generic rule: if a value is a list, join with ", " for string-like fields,
+    # except for explicit fields we preserve as lists.
+    preserve_list_fields: List[str] = [
+        "og_locale_alternate",
+    ]
+    for f, val in list(out.items()):
+        if isinstance(val, list) and f not in preserve_list_fields:
+            try:
+                out[f] = ", ".join(str(x) for x in val)
+            except Exception:
+                # Fallback: keep original list if join fails
+                pass
+
+    return out
+
+
+def normalize_document_input(doc: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Normalize a raw Document dict from the API into the Python SDK's expected shape:
+    - Convert top-level keys rawHtml->raw_html, changeTracking->change_tracking
+    - Convert metadata keys from camelCase to snake_case
+    """
+    normalized = dict(doc)
+
+    if "rawHtml" in normalized and "raw_html" not in normalized:
+        normalized["raw_html"] = normalized.pop("rawHtml")
+
+    if "changeTracking" in normalized and "change_tracking" not in normalized:
+        normalized["change_tracking"] = normalized.pop("changeTracking")
+
+    md = normalized.get("metadata")
+    if isinstance(md, dict):
+        mapped = _map_metadata_keys(md)
+        # Construct a concrete DocumentMetadata so downstream has a typed object
+        try:
+            normalized["metadata"] = DocumentMetadata(**mapped)
+        except Exception:
+            # Fallback to mapped dict if model construction fails for any reason
+            normalized["metadata"] = mapped
+
+    return normalized
+
+
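The new helper centralizes the camelCase-to-snake_case mapping that the hunks above previously repeated inline, and it pairs with the metadata_typed / metadata_dict properties added in types.py. A short sketch of its effect on an invented raw payload (markdown and raw_html as Document fields are implied by the other hunks, not by this file):

from firecrawl.v2.types import Document
from firecrawl.v2.utils.normalize import normalize_document_input

raw = {
    "markdown": "# Hello",
    "rawHtml": "<h1>Hello</h1>",
    "metadata": {"ogTitle": "Hello", "sourceURL": "https://example.com", "statusCode": "200"},
}

doc = Document(**normalize_document_input(raw))
print(doc.raw_html)                     # <h1>Hello</h1>
print(doc.metadata_typed.og_title)      # Hello
print(doc.metadata_typed.status_code)   # 200 (string coerced to int)
print(doc.metadata_dict)                # only the non-None metadata fields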

firecrawl/v2/watcher.py
@@ -15,6 +15,7 @@ from typing import Callable, List, Optional, Literal, Union, Dict, Any
 import websockets
 
 from .types import CrawlJob, BatchScrapeJob, Document
+from .utils.normalize import normalize_document_input
 
 
 JobKind = Literal["crawl", "batch"]
@@ -172,11 +173,7 @@ class Watcher:
         docs: List[Document] = []
         for doc in self.data:
             if isinstance(doc, dict):
-                d = dict(doc)
-                if "rawHtml" in d and "raw_html" not in d:
-                    d["raw_html"] = d.pop("rawHtml")
-                if "changeTracking" in d and "change_tracking" not in d:
-                    d["change_tracking"] = d.pop("changeTracking")
+                d = normalize_document_input(doc)
                 docs.append(Document(**d))
         if self._kind == "crawl":
             job = CrawlJob(
@@ -212,11 +209,7 @@ class Watcher:
         docs = []
         for doc in payload.get("data", []):
             if isinstance(doc, dict):
-                d = dict(doc)
-                if "rawHtml" in d and "raw_html" not in d:
-                    d["raw_html"] = d.pop("rawHtml")
-                if "changeTracking" in d and "change_tracking" not in d:
-                    d["change_tracking"] = d.pop("changeTracking")
+                d = normalize_document_input(doc)
                 docs.append(Document(**d))
         job = CrawlJob(
             status=status_str,
@@ -241,11 +234,7 @@ class Watcher:
         docs = []
         for doc in payload.get("data", []):
             if isinstance(doc, dict):
-                d = dict(doc)
-                if "rawHtml" in d and "raw_html" not in d:
-                    d["raw_html"] = d.pop("rawHtml")
-                if "changeTracking" in d and "change_tracking" not in d:
-                    d["change_tracking"] = d.pop("changeTracking")
+                d = normalize_document_input(doc)
                 docs.append(Document(**d))
         job = BatchScrapeJob(
             status=status_str,

firecrawl/v2/watcher_async.py
@@ -16,6 +16,7 @@ import websockets
 from websockets.exceptions import ConnectionClosed, ConnectionClosedOK, ConnectionClosedError
 
 from .types import BatchScrapeJob, CrawlJob, Document
+from .utils.normalize import normalize_document_input
 
 JobKind = Literal["crawl", "batch"]
 
@@ -216,11 +217,7 @@ class AsyncWatcher:
         source_docs = docs_override if docs_override is not None else payload.get("data", []) or []
         for doc in source_docs:
             if isinstance(doc, dict):
-                d = dict(doc)
-                if "rawHtml" in d and "raw_html" not in d:
-                    d["raw_html"] = d.pop("rawHtml")
-                if "changeTracking" in d and "change_tracking" not in d:
-                    d["change_tracking"] = d.pop("changeTracking")
+                d = normalize_document_input(doc)
                 docs.append(Document(**d))
 
         if self._kind == "crawl":

firecrawl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: firecrawl
-Version: 3.0.3
+Version: 3.1.0
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/firecrawl/firecrawl
 Author: Mendable.ai

firecrawl.egg-info/SOURCES.txt
@@ -77,6 +77,7 @@ firecrawl/v2/utils/error_handler.py
 firecrawl/v2/utils/get_version.py
 firecrawl/v2/utils/http_client.py
 firecrawl/v2/utils/http_client_async.py
+firecrawl/v2/utils/normalize.py
 firecrawl/v2/utils/validation.py
 tests/test_change_tracking.py
 tests/test_timeout_conversion.py