onetool_mcp-1.0.0b1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bench/__init__.py +5 -0
- bench/cli.py +69 -0
- bench/harness/__init__.py +66 -0
- bench/harness/client.py +692 -0
- bench/harness/config.py +397 -0
- bench/harness/csv_writer.py +109 -0
- bench/harness/evaluate.py +512 -0
- bench/harness/metrics.py +283 -0
- bench/harness/runner.py +899 -0
- bench/py.typed +0 -0
- bench/reporter.py +629 -0
- bench/run.py +487 -0
- bench/secrets.py +101 -0
- bench/utils.py +16 -0
- onetool/__init__.py +4 -0
- onetool/cli.py +391 -0
- onetool/py.typed +0 -0
- onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
- onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
- onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
- onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
- ot/__init__.py +37 -0
- ot/__main__.py +6 -0
- ot/_cli.py +107 -0
- ot/_tui.py +53 -0
- ot/config/__init__.py +46 -0
- ot/config/defaults/bench.yaml +4 -0
- ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
- ot/config/defaults/diagram-templates/c4-context.puml +30 -0
- ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
- ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
- ot/config/defaults/diagram-templates/microservices.d2 +81 -0
- ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
- ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
- ot/config/defaults/onetool.yaml +25 -0
- ot/config/defaults/prompts.yaml +97 -0
- ot/config/defaults/servers.yaml +7 -0
- ot/config/defaults/snippets.yaml +4 -0
- ot/config/defaults/tool_templates/__init__.py +7 -0
- ot/config/defaults/tool_templates/extension.py +52 -0
- ot/config/defaults/tool_templates/isolated.py +61 -0
- ot/config/dynamic.py +121 -0
- ot/config/global_templates/__init__.py +2 -0
- ot/config/global_templates/bench-secrets-template.yaml +6 -0
- ot/config/global_templates/bench.yaml +9 -0
- ot/config/global_templates/onetool.yaml +27 -0
- ot/config/global_templates/secrets-template.yaml +44 -0
- ot/config/global_templates/servers.yaml +18 -0
- ot/config/global_templates/snippets.yaml +235 -0
- ot/config/loader.py +1087 -0
- ot/config/mcp.py +145 -0
- ot/config/secrets.py +190 -0
- ot/config/tool_config.py +125 -0
- ot/decorators.py +116 -0
- ot/executor/__init__.py +35 -0
- ot/executor/base.py +16 -0
- ot/executor/fence_processor.py +83 -0
- ot/executor/linter.py +142 -0
- ot/executor/pack_proxy.py +260 -0
- ot/executor/param_resolver.py +140 -0
- ot/executor/pep723.py +288 -0
- ot/executor/result_store.py +369 -0
- ot/executor/runner.py +496 -0
- ot/executor/simple.py +163 -0
- ot/executor/tool_loader.py +396 -0
- ot/executor/validator.py +398 -0
- ot/executor/worker_pool.py +388 -0
- ot/executor/worker_proxy.py +189 -0
- ot/http_client.py +145 -0
- ot/logging/__init__.py +37 -0
- ot/logging/config.py +315 -0
- ot/logging/entry.py +213 -0
- ot/logging/format.py +188 -0
- ot/logging/span.py +349 -0
- ot/meta.py +1555 -0
- ot/paths.py +453 -0
- ot/prompts.py +218 -0
- ot/proxy/__init__.py +21 -0
- ot/proxy/manager.py +396 -0
- ot/py.typed +0 -0
- ot/registry/__init__.py +189 -0
- ot/registry/models.py +57 -0
- ot/registry/parser.py +269 -0
- ot/registry/registry.py +413 -0
- ot/server.py +315 -0
- ot/shortcuts/__init__.py +15 -0
- ot/shortcuts/aliases.py +87 -0
- ot/shortcuts/snippets.py +258 -0
- ot/stats/__init__.py +35 -0
- ot/stats/html.py +250 -0
- ot/stats/jsonl_writer.py +283 -0
- ot/stats/reader.py +354 -0
- ot/stats/timing.py +57 -0
- ot/support.py +63 -0
- ot/tools.py +114 -0
- ot/utils/__init__.py +81 -0
- ot/utils/batch.py +161 -0
- ot/utils/cache.py +120 -0
- ot/utils/deps.py +403 -0
- ot/utils/exceptions.py +23 -0
- ot/utils/factory.py +179 -0
- ot/utils/format.py +65 -0
- ot/utils/http.py +202 -0
- ot/utils/platform.py +45 -0
- ot/utils/sanitize.py +130 -0
- ot/utils/truncate.py +69 -0
- ot_tools/__init__.py +4 -0
- ot_tools/_convert/__init__.py +12 -0
- ot_tools/_convert/excel.py +279 -0
- ot_tools/_convert/pdf.py +254 -0
- ot_tools/_convert/powerpoint.py +268 -0
- ot_tools/_convert/utils.py +358 -0
- ot_tools/_convert/word.py +283 -0
- ot_tools/brave_search.py +604 -0
- ot_tools/code_search.py +736 -0
- ot_tools/context7.py +495 -0
- ot_tools/convert.py +614 -0
- ot_tools/db.py +415 -0
- ot_tools/diagram.py +1604 -0
- ot_tools/diagram.yaml +167 -0
- ot_tools/excel.py +1372 -0
- ot_tools/file.py +1348 -0
- ot_tools/firecrawl.py +732 -0
- ot_tools/grounding_search.py +646 -0
- ot_tools/package.py +604 -0
- ot_tools/py.typed +0 -0
- ot_tools/ripgrep.py +544 -0
- ot_tools/scaffold.py +471 -0
- ot_tools/transform.py +213 -0
- ot_tools/web_fetch.py +384 -0
ot_tools/firecrawl.py
ADDED
@@ -0,0 +1,732 @@
"""Web scraping, crawling, and structured extraction via Firecrawl API.

Provides single URL scraping, batch scraping, URL discovery, web search,
multi-page crawling, and LLM-powered data extraction.

API docs: https://docs.firecrawl.dev/api-reference
Python SDK: https://pypi.org/project/firecrawl/
"""

from __future__ import annotations

# Pack for dot notation: firecrawl.scrape(), firecrawl.crawl(), etc.
pack = "firecrawl"

__all__ = [
    "crawl",
    "crawl_status",
    "deep_research",
    "extract",
    "map_urls",
    "scrape",
    "scrape_batch",
    "search",
]

# Dependency declarations for CLI validation
__ot_requires__ = {
    "lib": [("firecrawl", "pip install firecrawl")],
    "secrets": ["FIRECRAWL_API_KEY"],
}

from typing import Any, Literal
from urllib.parse import urlparse

from firecrawl import Firecrawl
from pydantic import BaseModel, Field

from ot.config import get_secret, get_tool_config
from ot.logging import LogSpan
from ot.utils import batch_execute, lazy_client, normalize_items


def _validate_url(url: str) -> str | None:
    """Return error message if URL is invalid, None otherwise."""
    if not url or not url.strip():
        return "URL is required and cannot be empty"
    try:
        parsed = urlparse(url)
        if not parsed.scheme or not parsed.netloc:
            return f"Invalid URL: {url} (missing scheme or host)"
        if parsed.scheme not in ("http", "https"):
            return f"Invalid URL scheme: {parsed.scheme} (expected http/https)"
    except Exception as e:
        return f"Invalid URL: {e}"
    return None


class Config(BaseModel):
    """Pack configuration - discovered by registry."""

    api_url: str | None = Field(
        default=None,
        description="Custom API URL for self-hosted Firecrawl instances",
    )


def _get_config() -> Config:
    """Get firecrawl pack configuration."""
    return get_tool_config("firecrawl", Config)


def _create_client() -> Firecrawl | None:
    """Create Firecrawl client with API key."""
    api_key = get_secret("FIRECRAWL_API_KEY")
    if not api_key:
        return None

    api_url = _get_config().api_url
    if api_url:
        return Firecrawl(api_key=api_key, api_url=api_url)
    return Firecrawl(api_key=api_key)


# Thread-safe lazy client using SDK utility
_get_client = lazy_client(_create_client)


def _to_dict(obj: Any) -> dict[str, Any]:
    """Convert SDK response objects to dicts for JSON serialization."""
    if isinstance(obj, dict):
        return obj
    # Pydantic v2
    if hasattr(obj, "model_dump"):
        return obj.model_dump()
    # Pydantic v1
    if hasattr(obj, "dict"):
        return obj.dict()
    # Fallback for dataclasses or other objects
    if hasattr(obj, "__dict__"):
        return {k: v for k, v in obj.__dict__.items() if not k.startswith("_")}
    return {"value": str(obj)}


def scrape(
    *,
    url: str,
    formats: list[
        Literal[
            "markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"
        ]
    ]
    | None = None,
    only_main_content: bool = True,
    include_tags: list[str] | None = None,
    exclude_tags: list[str] | None = None,
    wait_for: int | None = None,
    timeout: int | None = None,
    mobile: bool = False,
    skip_tls_verification: bool = False,
    remove_base64_images: bool = True,
    location: dict[str, Any] | None = None,
) -> dict[str, Any] | str:
    """Scrape content from a single URL.

    Extracts content in various formats with configurable filtering.

    Args:
        url: The URL to scrape
        formats: Output formats to include. Options:
            - "markdown": Clean markdown text (default)
            - "html": Cleaned HTML
            - "rawHtml": Original HTML
            - "links": All hyperlinks on the page
            - "screenshot": Screenshot image (base64)
            - "screenshot@fullPage": Full page screenshot
        only_main_content: Extract only main content, excluding nav/footer (default: True)
        include_tags: HTML tags to include (e.g., ["article", "main"])
        exclude_tags: HTML tags to exclude (e.g., ["nav", "footer"])
        wait_for: Milliseconds to wait for dynamic content
        timeout: Request timeout in milliseconds
        mobile: Use mobile user agent
        skip_tls_verification: Skip TLS certificate validation
        remove_base64_images: Remove base64 images from markdown (default: True)
        location: Geolocation for request (e.g., {"country": "US", "languages": ["en"]})

    Returns:
        Dict with scraped content in requested formats, or error message

    Example:
        # Basic scrape
        firecrawl.scrape(url="https://example.com")

        # Get markdown and links
        firecrawl.scrape(url="https://example.com", formats=["markdown", "links"])

        # Scrape with geolocation
        firecrawl.scrape(url="https://example.com", location={"country": "US"})
    """
    # Validate URL
    if url_error := _validate_url(url):
        return f"Error: {url_error}"

    with LogSpan(span="firecrawl.scrape", url=url) as span:
        client = _get_client()
        if client is None:
            return "Error: FIRECRAWL_API_KEY secret not configured"

        try:
            # Build kwargs for v2 API
            kwargs: dict[str, Any] = {}

            if formats:
                kwargs["formats"] = formats
            if not only_main_content:
                kwargs["only_main_content"] = False
            if include_tags:
                kwargs["include_tags"] = include_tags
            if exclude_tags:
                kwargs["exclude_tags"] = exclude_tags
            if wait_for is not None:
                kwargs["wait_for"] = wait_for
            if timeout is not None:
                kwargs["timeout"] = timeout
            if mobile:
                kwargs["mobile"] = True
            if skip_tls_verification:
                kwargs["skip_tls_verification"] = True
            if not remove_base64_images:
                kwargs["remove_base64_images"] = False
            if location:
                kwargs["location"] = location

            result = client.scrape(url, **kwargs)

            span.add(success=True)
            result_dict = _to_dict(result)
            if isinstance(result_dict, dict):
                span.add(formats=list(result_dict.keys()))
            return result_dict

        except Exception as e:
            error_msg = f"Scrape failed: {e}"
            span.add(error=str(e))
            return error_msg


def scrape_batch(
    *,
    urls: list[str] | list[tuple[str, str]],
    formats: list[
        Literal[
            "markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"
        ]
    ]
    | None = None,
    only_main_content: bool = True,
    max_workers: int = 5,
) -> dict[str, dict[str, Any] | str]:
    """Scrape multiple URLs concurrently.

    Uses ThreadPoolExecutor for parallel execution with error isolation.

    Args:
        urls: List of URLs to scrape. Each item can be:
            - A string (URL used as key)
            - A tuple of (url, label) for custom labeling
        formats: Output formats (see scrape() for options)
        only_main_content: Extract only main content (default: True)
        max_workers: Maximum concurrent scrapes (default: 5)

    Returns:
        Dict mapping URL/label to scraped content or error message

    Example:
        # Simple list
        firecrawl.scrape_batch(urls=[
            "https://docs.python.org/3/library/asyncio.html",
            "https://docs.python.org/3/library/threading.html",
        ])

        # With labels
        firecrawl.scrape_batch(urls=[
            ("https://example.com/page1", "Page 1"),
            ("https://example.com/page2", "Page 2"),
        ])
    """
    normalized = normalize_items(urls)

    with LogSpan(span="firecrawl.scrape_batch", url_count=len(normalized)) as span:

        def _scrape_one(url: str, label: str) -> tuple[str, dict[str, Any] | str]:
            result = scrape(
                url=url,
                formats=formats,
                only_main_content=only_main_content,
            )
            return label, result

        results = batch_execute(_scrape_one, normalized, max_workers=max_workers)
        span.add(success_count=sum(1 for r in results.values() if isinstance(r, dict)))
        return results


def map_urls(
    *,
    url: str,
    search: str | None = None,
    ignore_sitemap: bool = False,
    sitemap_only: bool = False,
    include_subdomains: bool = False,
    limit: int | None = None,
) -> list[str] | str:
    """Discover URLs from a website.

    Maps all accessible URLs from a site via sitemap and crawling.

    Args:
        url: The starting URL to map
        search: Optional search term to filter URLs
        ignore_sitemap: Skip sitemap discovery, only crawl (default: False)
        sitemap_only: Only use sitemap, no crawling (default: False)
        include_subdomains: Include URLs from subdomains (default: False)
        limit: Maximum number of URLs to return

    Returns:
        List of discovered URLs, or error message

    Example:
        # Map entire site
        firecrawl.map_urls(url="https://docs.python.org")

        # Search for specific pages
        firecrawl.map_urls(url="https://docs.python.org", search="asyncio")

        # Limit results
        firecrawl.map_urls(url="https://example.com", limit=100)
    """
    # Validate URL
    if url_error := _validate_url(url):
        return f"Error: {url_error}"

    with LogSpan(span="firecrawl.map_urls", url=url, search=search) as span:
        client = _get_client()
        if client is None:
            return "Error: FIRECRAWL_API_KEY secret not configured"

        try:
            # Build kwargs for v2 API
            kwargs: dict[str, Any] = {}

            if search:
                kwargs["search"] = search
            if ignore_sitemap:
                kwargs["sitemap"] = "skip"
            if sitemap_only:
                kwargs["sitemap"] = "only"
            if include_subdomains:
                kwargs["include_subdomains"] = True
            if limit:
                kwargs["limit"] = limit

            result = client.map(url, **kwargs)

            # Extract links list from result
            if isinstance(result, list):
                links = result
            else:
                # Handle MapResponse object
                links = getattr(result, "links", None) or []

            # Convert LinkResult objects to URL strings
            urls = []
            for link in links:
                if isinstance(link, str):
                    urls.append(link)
                elif hasattr(link, "url"):
                    urls.append(link.url)
                elif hasattr(link, "model_dump"):
                    urls.append(link.model_dump().get("url", str(link)))
                else:
                    urls.append(str(link))

            span.add(url_count=len(urls))
            return urls

        except Exception as e:
            error_msg = f"Map failed: {e}"
            span.add(error=str(e))
            return error_msg


def search(
    *,
    query: str,
    limit: int = 5,
    lang: str | None = None,
    country: str | None = None,
    scrape_options: dict[str, Any] | None = None,
) -> list[dict[str, Any]] | str:
    """Search the web and optionally scrape results.

    Performs web search with optional content retrieval for each result.

    Args:
        query: Search query string
        limit: Maximum number of results (default: 5)
        lang: Language code for results (e.g., "en")
        country: Country code for results (e.g., "US")
        scrape_options: Options for scraping result pages (see scrape() params)

    Returns:
        List of search results with optional scraped content, or error message

    Example:
        # Basic search
        firecrawl.search(query="Python async best practices")

        # Search with scraping
        firecrawl.search(
            query="machine learning tutorials",
            limit=3,
            scrape_options={"formats": ["markdown"]}
        )
    """
    with LogSpan(span="firecrawl.search", query=query, limit=limit) as span:
        client = _get_client()
        if client is None:
            return "Error: FIRECRAWL_API_KEY secret not configured"

        try:
            # Build kwargs for v2 API
            kwargs: dict[str, Any] = {"limit": limit}

            if lang:
                kwargs["lang"] = lang
            if country:
                kwargs["location"] = country
            if scrape_options:
                kwargs["scrape_options"] = scrape_options

            result = client.search(query, **kwargs)

            if isinstance(result, list):
                span.add(result_count=len(result))
                return [_to_dict(item) for item in result]
            # Handle SearchData object (v2 API returns .web, .news, .images)
            data = getattr(result, "web", None) or getattr(result, "data", None) or []
            span.add(result_count=len(data))
            return [_to_dict(item) for item in data]

        except Exception as e:
            error_msg = f"Search failed: {e}"
            span.add(error=str(e))
            return error_msg


def crawl(
    *,
    url: str,
    max_depth: int | None = None,
    limit: int | None = None,
    include_paths: list[str] | None = None,
    exclude_paths: list[str] | None = None,
    ignore_sitemap: bool = False,
    scrape_options: dict[str, Any] | None = None,
    webhook: str | None = None,
) -> dict[str, Any] | str:
    """Start an asynchronous multi-page crawl job.

    Crawls a website starting from the given URL. Returns immediately with
    a job ID. Use crawl_status() to poll for results.

    Args:
        url: The starting URL to crawl
        max_depth: Maximum link depth to crawl
        limit: Maximum number of pages to crawl
        include_paths: URL patterns to include (glob syntax)
        exclude_paths: URL patterns to exclude (glob syntax)
        ignore_sitemap: Skip sitemap discovery (default: False)
        scrape_options: Options for scraping pages (see scrape() params)
        webhook: URL to receive completion notification

    Returns:
        Dict with job ID and status URL, or error message

    Example:
        # Start a crawl
        job = firecrawl.crawl(url="https://docs.python.org", max_depth=2, limit=100)

        # Check status
        firecrawl.crawl_status(id=job["id"])
    """
    # Validate URL
    if url_error := _validate_url(url):
        return f"Error: {url_error}"

    with LogSpan(span="firecrawl.crawl", url=url, max_depth=max_depth, limit=limit) as span:
        client = _get_client()
        if client is None:
            return "Error: FIRECRAWL_API_KEY secret not configured"

        try:
            # Build kwargs for v2 API
            kwargs: dict[str, Any] = {}

            if max_depth is not None:
                kwargs["max_discovery_depth"] = max_depth
            if limit is not None:
                kwargs["limit"] = limit
            if include_paths:
                kwargs["include_paths"] = include_paths
            if exclude_paths:
                kwargs["exclude_paths"] = exclude_paths
            if ignore_sitemap:
                kwargs["ignore_sitemap"] = True
            if scrape_options:
                kwargs["scrape_options"] = scrape_options
            if webhook:
                kwargs["webhook"] = webhook

            result = client.crawl(url, **kwargs)

            # Convert to dict for consistent handling
            result_dict = _to_dict(result)

            # Extract job ID with multiple fallbacks
            job_id = (
                result_dict.get("id")
                or result_dict.get("jobId")
                or result_dict.get("job_id")
            )

            span.add(job_id=job_id)

            # If result already has data (sync completion), return as-is
            if result_dict.get("data"):
                return result_dict

            # Return normalized response with job info
            return {
                "id": job_id,
                "status": result_dict.get("status", "started"),
                "url": url,
            }

        except Exception as e:
            error_msg = f"Crawl failed: {e}"
            span.add(error=str(e))
            return error_msg


def crawl_status(
    *,
    id: str,
) -> dict[str, Any] | str:
    """Check the status of a crawl job.

    Polls the crawl job for current progress and results.

    Args:
        id: The crawl job ID returned by crawl()

    Returns:
        Dict with status, progress, and results (if complete), or error message

    Example:
        # Check crawl progress
        status = firecrawl.crawl_status(id="abc123")

        if status["status"] == "completed":
            for page in status["data"]:
                print(page["url"])
    """
    # Validate job ID
    if not id or not id.strip():
        return "Error: Job ID is required and cannot be empty"

    with LogSpan(span="firecrawl.crawl_status", job_id=id) as span:
        client = _get_client()
        if client is None:
            return "Error: FIRECRAWL_API_KEY secret not configured"

        try:
            result = client.get_crawl_status(id)

            if isinstance(result, dict):
                span.add(status=result.get("status"))
                return result

            # Handle CrawlStatusResponse object
            status = getattr(result, "status", "unknown")
            span.add(status=status)

            response: dict[str, Any] = {
                "id": id,
                "status": status,
            }

            # Add optional fields if present
            if hasattr(result, "completed"):
                response["completed"] = result.completed
            if hasattr(result, "total"):
                response["total"] = result.total
            if hasattr(result, "data"):
                response["data"] = result.data

            return response

        except Exception as e:
            error_msg = f"Status check failed: {e}"
            span.add(error=str(e))
            return error_msg


def extract(
    *,
    urls: list[str],
    prompt: str | None = None,
    schema: dict[str, Any] | None = None,
    system_prompt: str | None = None,
    allow_external_links: bool = False,
) -> dict[str, Any] | str:
    """Extract structured data from URLs using LLM.

    Uses an LLM to extract data matching a JSON schema from web pages.

    Args:
        urls: URLs to extract data from
        prompt: Natural language description of what to extract
        schema: JSON schema defining the structure of extracted data
            (OpenAI JSON schema format)
        system_prompt: Custom system prompt for the LLM
        allow_external_links: Follow external links during extraction (default: False)

    Returns:
        Dict with extracted data matching schema, or error message

    Example:
        # Extract with prompt
        firecrawl.extract(
            urls=["https://example.com/products"],
            prompt="Extract product names and prices"
        )

        # Extract with schema
        firecrawl.extract(
            urls=["https://example.com/team"],
            schema={
                "type": "object",
                "properties": {
                    "team_members": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "name": {"type": "string"},
                                "role": {"type": "string"}
                            }
                        }
                    }
                }
            }
        )
    """
    with LogSpan(span="firecrawl.extract", url_count=len(urls)) as span:
        client = _get_client()
        if client is None:
            return "Error: FIRECRAWL_API_KEY secret not configured"

        if not prompt and not schema:
            return "Error: Either prompt or schema is required"

        try:
            # Build kwargs for v2 API
            kwargs: dict[str, Any] = {}

            if prompt:
                kwargs["prompt"] = prompt
            if schema:
                kwargs["schema"] = schema
            if system_prompt:
                kwargs["system_prompt"] = system_prompt
            if allow_external_links:
                kwargs["allow_external_links"] = True

            result = client.extract(urls, **kwargs)

            if isinstance(result, dict):
                span.add(success=True)
                return result

            # Handle ExtractResponse object
            data = getattr(result, "data", None)
            span.add(success=True)
            return {"data": data} if data else result

        except Exception as e:
            error_msg = f"Extract failed: {e}"
            span.add(error=str(e))
            return error_msg


def deep_research(
    *,
    prompt: str,
    urls: list[str] | None = None,
    timeout: int | None = None,
    max_credits: int | None = None,
) -> dict[str, Any] | str:
    """Run autonomous deep research on a topic.

    Launches an AI agent that autonomously researches a topic by
    searching, crawling, and synthesizing information from the web.

    Args:
        prompt: Research question or topic
        urls: Starting URLs to research (optional, will search if not provided)
        timeout: Time limit in seconds for the research
        max_credits: Maximum credits to spend on research

    Returns:
        Dict with research results and sources, or error message

    Example:
        # Research a topic
        firecrawl.deep_research(
            prompt="What are the latest developments in quantum computing?",
            timeout=300
        )

        # Research from specific sources
        firecrawl.deep_research(
            prompt="Compare pricing models",
            urls=["https://company1.com/pricing", "https://company2.com/pricing"]
        )
    """
    with LogSpan(span="firecrawl.deep_research", prompt=prompt[:100]) as span:
        client = _get_client()
        if client is None:
            return "Error: FIRECRAWL_API_KEY secret not configured"

        try:
            # Build kwargs for v2 API (uses 'agent' method)
            kwargs: dict[str, Any] = {"prompt": prompt}

            if urls:
                kwargs["urls"] = urls
            if timeout is not None:
                kwargs["timeout"] = timeout
            if max_credits is not None:
                kwargs["max_credits"] = max_credits

            # The SDK's agent method corresponds to deep research
            result = client.agent(**kwargs)

            if isinstance(result, dict):
                span.add(success=True)
                return result

            # Handle response object
            data = getattr(result, "data", None)
            sources = getattr(result, "sources", None)
            span.add(success=True, source_count=len(sources) if sources else 0)
            return {
                "data": data,
                "sources": sources,
            }

        except Exception as e:
            error_msg = f"Deep research failed: {e}"
            span.add(error=str(e))
            return error_msg
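Note on the `ot.utils` helpers: `scrape_batch` and the lazy client above depend on `lazy_client`, `normalize_items`, and `batch_execute`, whose behavior is only implied by this file. The following is a minimal sketch of the semantics this module appears to assume (thread-safe one-time client creation, string-or-tuple normalization, and per-item error isolation in a thread pool); the shipped implementations live under `ot/utils/` in this wheel and may differ in detail.

```python
# Hypothetical sketch of the ot.utils helpers assumed by ot_tools/firecrawl.py.
from __future__ import annotations

import threading
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Callable, TypeVar

T = TypeVar("T")


def lazy_client(factory: Callable[[], T | None]) -> Callable[[], T | None]:
    """Call factory() once, on first use, guarded by a lock; cache the result."""
    lock = threading.Lock()
    holder: list[T | None] = []

    def get() -> T | None:
        with lock:
            if not holder:
                holder.append(factory())
        return holder[0]

    return get


def normalize_items(items: list[str] | list[tuple[str, str]]) -> list[tuple[str, str]]:
    """Plain strings label themselves; (value, label) tuples pass through unchanged."""
    return [(i, i) if isinstance(i, str) else (i[0], i[1]) for i in items]


def batch_execute(
    fn: Callable[[str, str], tuple[str, Any]],
    items: list[tuple[str, str]],
    *,
    max_workers: int = 5,
) -> dict[str, Any]:
    """Run fn(value, label) concurrently and collect results keyed by label."""

    def run(pair: tuple[str, str]) -> tuple[str, Any]:
        value, label = pair
        try:
            return fn(value, label)
        except Exception as e:  # error isolation: one failure does not abort the batch
            return label, f"Error: {e}"

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return dict(pool.map(run, items))
```

Under these assumptions, `firecrawl.scrape_batch(urls=[...])` returns a dict keyed by URL or label, where a failing page contributes an error string for its key rather than raising and aborting the whole batch.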