onetool-mcp 1.0.0b1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. bench/__init__.py +5 -0
  2. bench/cli.py +69 -0
  3. bench/harness/__init__.py +66 -0
  4. bench/harness/client.py +692 -0
  5. bench/harness/config.py +397 -0
  6. bench/harness/csv_writer.py +109 -0
  7. bench/harness/evaluate.py +512 -0
  8. bench/harness/metrics.py +283 -0
  9. bench/harness/runner.py +899 -0
  10. bench/py.typed +0 -0
  11. bench/reporter.py +629 -0
  12. bench/run.py +487 -0
  13. bench/secrets.py +101 -0
  14. bench/utils.py +16 -0
  15. onetool/__init__.py +4 -0
  16. onetool/cli.py +391 -0
  17. onetool/py.typed +0 -0
  18. onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
  19. onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
  20. onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
  21. onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
  22. onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
  23. onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
  24. ot/__init__.py +37 -0
  25. ot/__main__.py +6 -0
  26. ot/_cli.py +107 -0
  27. ot/_tui.py +53 -0
  28. ot/config/__init__.py +46 -0
  29. ot/config/defaults/bench.yaml +4 -0
  30. ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
  31. ot/config/defaults/diagram-templates/c4-context.puml +30 -0
  32. ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
  33. ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
  34. ot/config/defaults/diagram-templates/microservices.d2 +81 -0
  35. ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
  36. ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
  37. ot/config/defaults/onetool.yaml +25 -0
  38. ot/config/defaults/prompts.yaml +97 -0
  39. ot/config/defaults/servers.yaml +7 -0
  40. ot/config/defaults/snippets.yaml +4 -0
  41. ot/config/defaults/tool_templates/__init__.py +7 -0
  42. ot/config/defaults/tool_templates/extension.py +52 -0
  43. ot/config/defaults/tool_templates/isolated.py +61 -0
  44. ot/config/dynamic.py +121 -0
  45. ot/config/global_templates/__init__.py +2 -0
  46. ot/config/global_templates/bench-secrets-template.yaml +6 -0
  47. ot/config/global_templates/bench.yaml +9 -0
  48. ot/config/global_templates/onetool.yaml +27 -0
  49. ot/config/global_templates/secrets-template.yaml +44 -0
  50. ot/config/global_templates/servers.yaml +18 -0
  51. ot/config/global_templates/snippets.yaml +235 -0
  52. ot/config/loader.py +1087 -0
  53. ot/config/mcp.py +145 -0
  54. ot/config/secrets.py +190 -0
  55. ot/config/tool_config.py +125 -0
  56. ot/decorators.py +116 -0
  57. ot/executor/__init__.py +35 -0
  58. ot/executor/base.py +16 -0
  59. ot/executor/fence_processor.py +83 -0
  60. ot/executor/linter.py +142 -0
  61. ot/executor/pack_proxy.py +260 -0
  62. ot/executor/param_resolver.py +140 -0
  63. ot/executor/pep723.py +288 -0
  64. ot/executor/result_store.py +369 -0
  65. ot/executor/runner.py +496 -0
  66. ot/executor/simple.py +163 -0
  67. ot/executor/tool_loader.py +396 -0
  68. ot/executor/validator.py +398 -0
  69. ot/executor/worker_pool.py +388 -0
  70. ot/executor/worker_proxy.py +189 -0
  71. ot/http_client.py +145 -0
  72. ot/logging/__init__.py +37 -0
  73. ot/logging/config.py +315 -0
  74. ot/logging/entry.py +213 -0
  75. ot/logging/format.py +188 -0
  76. ot/logging/span.py +349 -0
  77. ot/meta.py +1555 -0
  78. ot/paths.py +453 -0
  79. ot/prompts.py +218 -0
  80. ot/proxy/__init__.py +21 -0
  81. ot/proxy/manager.py +396 -0
  82. ot/py.typed +0 -0
  83. ot/registry/__init__.py +189 -0
  84. ot/registry/models.py +57 -0
  85. ot/registry/parser.py +269 -0
  86. ot/registry/registry.py +413 -0
  87. ot/server.py +315 -0
  88. ot/shortcuts/__init__.py +15 -0
  89. ot/shortcuts/aliases.py +87 -0
  90. ot/shortcuts/snippets.py +258 -0
  91. ot/stats/__init__.py +35 -0
  92. ot/stats/html.py +250 -0
  93. ot/stats/jsonl_writer.py +283 -0
  94. ot/stats/reader.py +354 -0
  95. ot/stats/timing.py +57 -0
  96. ot/support.py +63 -0
  97. ot/tools.py +114 -0
  98. ot/utils/__init__.py +81 -0
  99. ot/utils/batch.py +161 -0
  100. ot/utils/cache.py +120 -0
  101. ot/utils/deps.py +403 -0
  102. ot/utils/exceptions.py +23 -0
  103. ot/utils/factory.py +179 -0
  104. ot/utils/format.py +65 -0
  105. ot/utils/http.py +202 -0
  106. ot/utils/platform.py +45 -0
  107. ot/utils/sanitize.py +130 -0
  108. ot/utils/truncate.py +69 -0
  109. ot_tools/__init__.py +4 -0
  110. ot_tools/_convert/__init__.py +12 -0
  111. ot_tools/_convert/excel.py +279 -0
  112. ot_tools/_convert/pdf.py +254 -0
  113. ot_tools/_convert/powerpoint.py +268 -0
  114. ot_tools/_convert/utils.py +358 -0
  115. ot_tools/_convert/word.py +283 -0
  116. ot_tools/brave_search.py +604 -0
  117. ot_tools/code_search.py +736 -0
  118. ot_tools/context7.py +495 -0
  119. ot_tools/convert.py +614 -0
  120. ot_tools/db.py +415 -0
  121. ot_tools/diagram.py +1604 -0
  122. ot_tools/diagram.yaml +167 -0
  123. ot_tools/excel.py +1372 -0
  124. ot_tools/file.py +1348 -0
  125. ot_tools/firecrawl.py +732 -0
  126. ot_tools/grounding_search.py +646 -0
  127. ot_tools/package.py +604 -0
  128. ot_tools/py.typed +0 -0
  129. ot_tools/ripgrep.py +544 -0
  130. ot_tools/scaffold.py +471 -0
  131. ot_tools/transform.py +213 -0
  132. ot_tools/web_fetch.py +384 -0
ot_tools/firecrawl.py ADDED
@@ -0,0 +1,732 @@
"""Web scraping, crawling, and structured extraction via Firecrawl API.

Provides single URL scraping, batch scraping, URL discovery, web search,
multi-page crawling, and LLM-powered data extraction.

API docs: https://docs.firecrawl.dev/api-reference
Python SDK: https://pypi.org/project/firecrawl/
"""

from __future__ import annotations

# Pack for dot notation: firecrawl.scrape(), firecrawl.crawl(), etc.
pack = "firecrawl"

__all__ = [
    "crawl",
    "crawl_status",
    "deep_research",
    "extract",
    "map_urls",
    "scrape",
    "scrape_batch",
    "search",
]

# Dependency declarations for CLI validation
__ot_requires__ = {
    "lib": [("firecrawl", "pip install firecrawl")],
    "secrets": ["FIRECRAWL_API_KEY"],
}

from typing import Any, Literal
from urllib.parse import urlparse

from firecrawl import Firecrawl
from pydantic import BaseModel, Field

from ot.config import get_secret, get_tool_config
from ot.logging import LogSpan
from ot.utils import batch_execute, lazy_client, normalize_items


def _validate_url(url: str) -> str | None:
    """Return error message if URL is invalid, None otherwise."""
    if not url or not url.strip():
        return "URL is required and cannot be empty"
    try:
        parsed = urlparse(url)
        if not parsed.scheme or not parsed.netloc:
            return f"Invalid URL: {url} (missing scheme or host)"
        if parsed.scheme not in ("http", "https"):
            return f"Invalid URL scheme: {parsed.scheme} (expected http/https)"
    except Exception as e:
        return f"Invalid URL: {e}"
    return None


class Config(BaseModel):
    """Pack configuration - discovered by registry."""

    api_url: str | None = Field(
        default=None,
        description="Custom API URL for self-hosted Firecrawl instances",
    )


def _get_config() -> Config:
    """Get firecrawl pack configuration."""
    return get_tool_config("firecrawl", Config)


def _create_client() -> Firecrawl | None:
    """Create Firecrawl client with API key."""
    api_key = get_secret("FIRECRAWL_API_KEY")
    if not api_key:
        return None

    api_url = _get_config().api_url
    if api_url:
        return Firecrawl(api_key=api_key, api_url=api_url)
    return Firecrawl(api_key=api_key)


# Thread-safe lazy client using SDK utility
_get_client = lazy_client(_create_client)


def _to_dict(obj: Any) -> dict[str, Any]:
    """Convert SDK response objects to dicts for JSON serialization."""
    if isinstance(obj, dict):
        return obj
    # Pydantic v2
    if hasattr(obj, "model_dump"):
        return obj.model_dump()
    # Pydantic v1
    if hasattr(obj, "dict"):
        return obj.dict()
    # Fallback for dataclasses or other objects
    if hasattr(obj, "__dict__"):
        return {k: v for k, v in obj.__dict__.items() if not k.startswith("_")}
    return {"value": str(obj)}


def scrape(
    *,
    url: str,
    formats: list[
        Literal[
            "markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"
        ]
    ]
    | None = None,
    only_main_content: bool = True,
    include_tags: list[str] | None = None,
    exclude_tags: list[str] | None = None,
    wait_for: int | None = None,
    timeout: int | None = None,
    mobile: bool = False,
    skip_tls_verification: bool = False,
    remove_base64_images: bool = True,
    location: dict[str, Any] | None = None,
) -> dict[str, Any] | str:
    """Scrape content from a single URL.

    Extracts content in various formats with configurable filtering.

    Args:
        url: The URL to scrape
        formats: Output formats to include. Options:
            - "markdown": Clean markdown text (default)
            - "html": Cleaned HTML
            - "rawHtml": Original HTML
            - "links": All hyperlinks on the page
            - "screenshot": Screenshot image (base64)
            - "screenshot@fullPage": Full page screenshot
        only_main_content: Extract only main content, excluding nav/footer (default: True)
        include_tags: HTML tags to include (e.g., ["article", "main"])
        exclude_tags: HTML tags to exclude (e.g., ["nav", "footer"])
        wait_for: Milliseconds to wait for dynamic content
        timeout: Request timeout in milliseconds
        mobile: Use mobile user agent
        skip_tls_verification: Skip TLS certificate validation
        remove_base64_images: Remove base64 images from markdown (default: True)
        location: Geolocation for request (e.g., {"country": "US", "languages": ["en"]})

    Returns:
        Dict with scraped content in requested formats, or error message

    Example:
        # Basic scrape
        firecrawl.scrape(url="https://example.com")

        # Get markdown and links
        firecrawl.scrape(url="https://example.com", formats=["markdown", "links"])

        # Scrape with geolocation
        firecrawl.scrape(url="https://example.com", location={"country": "US"})
    """
    # Validate URL
    if url_error := _validate_url(url):
        return f"Error: {url_error}"

    with LogSpan(span="firecrawl.scrape", url=url) as span:
        client = _get_client()
        if client is None:
            return "Error: FIRECRAWL_API_KEY secret not configured"

        try:
            # Build kwargs for v2 API
            kwargs: dict[str, Any] = {}

            if formats:
                kwargs["formats"] = formats
            if not only_main_content:
                kwargs["only_main_content"] = False
            if include_tags:
                kwargs["include_tags"] = include_tags
            if exclude_tags:
                kwargs["exclude_tags"] = exclude_tags
            if wait_for is not None:
                kwargs["wait_for"] = wait_for
            if timeout is not None:
                kwargs["timeout"] = timeout
            if mobile:
                kwargs["mobile"] = True
            if skip_tls_verification:
                kwargs["skip_tls_verification"] = True
            if not remove_base64_images:
                kwargs["remove_base64_images"] = False
            if location:
                kwargs["location"] = location

            result = client.scrape(url, **kwargs)

            span.add(success=True)
            result_dict = _to_dict(result)
            if isinstance(result_dict, dict):
                span.add(formats=list(result_dict.keys()))
            return result_dict

        except Exception as e:
            error_msg = f"Scrape failed: {e}"
            span.add(error=str(e))
            return error_msg


def scrape_batch(
    *,
    urls: list[str] | list[tuple[str, str]],
    formats: list[
        Literal[
            "markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"
        ]
    ]
    | None = None,
    only_main_content: bool = True,
    max_workers: int = 5,
) -> dict[str, dict[str, Any] | str]:
    """Scrape multiple URLs concurrently.

    Uses ThreadPoolExecutor for parallel execution with error isolation.

    Args:
        urls: List of URLs to scrape. Each item can be:
            - A string (URL used as key)
            - A tuple of (url, label) for custom labeling
        formats: Output formats (see scrape() for options)
        only_main_content: Extract only main content (default: True)
        max_workers: Maximum concurrent scrapes (default: 5)

    Returns:
        Dict mapping URL/label to scraped content or error message

    Example:
        # Simple list
        firecrawl.scrape_batch(urls=[
            "https://docs.python.org/3/library/asyncio.html",
            "https://docs.python.org/3/library/threading.html",
        ])

        # With labels
        firecrawl.scrape_batch(urls=[
            ("https://example.com/page1", "Page 1"),
            ("https://example.com/page2", "Page 2"),
        ])
    """
    normalized = normalize_items(urls)

    with LogSpan(span="firecrawl.scrape_batch", url_count=len(normalized)) as span:

        def _scrape_one(url: str, label: str) -> tuple[str, dict[str, Any] | str]:
            result = scrape(
                url=url,
                formats=formats,
                only_main_content=only_main_content,
            )
            return label, result

        results = batch_execute(_scrape_one, normalized, max_workers=max_workers)
        span.add(success_count=sum(1 for r in results.values() if isinstance(r, dict)))
        return results


def map_urls(
    *,
    url: str,
    search: str | None = None,
    ignore_sitemap: bool = False,
    sitemap_only: bool = False,
    include_subdomains: bool = False,
    limit: int | None = None,
) -> list[str] | str:
    """Discover URLs from a website.

    Maps all accessible URLs from a site via sitemap and crawling.

    Args:
        url: The starting URL to map
        search: Optional search term to filter URLs
        ignore_sitemap: Skip sitemap discovery, only crawl (default: False)
        sitemap_only: Only use sitemap, no crawling (default: False)
        include_subdomains: Include URLs from subdomains (default: False)
        limit: Maximum number of URLs to return

    Returns:
        List of discovered URLs, or error message

    Example:
        # Map entire site
        firecrawl.map_urls(url="https://docs.python.org")

        # Search for specific pages
        firecrawl.map_urls(url="https://docs.python.org", search="asyncio")

        # Limit results
        firecrawl.map_urls(url="https://example.com", limit=100)
    """
    # Validate URL
    if url_error := _validate_url(url):
        return f"Error: {url_error}"

    with LogSpan(span="firecrawl.map_urls", url=url, search=search) as span:
        client = _get_client()
        if client is None:
            return "Error: FIRECRAWL_API_KEY secret not configured"

        try:
            # Build kwargs for v2 API
            kwargs: dict[str, Any] = {}

            if search:
                kwargs["search"] = search
            if ignore_sitemap:
                kwargs["sitemap"] = "skip"
            if sitemap_only:
                kwargs["sitemap"] = "only"
            if include_subdomains:
                kwargs["include_subdomains"] = True
            if limit:
                kwargs["limit"] = limit

            result = client.map(url, **kwargs)

            # Extract links list from result
            if isinstance(result, list):
                links = result
            else:
                # Handle MapResponse object
                links = getattr(result, "links", None) or []

            # Convert LinkResult objects to URL strings
            urls = []
            for link in links:
                if isinstance(link, str):
                    urls.append(link)
                elif hasattr(link, "url"):
                    urls.append(link.url)
                elif hasattr(link, "model_dump"):
                    urls.append(link.model_dump().get("url", str(link)))
                else:
                    urls.append(str(link))

            span.add(url_count=len(urls))
            return urls

        except Exception as e:
            error_msg = f"Map failed: {e}"
            span.add(error=str(e))
            return error_msg


def search(
    *,
    query: str,
    limit: int = 5,
    lang: str | None = None,
    country: str | None = None,
    scrape_options: dict[str, Any] | None = None,
) -> list[dict[str, Any]] | str:
    """Search the web and optionally scrape results.

    Performs web search with optional content retrieval for each result.

    Args:
        query: Search query string
        limit: Maximum number of results (default: 5)
        lang: Language code for results (e.g., "en")
        country: Country code for results (e.g., "US")
        scrape_options: Options for scraping result pages (see scrape() params)

    Returns:
        List of search results with optional scraped content, or error message

    Example:
        # Basic search
        firecrawl.search(query="Python async best practices")

        # Search with scraping
        firecrawl.search(
            query="machine learning tutorials",
            limit=3,
            scrape_options={"formats": ["markdown"]}
        )
    """
    with LogSpan(span="firecrawl.search", query=query, limit=limit) as span:
        client = _get_client()
        if client is None:
            return "Error: FIRECRAWL_API_KEY secret not configured"

        try:
            # Build kwargs for v2 API
            kwargs: dict[str, Any] = {"limit": limit}

            if lang:
                kwargs["lang"] = lang
            if country:
                kwargs["location"] = country
            if scrape_options:
                kwargs["scrape_options"] = scrape_options

            result = client.search(query, **kwargs)

            if isinstance(result, list):
                span.add(result_count=len(result))
                return [_to_dict(item) for item in result]
            # Handle SearchData object (v2 API returns .web, .news, .images)
            data = getattr(result, "web", None) or getattr(result, "data", None) or []
            span.add(result_count=len(data))
            return [_to_dict(item) for item in data]

        except Exception as e:
            error_msg = f"Search failed: {e}"
            span.add(error=str(e))
            return error_msg


def crawl(
    *,
    url: str,
    max_depth: int | None = None,
    limit: int | None = None,
    include_paths: list[str] | None = None,
    exclude_paths: list[str] | None = None,
    ignore_sitemap: bool = False,
    scrape_options: dict[str, Any] | None = None,
    webhook: str | None = None,
) -> dict[str, Any] | str:
    """Start an asynchronous multi-page crawl job.

    Crawls a website starting from the given URL. Returns immediately with
    a job ID. Use crawl_status() to poll for results.

    Args:
        url: The starting URL to crawl
        max_depth: Maximum link depth to crawl
        limit: Maximum number of pages to crawl
        include_paths: URL patterns to include (glob syntax)
        exclude_paths: URL patterns to exclude (glob syntax)
        ignore_sitemap: Skip sitemap discovery (default: False)
        scrape_options: Options for scraping pages (see scrape() params)
        webhook: URL to receive completion notification

    Returns:
        Dict with job ID and status URL, or error message

    Example:
        # Start a crawl
        job = firecrawl.crawl(url="https://docs.python.org", max_depth=2, limit=100)

        # Check status
        firecrawl.crawl_status(id=job["id"])
    """
    # Validate URL
    if url_error := _validate_url(url):
        return f"Error: {url_error}"

    with LogSpan(span="firecrawl.crawl", url=url, max_depth=max_depth, limit=limit) as span:
        client = _get_client()
        if client is None:
            return "Error: FIRECRAWL_API_KEY secret not configured"

        try:
            # Build kwargs for v2 API
            kwargs: dict[str, Any] = {}

            if max_depth is not None:
                kwargs["max_discovery_depth"] = max_depth
            if limit is not None:
                kwargs["limit"] = limit
            if include_paths:
                kwargs["include_paths"] = include_paths
            if exclude_paths:
                kwargs["exclude_paths"] = exclude_paths
            if ignore_sitemap:
                kwargs["ignore_sitemap"] = True
            if scrape_options:
                kwargs["scrape_options"] = scrape_options
            if webhook:
                kwargs["webhook"] = webhook

            result = client.crawl(url, **kwargs)

            # Convert to dict for consistent handling
            result_dict = _to_dict(result)

            # Extract job ID with multiple fallbacks
            job_id = (
                result_dict.get("id")
                or result_dict.get("jobId")
                or result_dict.get("job_id")
            )

            span.add(job_id=job_id)

            # If result already has data (sync completion), return as-is
            if result_dict.get("data"):
                return result_dict

            # Return normalized response with job info
            return {
                "id": job_id,
                "status": result_dict.get("status", "started"),
                "url": url,
            }

        except Exception as e:
            error_msg = f"Crawl failed: {e}"
            span.add(error=str(e))
            return error_msg


def crawl_status(
    *,
    id: str,
) -> dict[str, Any] | str:
    """Check the status of a crawl job.

    Polls the crawl job for current progress and results.

    Args:
        id: The crawl job ID returned by crawl()

    Returns:
        Dict with status, progress, and results (if complete), or error message

    Example:
        # Check crawl progress
        status = firecrawl.crawl_status(id="abc123")

        if status["status"] == "completed":
            for page in status["data"]:
                print(page["url"])
    """
    # Validate job ID
    if not id or not id.strip():
        return "Error: Job ID is required and cannot be empty"

    with LogSpan(span="firecrawl.crawl_status", job_id=id) as span:
        client = _get_client()
        if client is None:
            return "Error: FIRECRAWL_API_KEY secret not configured"

        try:
            result = client.get_crawl_status(id)

            if isinstance(result, dict):
                span.add(status=result.get("status"))
                return result

            # Handle CrawlStatusResponse object
            status = getattr(result, "status", "unknown")
            span.add(status=status)

            response: dict[str, Any] = {
                "id": id,
                "status": status,
            }

            # Add optional fields if present
            if hasattr(result, "completed"):
                response["completed"] = result.completed
            if hasattr(result, "total"):
                response["total"] = result.total
            if hasattr(result, "data"):
                response["data"] = result.data

            return response

        except Exception as e:
            error_msg = f"Status check failed: {e}"
            span.add(error=str(e))
            return error_msg


def extract(
    *,
    urls: list[str],
    prompt: str | None = None,
    schema: dict[str, Any] | None = None,
    system_prompt: str | None = None,
    allow_external_links: bool = False,
) -> dict[str, Any] | str:
    """Extract structured data from URLs using LLM.

    Uses an LLM to extract data matching a JSON schema from web pages.

    Args:
        urls: URLs to extract data from
        prompt: Natural language description of what to extract
        schema: JSON schema defining the structure of extracted data
            (OpenAI JSON schema format)
        system_prompt: Custom system prompt for the LLM
        allow_external_links: Follow external links during extraction (default: False)

    Returns:
        Dict with extracted data matching schema, or error message

    Example:
        # Extract with prompt
        firecrawl.extract(
            urls=["https://example.com/products"],
            prompt="Extract product names and prices"
        )

        # Extract with schema
        firecrawl.extract(
            urls=["https://example.com/team"],
            schema={
                "type": "object",
                "properties": {
                    "team_members": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "name": {"type": "string"},
                                "role": {"type": "string"}
                            }
                        }
                    }
                }
            }
        )
    """
    with LogSpan(span="firecrawl.extract", url_count=len(urls)) as span:
        client = _get_client()
        if client is None:
            return "Error: FIRECRAWL_API_KEY secret not configured"

        if not prompt and not schema:
            return "Error: Either prompt or schema is required"

        try:
            # Build kwargs for v2 API
            kwargs: dict[str, Any] = {}

            if prompt:
                kwargs["prompt"] = prompt
            if schema:
                kwargs["schema"] = schema
            if system_prompt:
                kwargs["system_prompt"] = system_prompt
            if allow_external_links:
                kwargs["allow_external_links"] = True

            result = client.extract(urls, **kwargs)

            if isinstance(result, dict):
                span.add(success=True)
                return result

            # Handle ExtractResponse object
            data = getattr(result, "data", None)
            span.add(success=True)
            return {"data": data} if data else result

        except Exception as e:
            error_msg = f"Extract failed: {e}"
            span.add(error=str(e))
            return error_msg


def deep_research(
    *,
    prompt: str,
    urls: list[str] | None = None,
    timeout: int | None = None,
    max_credits: int | None = None,
) -> dict[str, Any] | str:
    """Run autonomous deep research on a topic.

    Launches an AI agent that autonomously researches a topic by
    searching, crawling, and synthesizing information from the web.

    Args:
        prompt: Research question or topic
        urls: Starting URLs to research (optional, will search if not provided)
        timeout: Time limit in seconds for the research
        max_credits: Maximum credits to spend on research

    Returns:
        Dict with research results and sources, or error message

    Example:
        # Research a topic
        firecrawl.deep_research(
            prompt="What are the latest developments in quantum computing?",
            timeout=300
        )

        # Research from specific sources
        firecrawl.deep_research(
            prompt="Compare pricing models",
            urls=["https://company1.com/pricing", "https://company2.com/pricing"]
        )
    """
    with LogSpan(span="firecrawl.deep_research", prompt=prompt[:100]) as span:
        client = _get_client()
        if client is None:
            return "Error: FIRECRAWL_API_KEY secret not configured"

        try:
            # Build kwargs for v2 API (uses 'agent' method)
            kwargs: dict[str, Any] = {"prompt": prompt}

            if urls:
                kwargs["urls"] = urls
            if timeout is not None:
                kwargs["timeout"] = timeout
            if max_credits is not None:
                kwargs["max_credits"] = max_credits

            # The SDK's agent method corresponds to deep research
            result = client.agent(**kwargs)

            if isinstance(result, dict):
                span.add(success=True)
                return result

            # Handle response object
            data = getattr(result, "data", None)
            sources = getattr(result, "sources", None)
            span.add(success=True, source_count=len(sources) if sources else 0)
            return {
                "data": data,
                "sources": sources,
            }

        except Exception as e:
            error_msg = f"Deep research failed: {e}"
            span.add(error=str(e))
            return error_msg
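
For orientation, a minimal usage sketch built only from the docstrings above. It assumes the wheel and the firecrawl SDK are installed, that the FIRECRAWL_API_KEY secret is configured, and that the module is imported directly as ot_tools.firecrawl (inside OneTool the same functions are exposed through the firecrawl.* dot notation). The example URLs and the poll interval are placeholders; error handling follows the pack's convention of returning an "Error: ..." string instead of raising.

import time

from ot_tools import firecrawl

# Single-page scrape: a dict on success, an "Error: ..." string on failure.
page = firecrawl.scrape(url="https://example.com", formats=["markdown", "links"])
if isinstance(page, str):
    print(page)  # error message
else:
    print(sorted(page.keys()))

# Start an asynchronous crawl, then poll crawl_status() until it finishes.
job = firecrawl.crawl(url="https://docs.python.org", max_depth=2, limit=50)
if isinstance(job, dict) and job.get("id"):
    while True:
        status = firecrawl.crawl_status(id=job["id"])
        if isinstance(status, str):  # error string
            print(status)
            break
        if status.get("status") == "completed":
            for item in status.get("data", []):
                print(item)
            break
        time.sleep(5)  # poll interval chosen arbitrarily for the sketch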