firecrawl-sdk 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +197 -0
- data/lib/firecrawl/client.rb +396 -0
- data/lib/firecrawl/errors.rb +40 -0
- data/lib/firecrawl/http_client.rb +136 -0
- data/lib/firecrawl/models/agent_options.rb +33 -0
- data/lib/firecrawl/models/agent_response.rb +20 -0
- data/lib/firecrawl/models/agent_status_response.rb +25 -0
- data/lib/firecrawl/models/batch_scrape_job.rb +31 -0
- data/lib/firecrawl/models/batch_scrape_options.rb +31 -0
- data/lib/firecrawl/models/batch_scrape_response.rb +20 -0
- data/lib/firecrawl/models/concurrency_check.rb +19 -0
- data/lib/firecrawl/models/crawl_job.rb +31 -0
- data/lib/firecrawl/models/crawl_options.rb +53 -0
- data/lib/firecrawl/models/crawl_response.rb +19 -0
- data/lib/firecrawl/models/credit_usage.rb +22 -0
- data/lib/firecrawl/models/document.rb +36 -0
- data/lib/firecrawl/models/map_data.rb +27 -0
- data/lib/firecrawl/models/map_options.rb +32 -0
- data/lib/firecrawl/models/scrape_options.rb +45 -0
- data/lib/firecrawl/models/search_data.rb +23 -0
- data/lib/firecrawl/models/search_options.rb +33 -0
- data/lib/firecrawl/version.rb +5 -0
- data/lib/firecrawl.rb +23 -0
- metadata +72 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 7266e8ff84ad11eebc0312025933594af43172061ea8d2f5959b65d98eb34f64
|
|
4
|
+
data.tar.gz: 48abbfb695f5f9e688e9b02fe5120cfdb6c55bc9c1c22398fd6bbb32582d606e
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 0b69ffbc921e023aba67107a388b44aa90f1f479f9c62ad494734536beea0e831be9b6988ad50132fcfecce39250745b9e38edf37ef22ac1f7f2fecae761615b
|
|
7
|
+
data.tar.gz: 48f082ce92fb3bc1f6c48a4cdef5b3d2f4a074185fd2e87d3f661a0983835bf60f16c085b118cd134d132afd418e8bad21b54c17faa2c507261bbe7620cf9e00
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Firecrawl
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
# Firecrawl Ruby SDK
|
|
2
|
+
|
|
3
|
+
Ruby SDK for the [Firecrawl](https://firecrawl.dev) v2 web scraping API.
|
|
4
|
+
|
|
5
|
+
## Prerequisites
|
|
6
|
+
|
|
7
|
+
- Ruby >= 3.0
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
Add to your `Gemfile`:
|
|
12
|
+
|
|
13
|
+
```ruby
|
|
14
|
+
gem "firecrawl-sdk", "~> 1.0"
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Or install directly:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
gem install firecrawl-sdk
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
```ruby
|
|
26
|
+
require "firecrawl"
|
|
27
|
+
|
|
28
|
+
# Create a client
|
|
29
|
+
client = Firecrawl::Client.new(api_key: "fc-your-api-key")
|
|
30
|
+
|
|
31
|
+
# Or load from FIRECRAWL_API_KEY environment variable
|
|
32
|
+
client = Firecrawl::Client.from_env
|
|
33
|
+
|
|
34
|
+
# Scrape a single page
|
|
35
|
+
doc = client.scrape("https://example.com")
|
|
36
|
+
puts doc.markdown
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Environment Setup
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
export FIRECRAWL_API_KEY="fc-your-api-key"
|
|
43
|
+
# Optional: custom API URL
|
|
44
|
+
export FIRECRAWL_API_URL="http://localhost:3002"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## API Reference
|
|
48
|
+
|
|
49
|
+
### Scrape
|
|
50
|
+
|
|
51
|
+
```ruby
|
|
52
|
+
# Basic scrape
|
|
53
|
+
doc = client.scrape("https://example.com")
|
|
54
|
+
puts doc.markdown
|
|
55
|
+
|
|
56
|
+
# Scrape with options
|
|
57
|
+
doc = client.scrape("https://example.com",
|
|
58
|
+
Firecrawl::Models::ScrapeOptions.new(
|
|
59
|
+
formats: ["markdown", "html"],
|
|
60
|
+
only_main_content: true,
|
|
61
|
+
wait_for: 1000
|
|
62
|
+
))
|
|
63
|
+
puts doc.html
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Crawl
|
|
67
|
+
|
|
68
|
+
```ruby
|
|
69
|
+
# Crawl with auto-polling (blocks until complete)
|
|
70
|
+
job = client.crawl("https://example.com",
|
|
71
|
+
Firecrawl::Models::CrawlOptions.new(limit: 50))
|
|
72
|
+
job.data.each { |doc| puts doc.markdown }
|
|
73
|
+
|
|
74
|
+
# Async crawl
|
|
75
|
+
response = client.start_crawl("https://example.com",
|
|
76
|
+
Firecrawl::Models::CrawlOptions.new(limit: 10))
|
|
77
|
+
puts response.id
|
|
78
|
+
|
|
79
|
+
# Check status
|
|
80
|
+
status = client.get_crawl_status(response.id)
|
|
81
|
+
puts status.status
|
|
82
|
+
|
|
83
|
+
# Cancel
|
|
84
|
+
client.cancel_crawl(response.id)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Batch Scrape
|
|
88
|
+
|
|
89
|
+
```ruby
|
|
90
|
+
urls = ["https://example.com/page1", "https://example.com/page2"]
|
|
91
|
+
|
|
92
|
+
# Batch scrape with auto-polling
|
|
93
|
+
job = client.batch_scrape(urls,
|
|
94
|
+
Firecrawl::Models::BatchScrapeOptions.new(
|
|
95
|
+
options: Firecrawl::Models::ScrapeOptions.new(formats: ["markdown"])
|
|
96
|
+
))
|
|
97
|
+
job.data.each { |doc| puts doc.markdown }
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Map
|
|
101
|
+
|
|
102
|
+
```ruby
|
|
103
|
+
# Discover URLs on a website
|
|
104
|
+
result = client.map("https://example.com")
|
|
105
|
+
result.links.each { |link| puts link["url"] }
|
|
106
|
+
|
|
107
|
+
# With options
|
|
108
|
+
result = client.map("https://example.com",
|
|
109
|
+
Firecrawl::Models::MapOptions.new(limit: 100, search: "blog"))
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Search
|
|
113
|
+
|
|
114
|
+
```ruby
|
|
115
|
+
# Web search
|
|
116
|
+
results = client.search("firecrawl web scraping")
|
|
117
|
+
results.web&.each { |r| puts r["url"] }
|
|
118
|
+
|
|
119
|
+
# With options
|
|
120
|
+
results = client.search("latest news",
|
|
121
|
+
Firecrawl::Models::SearchOptions.new(limit: 5, location: "US"))
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Agent
|
|
125
|
+
|
|
126
|
+
```ruby
|
|
127
|
+
# Run an AI agent task (blocks until complete)
|
|
128
|
+
status = client.agent(
|
|
129
|
+
Firecrawl::Models::AgentOptions.new(
|
|
130
|
+
prompt: "Find the pricing information",
|
|
131
|
+
urls: ["https://example.com"]
|
|
132
|
+
))
|
|
133
|
+
puts status.data
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### Usage & Metrics
|
|
137
|
+
|
|
138
|
+
```ruby
|
|
139
|
+
# Check concurrency
|
|
140
|
+
concurrency = client.get_concurrency
|
|
141
|
+
puts concurrency.concurrency
|
|
142
|
+
|
|
143
|
+
# Check credit usage
|
|
144
|
+
usage = client.get_credit_usage
|
|
145
|
+
puts usage.remaining_credits
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## Configuration
|
|
149
|
+
|
|
150
|
+
```ruby
|
|
151
|
+
client = Firecrawl::Client.new(
|
|
152
|
+
api_key: "fc-your-api-key",
|
|
153
|
+
api_url: "https://api.firecrawl.dev", # custom API URL
|
|
154
|
+
timeout: 300, # HTTP timeout in seconds
|
|
155
|
+
max_retries: 3, # automatic retries
|
|
156
|
+
backoff_factor: 0.5 # exponential backoff factor
|
|
157
|
+
)
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## Error Handling
|
|
161
|
+
|
|
162
|
+
```ruby
|
|
163
|
+
begin
|
|
164
|
+
doc = client.scrape("https://example.com")
|
|
165
|
+
rescue Firecrawl::AuthenticationError => e
|
|
166
|
+
puts "Invalid API key: #{e.message}"
|
|
167
|
+
rescue Firecrawl::RateLimitError => e
|
|
168
|
+
puts "Rate limited: #{e.message}"
|
|
169
|
+
rescue Firecrawl::JobTimeoutError => e
|
|
170
|
+
puts "Job #{e.job_id} timed out after #{e.timeout_seconds}s"
|
|
171
|
+
rescue Firecrawl::FirecrawlError => e
|
|
172
|
+
puts "Error (#{e.status_code}): #{e.message}"
|
|
173
|
+
end
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## Development
|
|
177
|
+
|
|
178
|
+
### Building from Source
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
cd apps/ruby-sdk
|
|
182
|
+
bundle install
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### Running Tests
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
# Unit tests
|
|
189
|
+
bundle exec rake test
|
|
190
|
+
|
|
191
|
+
# With API key for E2E tests
|
|
192
|
+
FIRECRAWL_API_KEY=fc-your-key bundle exec rake test
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## License
|
|
196
|
+
|
|
197
|
+
MIT License - see [LICENSE](LICENSE).
|
data/lib/firecrawl/client.rb
ADDED
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Firecrawl
|
|
4
|
+
# Client for the Firecrawl v2 API.
|
|
5
|
+
#
|
|
6
|
+
# @example Quick start
|
|
7
|
+
# client = Firecrawl::Client.new(api_key: "fc-your-api-key")
|
|
8
|
+
#
|
|
9
|
+
# # Scrape a single page
|
|
10
|
+
# doc = client.scrape("https://example.com",
|
|
11
|
+
# Firecrawl::Models::ScrapeOptions.new(formats: ["markdown"]))
|
|
12
|
+
#
|
|
13
|
+
# # Crawl a website
|
|
14
|
+
# job = client.crawl("https://example.com",
|
|
15
|
+
# Firecrawl::Models::CrawlOptions.new(limit: 50))
|
|
16
|
+
class Client
|
|
17
|
+
DEFAULT_API_URL = "https://api.firecrawl.dev"
|
|
18
|
+
DEFAULT_TIMEOUT = 300 # seconds
|
|
19
|
+
DEFAULT_MAX_RETRIES = 3
|
|
20
|
+
DEFAULT_BACKOFF_FACTOR = 0.5
|
|
21
|
+
DEFAULT_POLL_INTERVAL = 2 # seconds
|
|
22
|
+
DEFAULT_JOB_TIMEOUT = 300 # seconds
|
|
23
|
+
|
|
24
|
+
# Creates a new Firecrawl client.
|
|
25
|
+
#
|
|
26
|
+
# @param api_key [String, nil] API key (falls back to FIRECRAWL_API_KEY env var)
|
|
27
|
+
# @param api_url [String] API base URL
|
|
28
|
+
# @param timeout [Integer] HTTP request timeout in seconds
|
|
29
|
+
# @param max_retries [Integer] maximum automatic retries for transient failures
|
|
30
|
+
# @param backoff_factor [Float] exponential backoff factor in seconds
|
|
31
|
+
def initialize(
  api_key: nil,
  api_url: nil,
  timeout: DEFAULT_TIMEOUT,
  max_retries: DEFAULT_MAX_RETRIES,
  backoff_factor: DEFAULT_BACKOFF_FACTOR
)
  # Strip once up front so the value that passes validation is exactly the
  # value handed to HttpClient; previously a key with surrounding whitespace
  # or a trailing newline was validated stripped but forwarded unstripped.
  resolved_key = (api_key || ENV["FIRECRAWL_API_KEY"])&.strip
  if resolved_key.nil? || resolved_key.empty?
    raise FirecrawlError, "API key is required. Provide api_key: or set FIRECRAWL_API_KEY environment variable."
  end

  # Precedence: explicit argument, then FIRECRAWL_API_URL, then the default.
  resolved_url = api_url || ENV["FIRECRAWL_API_URL"] || DEFAULT_API_URL
  unless resolved_url.match?(%r{\Ahttps?://}i)
    raise FirecrawlError, "API URL must be a fully qualified HTTP or HTTPS URL (got: #{resolved_url})."
  end

  @http = HttpClient.new(
    api_key: resolved_key,
    base_url: resolved_url,
    timeout: timeout,
    max_retries: max_retries,
    backoff_factor: backoff_factor
  )
end
|
|
56
|
+
|
|
57
|
+
# Creates a client from the FIRECRAWL_API_KEY environment variable.
|
|
58
|
+
#
|
|
59
|
+
# @return [Client]
|
|
60
|
+
def self.from_env
  # No arguments are passed on purpose: the constructor itself falls back to
  # FIRECRAWL_API_KEY (and FIRECRAWL_API_URL) when its keywords are nil.
  new
end
|
|
63
|
+
|
|
64
|
+
# ================================================================
|
|
65
|
+
# SCRAPE
|
|
66
|
+
# ================================================================
|
|
67
|
+
|
|
68
|
+
# Scrapes a single URL and returns the document.
|
|
69
|
+
#
|
|
70
|
+
# @param url [String] the URL to scrape
|
|
71
|
+
# @param options [Models::ScrapeOptions, nil] scrape configuration
|
|
72
|
+
# @return [Models::Document]
|
|
73
|
+
def scrape(url, options = nil)
  # Reject blank as well as nil (nil.to_s is ""), matching how #initialize
  # treats a blank API key, so an empty URL fails before any request is made.
  raise ArgumentError, "URL is required" if url.to_s.strip.empty?

  body = { "url" => url }
  body.merge!(options.to_h) if options
  raw = @http.post("/v2/scrape", body)
  # Prefer the "data" envelope when the response has one; otherwise the whole
  # response is treated as the document payload.
  Models::Document.new(raw["data"] || raw)
end
|
|
82
|
+
|
|
83
|
+
# Interacts with the scrape-bound browser session for a scrape job.
|
|
84
|
+
#
|
|
85
|
+
# @param job_id [String] the scrape job ID
|
|
86
|
+
# @param code [String] the code to execute
|
|
87
|
+
# @param language [String] "python", "node", or "bash" (default: "node")
|
|
88
|
+
# @param timeout [Integer, nil] execution timeout in seconds (1-300)
|
|
89
|
+
# @return [Hash] execution result with stdout, stderr, exit_code
|
|
90
|
+
def interact(job_id, code, language: "node", timeout: nil)
  # Blank values are rejected along with nil: an empty job ID would otherwise
  # be interpolated into the path as "/v2/scrape//interact".
  raise ArgumentError, "Job ID is required" if job_id.to_s.strip.empty?
  raise ArgumentError, "Code is required" if code.to_s.strip.empty?

  body = { "code" => code, "language" => language }
  # Only send "timeout" when the caller supplied one.
  body["timeout"] = timeout if timeout
  @http.post("/v2/scrape/#{job_id}/interact", body)
end
|
|
98
|
+
|
|
99
|
+
# Stops the interactive browser session for a scrape job.
|
|
100
|
+
#
|
|
101
|
+
# @param job_id [String] the scrape job ID
|
|
102
|
+
# @return [Hash] stop response
|
|
103
|
+
def stop_interactive_browser(job_id)
  # nil.to_s is "", so this covers nil, empty and whitespace-only IDs; an empty
  # ID would otherwise produce the path "/v2/scrape//interact".
  raise ArgumentError, "Job ID is required" if job_id.to_s.strip.empty?

  @http.delete("/v2/scrape/#{job_id}/interact")
end
|
|
108
|
+
|
|
109
|
+
# ================================================================
|
|
110
|
+
# CRAWL
|
|
111
|
+
# ================================================================
|
|
112
|
+
|
|
113
|
+
# Starts an async crawl job and returns immediately.
|
|
114
|
+
#
|
|
115
|
+
# @param url [String] the URL to start crawling from
|
|
116
|
+
# @param options [Models::CrawlOptions, nil] crawl configuration
|
|
117
|
+
# @return [Models::CrawlResponse]
|
|
118
|
+
def start_crawl(url, options = nil)
  # Blank URLs are rejected together with nil so the mistake is reported
  # locally instead of after a round trip.
  raise ArgumentError, "URL is required" if url.to_s.strip.empty?

  body = { "url" => url }
  body.merge!(options.to_h) if options
  Models::CrawlResponse.new(@http.post("/v2/crawl", body))
end
|
|
126
|
+
|
|
127
|
+
# Gets the status and results of a crawl job.
|
|
128
|
+
#
|
|
129
|
+
# @param job_id [String] the crawl job ID
|
|
130
|
+
# @return [Models::CrawlJob]
|
|
131
|
+
def get_crawl_status(job_id)
  # Covers nil, "" and whitespace-only IDs; an empty ID would otherwise be
  # requested as "/v2/crawl/".
  raise ArgumentError, "Job ID is required" if job_id.to_s.strip.empty?

  Models::CrawlJob.new(@http.get("/v2/crawl/#{job_id}"))
end
|
|
137
|
+
|
|
138
|
+
# Crawls a website and waits for completion (auto-polling).
|
|
139
|
+
#
|
|
140
|
+
# @param url [String] the URL to crawl
|
|
141
|
+
# @param options [Models::CrawlOptions, nil] crawl configuration
|
|
142
|
+
# @param poll_interval [Integer] seconds between status checks
|
|
143
|
+
# @param timeout [Integer] maximum seconds to wait
|
|
144
|
+
# @return [Models::CrawlJob]
|
|
145
|
+
def crawl(url, options = nil, poll_interval: DEFAULT_POLL_INTERVAL, timeout: DEFAULT_JOB_TIMEOUT)
  started = start_crawl(url, options)
  # Same guard #agent applies: without it a missing ID only surfaced during
  # polling as an ArgumentError ("Job ID is required"), which pointed at the
  # caller rather than at the start response.
  raise FirecrawlError, "Crawl start did not return a job ID" if started.id.nil?

  poll_crawl(started.id, poll_interval, timeout)
end
|
|
149
|
+
|
|
150
|
+
# Cancels a running crawl job.
|
|
151
|
+
#
|
|
152
|
+
# @param job_id [String] the crawl job ID
|
|
153
|
+
# @return [Hash]
|
|
154
|
+
def cancel_crawl(job_id)
  # Covers nil, "" and whitespace-only IDs; an empty ID would otherwise issue
  # a DELETE against "/v2/crawl/".
  raise ArgumentError, "Job ID is required" if job_id.to_s.strip.empty?

  @http.delete("/v2/crawl/#{job_id}")
end
|
|
159
|
+
|
|
160
|
+
# Gets errors from a crawl job.
|
|
161
|
+
#
|
|
162
|
+
# @param job_id [String] the crawl job ID
|
|
163
|
+
# @return [Hash]
|
|
164
|
+
def get_crawl_errors(job_id)
  # Covers nil, "" and whitespace-only IDs; an empty ID would otherwise be
  # requested as "/v2/crawl//errors".
  raise ArgumentError, "Job ID is required" if job_id.to_s.strip.empty?

  @http.get("/v2/crawl/#{job_id}/errors")
end
|
|
169
|
+
|
|
170
|
+
# ================================================================
|
|
171
|
+
# BATCH SCRAPE
|
|
172
|
+
# ================================================================
|
|
173
|
+
|
|
174
|
+
# Starts an async batch scrape job.
|
|
175
|
+
#
|
|
176
|
+
# @param urls [Array<String>] the URLs to scrape
|
|
177
|
+
# @param options [Models::BatchScrapeOptions, nil] batch scrape configuration
|
|
178
|
+
# @return [Models::BatchScrapeResponse]
|
|
179
|
+
def start_batch_scrape(urls, options = nil)
  # An empty list is as unusable as nil, so reject both before any request.
  if urls.nil? || (urls.respond_to?(:empty?) && urls.empty?)
    raise ArgumentError, "URLs list is required"
  end

  body = { "urls" => urls }
  extra_headers = {}
  if options
    # Work on a copy: the deletes below must not alter a hash that the
    # options object may still own.
    opts_hash = options.to_h.dup

    # The idempotency key travels as a header, not in the body.
    idempotency_key = options.idempotency_key
    if idempotency_key && !idempotency_key.empty?
      extra_headers["x-idempotency-key"] = idempotency_key
    end
    # NOTE(review): assumes #to_h would serialize the key as "idempotencyKey";
    # this is a no-op if it is already omitted — confirm against BatchScrapeOptions#to_h.
    opts_hash.delete("idempotencyKey")

    # Flatten nested scrape options to top level (API expects this).
    nested = opts_hash.delete("options")
    body.merge!(opts_hash)
    body.merge!(nested) if nested
  end
  raw = @http.post("/v2/batch/scrape", body, extra_headers: extra_headers)
  Models::BatchScrapeResponse.new(raw)
end
|
|
200
|
+
|
|
201
|
+
# Gets the status and results of a batch scrape job.
|
|
202
|
+
#
|
|
203
|
+
# @param job_id [String] the batch scrape job ID
|
|
204
|
+
# @return [Models::BatchScrapeJob]
|
|
205
|
+
def get_batch_scrape_status(job_id)
  # Covers nil, "" and whitespace-only IDs; an empty ID would otherwise be
  # requested as "/v2/batch/scrape/".
  raise ArgumentError, "Job ID is required" if job_id.to_s.strip.empty?

  Models::BatchScrapeJob.new(@http.get("/v2/batch/scrape/#{job_id}"))
end
|
|
211
|
+
|
|
212
|
+
# Batch-scrapes URLs and waits for completion (auto-polling).
|
|
213
|
+
#
|
|
214
|
+
# @param urls [Array<String>] the URLs to scrape
|
|
215
|
+
# @param options [Models::BatchScrapeOptions, nil] batch scrape configuration
|
|
216
|
+
# @param poll_interval [Integer] seconds between status checks
|
|
217
|
+
# @param timeout [Integer] maximum seconds to wait
|
|
218
|
+
# @return [Models::BatchScrapeJob]
|
|
219
|
+
def batch_scrape(urls, options = nil, poll_interval: DEFAULT_POLL_INTERVAL, timeout: DEFAULT_JOB_TIMEOUT)
  started = start_batch_scrape(urls, options)
  # Same guard #agent applies: without it a missing ID only surfaced during
  # polling as an ArgumentError ("Job ID is required").
  raise FirecrawlError, "Batch scrape start did not return a job ID" if started.id.nil?

  poll_batch_scrape(started.id, poll_interval, timeout)
end
|
|
223
|
+
|
|
224
|
+
# Cancels a running batch scrape job.
|
|
225
|
+
#
|
|
226
|
+
# @param job_id [String] the batch scrape job ID
|
|
227
|
+
# @return [Hash]
|
|
228
|
+
def cancel_batch_scrape(job_id)
  # Covers nil, "" and whitespace-only IDs; an empty ID would otherwise issue
  # a DELETE against "/v2/batch/scrape/".
  raise ArgumentError, "Job ID is required" if job_id.to_s.strip.empty?

  @http.delete("/v2/batch/scrape/#{job_id}")
end
|
|
233
|
+
|
|
234
|
+
# ================================================================
|
|
235
|
+
# MAP
|
|
236
|
+
# ================================================================
|
|
237
|
+
|
|
238
|
+
# Discovers URLs on a website.
|
|
239
|
+
#
|
|
240
|
+
# @param url [String] the URL to map
|
|
241
|
+
# @param options [Models::MapOptions, nil] map configuration
|
|
242
|
+
# @return [Models::MapData]
|
|
243
|
+
def map(url, options = nil)
  # Blank URLs are rejected together with nil so the mistake is reported
  # locally instead of after a round trip.
  raise ArgumentError, "URL is required" if url.to_s.strip.empty?

  body = { "url" => url }
  body.merge!(options.to_h) if options
  raw = @http.post("/v2/map", body)
  # Prefer the "data" envelope when the response has one; otherwise the whole
  # response is passed to the model.
  Models::MapData.new(raw["data"] || raw)
end
|
|
252
|
+
|
|
253
|
+
# ================================================================
|
|
254
|
+
# SEARCH
|
|
255
|
+
# ================================================================
|
|
256
|
+
|
|
257
|
+
# Performs a web search.
|
|
258
|
+
#
|
|
259
|
+
# @param query [String] the search query
|
|
260
|
+
# @param options [Models::SearchOptions, nil] search configuration
|
|
261
|
+
# @return [Models::SearchData]
|
|
262
|
+
def search(query, options = nil)
  # A blank query is rejected together with nil so the mistake is reported
  # locally instead of after a round trip.
  raise ArgumentError, "Query is required" if query.to_s.strip.empty?

  body = { "query" => query }
  body.merge!(options.to_h) if options
  raw = @http.post("/v2/search", body)
  # Prefer the "data" envelope when the response has one; otherwise the whole
  # response is passed to the model.
  Models::SearchData.new(raw["data"] || raw)
end
|
|
271
|
+
|
|
272
|
+
# ================================================================
|
|
273
|
+
# AGENT
|
|
274
|
+
# ================================================================
|
|
275
|
+
|
|
276
|
+
# Starts an async agent task.
|
|
277
|
+
#
|
|
278
|
+
# @param options [Models::AgentOptions] agent configuration
|
|
279
|
+
# @return [Models::AgentResponse]
|
|
280
|
+
def start_agent(options)
  raise ArgumentError, "Agent options are required" if options.nil?

  # The serialized options are the entire request body; the response is
  # wrapped as-is.
  Models::AgentResponse.new(@http.post("/v2/agent", options.to_h))
end
|
|
286
|
+
|
|
287
|
+
# Gets the status of an agent task.
|
|
288
|
+
#
|
|
289
|
+
# @param job_id [String] the agent job ID
|
|
290
|
+
# @return [Models::AgentStatusResponse]
|
|
291
|
+
def get_agent_status(job_id)
  # Covers nil, "" and whitespace-only IDs; an empty ID would otherwise be
  # requested as "/v2/agent/".
  raise ArgumentError, "Job ID is required" if job_id.to_s.strip.empty?

  Models::AgentStatusResponse.new(@http.get("/v2/agent/#{job_id}"))
end
|
|
297
|
+
|
|
298
|
+
# Runs an agent task and waits for completion (auto-polling).
|
|
299
|
+
#
|
|
300
|
+
# @param options [Models::AgentOptions] agent configuration
|
|
301
|
+
# @param poll_interval [Integer] seconds between status checks
|
|
302
|
+
# @param timeout [Integer] maximum seconds to wait
|
|
303
|
+
# @return [Models::AgentStatusResponse]
|
|
304
|
+
def agent(options, poll_interval: DEFAULT_POLL_INTERVAL, timeout: DEFAULT_JOB_TIMEOUT)
  start = start_agent(options)
  raise FirecrawlError, "Agent start did not return a job ID" if start.id.nil?

  # Track the deadline on the monotonic clock: it is unaffected by wall-clock
  # adjustments (NTP steps, manual changes), which could otherwise cut the
  # wait short or stretch it.
  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + timeout
  while Process.clock_gettime(Process::CLOCK_MONOTONIC) < deadline
    status = get_agent_status(start.id)
    return status if status.done?

    sleep(poll_interval)
  end
  raise JobTimeoutError.new(start.id, timeout, "Agent")
end
|
|
317
|
+
|
|
318
|
+
# Cancels a running agent task.
|
|
319
|
+
#
|
|
320
|
+
# @param job_id [String] the agent job ID
|
|
321
|
+
# @return [Hash]
|
|
322
|
+
def cancel_agent(job_id)
  # Covers nil, "" and whitespace-only IDs; an empty ID would otherwise issue
  # a DELETE against "/v2/agent/".
  raise ArgumentError, "Job ID is required" if job_id.to_s.strip.empty?

  @http.delete("/v2/agent/#{job_id}")
end
|
|
327
|
+
|
|
328
|
+
# ================================================================
|
|
329
|
+
# USAGE & METRICS
|
|
330
|
+
# ================================================================
|
|
331
|
+
|
|
332
|
+
# Gets current concurrency usage.
|
|
333
|
+
#
|
|
334
|
+
# @return [Models::ConcurrencyCheck]
|
|
335
|
+
def get_concurrency
  # The response is wrapped as-is; no unwrapping of a "data" envelope here.
  Models::ConcurrencyCheck.new(@http.get("/v2/concurrency-check"))
end
|
|
339
|
+
|
|
340
|
+
# Gets current credit usage.
|
|
341
|
+
#
|
|
342
|
+
# @return [Models::CreditUsage]
|
|
343
|
+
def get_credit_usage
  # The response is wrapped as-is; no unwrapping of a "data" envelope here.
  Models::CreditUsage.new(@http.get("/v2/team/credit-usage"))
end
|
|
347
|
+
|
|
348
|
+
private
|
|
349
|
+
|
|
350
|
+
def poll_crawl(job_id, poll_interval, timeout)
  # Polls the crawl status until the job reports done, then returns the job
  # after pagination; raises JobTimeoutError once `timeout` seconds elapse.
  #
  # The deadline uses the monotonic clock so wall-clock adjustments (NTP
  # steps, manual changes) cannot shorten or extend the wait.
  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + timeout
  while Process.clock_gettime(Process::CLOCK_MONOTONIC) < deadline
    job = get_crawl_status(job_id)
    return paginate_crawl(job) if job.done?

    sleep(poll_interval)
  end
  raise JobTimeoutError.new(job_id, timeout, "Crawl")
end
|
|
360
|
+
|
|
361
|
+
def poll_batch_scrape(job_id, poll_interval, timeout)
  # Polls the batch scrape status until the job reports done, then returns the
  # job after pagination; raises JobTimeoutError once `timeout` seconds elapse.
  #
  # The deadline uses the monotonic clock so wall-clock adjustments (NTP
  # steps, manual changes) cannot shorten or extend the wait.
  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + timeout
  while Process.clock_gettime(Process::CLOCK_MONOTONIC) < deadline
    job = get_batch_scrape_status(job_id)
    return paginate_batch_scrape(job) if job.done?

    sleep(poll_interval)
  end
  raise JobTimeoutError.new(job_id, timeout, "Batch scrape")
end
|
|
371
|
+
|
|
372
|
+
def paginate_crawl(job)
  # Follows next_url links from `job`, appending every later page's documents
  # to job.data, and returns the same (mutated) job object.
  job.data ||= []
  current = job
  while current.next_url && !current.next_url.empty?
    raw = @http.get_absolute(current.next_url)
    next_page = Models::CrawlJob.new(raw)
    # A page may carry no data at all (the first page is nil-guarded above for
    # the same reason); calling #empty? on nil used to raise NoMethodError.
    page_docs = next_page.data
    job.data.concat(page_docs) if page_docs && !page_docs.empty?
    current = next_page
  end
  job
end
|
|
383
|
+
|
|
384
|
+
def paginate_batch_scrape(job)
  # Follows next_url links from `job`, appending every later page's documents
  # to job.data, and returns the same (mutated) job object.
  job.data ||= []
  current = job
  while current.next_url && !current.next_url.empty?
    raw = @http.get_absolute(current.next_url)
    next_page = Models::BatchScrapeJob.new(raw)
    # A page may carry no data at all (the first page is nil-guarded above for
    # the same reason); calling #empty? on nil used to raise NoMethodError.
    page_docs = next_page.data
    job.data.concat(page_docs) if page_docs && !page_docs.empty?
    current = next_page
  end
  job
end
|
|
395
|
+
end
|
|
396
|
+
end
|
data/lib/firecrawl/errors.rb
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Firecrawl
  # Base error class for all Firecrawl SDK errors.
  #
  # Carries optional context alongside the message:
  # - status_code: HTTP status associated with the failure, if any
  # - error_code:  error identifier supplied by the raiser, if any
  # - details:     additional payload supplied by the raiser, if any
  class FirecrawlError < StandardError
    attr_reader :status_code, :error_code, :details

    # @param message [String, nil] error message
    # @param status_code [Integer, nil]
    # @param error_code [Object, nil]
    # @param details [Object, nil]
    def initialize(message = nil, status_code: nil, error_code: nil, details: nil)
      @status_code = status_code
      @error_code = error_code
      @details = details
      super(message)
    end
  end

  # Raised on 401 Unauthorized responses.
  class AuthenticationError < FirecrawlError
    # Accepts the same keywords as FirecrawlError so code that builds errors
    # generically (passing status_code:) works for this subclass as well;
    # status_code defaults to 401 when omitted or nil.
    def initialize(message = nil, status_code: 401, error_code: nil, details: nil)
      super(message, status_code: status_code || 401, error_code: error_code, details: details)
    end
  end

  # Raised on 429 Too Many Requests responses.
  class RateLimitError < FirecrawlError
    # Accepts the same keywords as FirecrawlError so code that builds errors
    # generically (passing status_code:) works for this subclass as well;
    # status_code defaults to 429 when omitted or nil.
    def initialize(message = nil, status_code: 429, error_code: nil, details: nil)
      super(message, status_code: status_code || 429, error_code: error_code, details: details)
    end
  end

  # Raised when an async job exceeds its timeout.
  class JobTimeoutError < FirecrawlError
    attr_reader :job_id, :timeout_seconds

    # @param job_id [String] ID of the job that timed out
    # @param timeout_seconds [Numeric] the timeout that was exceeded
    # @param label [String] job kind used as the message prefix (e.g. "Crawl")
    def initialize(job_id, timeout_seconds, label = "Job")
      @job_id = job_id
      @timeout_seconds = timeout_seconds
      # No HTTP context applies here, so status_code/error_code/details stay nil.
      super("#{label} #{job_id} timed out after #{timeout_seconds} seconds")
    end
  end
end
|