firecrawl-sdk 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +23 -0
- data/lib/firecrawl/client.rb +30 -0
- data/lib/firecrawl/http_client.rb +48 -1
- data/lib/firecrawl/models/parse_file.rb +61 -0
- data/lib/firecrawl/models/parse_options.rb +74 -0
- data/lib/firecrawl/models/scrape_options.rb +2 -1
- data/lib/firecrawl/version.rb +1 -1
- data/lib/firecrawl.rb +2 -0
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d3ba7b13956543224f92cc64a39113b60c4b1d97a13fe10e50716ce57e24c439
|
|
4
|
+
data.tar.gz: 4bb002101feeae8249ee7d441a3dc5306a4c07b39f44c83911cda233904439eb
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 43c4e2417a6180f128d76a1294a60ddc6cb2c19097776c7c26dc035e1c06fe910e0e43e151ed005f06a367c76ea2503cc9a824c555b0b5859661d35aeb0bdd2a
|
|
7
|
+
data.tar.gz: bb8f67d78fb04ff15a2c4e7c7daf3a3f6a5d57064b995ac045895b705d91b330cbbb92439ab0a741a8e48fc50c45c2a9f92c6b8705994b16915aca0d7940da73
|
data/README.md
CHANGED
|
@@ -63,6 +63,29 @@ doc = client.scrape("https://example.com",
|
|
|
63
63
|
puts doc.html
|
|
64
64
|
```
|
|
65
65
|
|
|
66
|
+
### Parse
|
|
67
|
+
|
|
68
|
+
Upload a local file (`html`, `pdf`, `docx`, etc.) via multipart form data and
|
|
69
|
+
parse it synchronously. Parse options intentionally exclude browser-only
|
|
70
|
+
features such as change tracking, screenshot, branding, actions, wait_for,
|
|
71
|
+
location, and mobile. The `proxy` option only accepts `"auto"` or `"basic"`.
|
|
72
|
+
|
|
73
|
+
```ruby
|
|
74
|
+
# From disk
|
|
75
|
+
file = Firecrawl::Models::ParseFile.from_path("./document.pdf")
|
|
76
|
+
|
|
77
|
+
# Or from memory
|
|
78
|
+
file = Firecrawl::Models::ParseFile.new(
|
|
79
|
+
filename: "upload.html",
|
|
80
|
+
content: "<html>hi</html>",
|
|
81
|
+
content_type: "text/html"
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
doc = client.parse(file,
|
|
85
|
+
Firecrawl::Models::ParseOptions.new(formats: ["markdown"]))
|
|
86
|
+
puts doc.markdown
|
|
87
|
+
```
|
|
88
|
+
|
|
66
89
|
### Crawl
|
|
67
90
|
|
|
68
91
|
```ruby
|
data/lib/firecrawl/client.rb
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
3
5
|
module Firecrawl
|
|
4
6
|
# Client for the Firecrawl v2 API.
|
|
5
7
|
#
|
|
@@ -106,6 +108,34 @@ module Firecrawl
|
|
|
106
108
|
@http.delete("/v2/scrape/#{job_id}/interact")
|
|
107
109
|
end
|
|
108
110
|
|
|
111
|
+
# ================================================================
|
|
112
|
+
# PARSE
|
|
113
|
+
# ================================================================
|
|
114
|
+
|
|
115
|
+
# Parses an uploaded file and returns the extracted document.
|
|
116
|
+
#
|
|
117
|
+
# @param file [Models::ParseFile] file payload to upload
|
|
118
|
+
# @param options [Models::ParseOptions, nil] parse configuration
|
|
119
|
+
# @return [Models::Document]
|
|
120
|
+
def parse(file, options = nil)
|
|
121
|
+
raise ArgumentError, "File is required" if file.nil?
|
|
122
|
+
unless file.is_a?(Models::ParseFile)
|
|
123
|
+
raise ArgumentError, "File must be a Firecrawl::Models::ParseFile"
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
options_hash = options.nil? ? {} : options.to_h
|
|
127
|
+
raw = @http.post_multipart(
|
|
128
|
+
"/v2/parse",
|
|
129
|
+
fields: { "options" => JSON.generate(options_hash) },
|
|
130
|
+
file_field: "file",
|
|
131
|
+
filename: file.filename,
|
|
132
|
+
content: file.content,
|
|
133
|
+
content_type: file.content_type,
|
|
134
|
+
)
|
|
135
|
+
data = raw["data"] || raw
|
|
136
|
+
Models::Document.new(data)
|
|
137
|
+
end
|
|
138
|
+
|
|
109
139
|
# ================================================================
|
|
110
140
|
# CRAWL
|
|
111
141
|
# ================================================================
|
data/lib/firecrawl/http_client.rb
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "net/http"
|
|
4
4
|
require "json"
|
|
5
|
+
require "securerandom"
|
|
5
6
|
require "uri"
|
|
6
7
|
|
|
7
8
|
module Firecrawl
|
|
@@ -58,9 +59,53 @@ module Firecrawl
|
|
|
58
59
|
execute_with_retry(uri, request)
|
|
59
60
|
end
|
|
60
61
|
|
|
62
|
+
# Sends a POST request with a multipart/form-data body.
|
|
63
|
+
#
|
|
64
|
+
# @param path [String] API path
|
|
65
|
+
# @param fields [Hash{String=>String}] additional form fields to include
|
|
66
|
+
# @param file_field [String] form field name for the file part (e.g. "file")
|
|
67
|
+
# @param filename [String] filename to send with the file part
|
|
68
|
+
# @param content [String] raw bytes for the file part
|
|
69
|
+
# @param content_type [String, nil] optional MIME type for the file part
|
|
70
|
+
def post_multipart(path, fields:, file_field:, filename:, content:, content_type: nil)
|
|
71
|
+
uri = URI("#{@base_url}#{path}")
|
|
72
|
+
boundary = "----FirecrawlBoundary#{SecureRandom.hex(16)}"
|
|
73
|
+
body = build_multipart_body(boundary, fields, file_field, filename, content, content_type)
|
|
74
|
+
|
|
75
|
+
builder = lambda do
|
|
76
|
+
request = Net::HTTP::Post.new(uri)
|
|
77
|
+
request["Authorization"] = "Bearer #{@api_key}"
|
|
78
|
+
request["Content-Type"] = "multipart/form-data; boundary=#{boundary}"
|
|
79
|
+
request.body = body
|
|
80
|
+
request
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
execute_with_retry(uri, builder.call, request_builder: builder)
|
|
84
|
+
end
|
|
85
|
+
|
|
61
86
|
private
|
|
62
87
|
|
|
63
|
-
def execute_with_retry(uri, request)
|
|
88
|
+
def build_multipart_body(boundary, fields, file_field, filename, content, content_type)
|
|
89
|
+
parts = +""
|
|
90
|
+
fields.each do |name, value|
|
|
91
|
+
parts << "--#{boundary}\r\n"
|
|
92
|
+
parts << %(Content-Disposition: form-data; name="#{name}"\r\n\r\n)
|
|
93
|
+
parts << value.to_s
|
|
94
|
+
parts << "\r\n"
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
parts << "--#{boundary}\r\n"
|
|
98
|
+
safe_file_field = file_field.to_s.gsub(/[\r\n"]/, "_")
|
|
99
|
+
safe_filename = filename.to_s.gsub(/[\r\n"]/, "_")
|
|
100
|
+
parts << %(Content-Disposition: form-data; name="#{safe_file_field}"; filename="#{safe_filename}"\r\n)
|
|
101
|
+
parts << "Content-Type: #{content_type || "application/octet-stream"}\r\n\r\n"
|
|
102
|
+
parts.force_encoding(Encoding::ASCII_8BIT)
|
|
103
|
+
parts << content.to_s.dup.force_encoding(Encoding::ASCII_8BIT)
|
|
104
|
+
parts << "\r\n--#{boundary}--\r\n"
|
|
105
|
+
parts
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
def execute_with_retry(uri, request, request_builder: nil)
|
|
64
109
|
attempt = 0
|
|
65
110
|
loop do
|
|
66
111
|
response = perform_request(uri, request)
|
|
@@ -89,6 +134,7 @@ module Firecrawl
|
|
|
89
134
|
if attempt < @max_retries
|
|
90
135
|
attempt += 1
|
|
91
136
|
sleep_with_backoff(attempt)
|
|
137
|
+
request = request_builder.call if request_builder
|
|
92
138
|
next
|
|
93
139
|
end
|
|
94
140
|
|
|
@@ -98,6 +144,7 @@ module Firecrawl
|
|
|
98
144
|
if attempt < @max_retries
|
|
99
145
|
attempt += 1
|
|
100
146
|
sleep_with_backoff(attempt)
|
|
147
|
+
request = request_builder.call if request_builder
|
|
101
148
|
retry
|
|
102
149
|
end
|
|
103
150
|
raise FirecrawlError.new("Request failed: #{e.message}")
|
data/lib/firecrawl/models/parse_file.rb
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Firecrawl
|
|
4
|
+
module Models
|
|
5
|
+
# Binary upload payload for the `/v2/parse` endpoint.
|
|
6
|
+
#
|
|
7
|
+
# Supported file extensions: .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls
|
|
8
|
+
class ParseFile
|
|
9
|
+
attr_reader :filename, :content, :content_type
|
|
10
|
+
|
|
11
|
+
# Build a ParseFile directly.
|
|
12
|
+
#
|
|
13
|
+
# @param filename [String] filename for the upload (e.g., "document.pdf")
|
|
14
|
+
# @param content [String] raw bytes for the file
|
|
15
|
+
# @param content_type [String, nil] optional MIME type hint
|
|
16
|
+
def initialize(filename:, content:, content_type: nil)
|
|
17
|
+
raise ArgumentError, "filename is required" if filename.nil? || filename.to_s.strip.empty?
|
|
18
|
+
raise ArgumentError, "content is required" if content.nil? || content.bytesize.zero?
|
|
19
|
+
|
|
20
|
+
@filename = filename.to_s.strip
|
|
21
|
+
@content = content.to_s
|
|
22
|
+
@content_type = content_type
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
# Build a ParseFile by reading a file from disk.
|
|
26
|
+
#
|
|
27
|
+
# @param path [String] absolute or relative path to the file
|
|
28
|
+
# @param filename [String, nil] optional override for the upload filename
|
|
29
|
+
# @param content_type [String, nil] optional MIME type hint
|
|
30
|
+
# @return [ParseFile]
|
|
31
|
+
def self.from_path(path, filename: nil, content_type: nil)
|
|
32
|
+
raise ArgumentError, "path is required" if path.nil? || path.to_s.strip.empty?
|
|
33
|
+
unless File.file?(path)
|
|
34
|
+
raise ArgumentError, "file path does not exist: #{path}"
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
content = File.binread(path)
|
|
38
|
+
resolved_filename = filename || File.basename(path)
|
|
39
|
+
resolved_content_type = content_type || guess_content_type(resolved_filename)
|
|
40
|
+
new(filename: resolved_filename, content: content, content_type: resolved_content_type)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# @api private
|
|
44
|
+
def self.guess_content_type(filename)
|
|
45
|
+
ext = File.extname(filename).downcase
|
|
46
|
+
{
|
|
47
|
+
".pdf" => "application/pdf",
|
|
48
|
+
".html" => "text/html",
|
|
49
|
+
".htm" => "text/html",
|
|
50
|
+
".xhtml" => "application/xhtml+xml",
|
|
51
|
+
".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
52
|
+
".doc" => "application/msword",
|
|
53
|
+
".odt" => "application/vnd.oasis.opendocument.text",
|
|
54
|
+
".rtf" => "application/rtf",
|
|
55
|
+
".xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
56
|
+
".xls" => "application/vnd.ms-excel",
|
|
57
|
+
}[ext]
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
end
|
data/lib/firecrawl/models/parse_options.rb
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Firecrawl
|
|
4
|
+
module Models
|
|
5
|
+
# Options for parsing uploaded files via `/v2/parse`.
|
|
6
|
+
#
|
|
7
|
+
# Parse does not support browser-rendering features (actions, waitFor,
|
|
8
|
+
# location, mobile) nor screenshot/branding/changeTracking formats. The
|
|
9
|
+
# proxy field only accepts "auto" or "basic".
|
|
10
|
+
class ParseOptions
|
|
11
|
+
UNSUPPORTED_FORMATS = %w[changeTracking screenshot screenshot@fullPage branding].freeze
|
|
12
|
+
|
|
13
|
+
FIELDS = %i[
|
|
14
|
+
formats headers include_tags exclude_tags only_main_content
|
|
15
|
+
timeout parsers skip_tls_verification remove_base64_images
|
|
16
|
+
block_ads proxy integration json_options
|
|
17
|
+
].freeze
|
|
18
|
+
|
|
19
|
+
attr_reader(*FIELDS)
|
|
20
|
+
|
|
21
|
+
def initialize(**kwargs)
|
|
22
|
+
FIELDS.each { |f| instance_variable_set(:"@#{f}", kwargs[f]) }
|
|
23
|
+
|
|
24
|
+
validate!
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def to_h
|
|
28
|
+
{
|
|
29
|
+
"formats" => formats,
|
|
30
|
+
"headers" => headers,
|
|
31
|
+
"includeTags" => include_tags,
|
|
32
|
+
"excludeTags" => exclude_tags,
|
|
33
|
+
"onlyMainContent" => only_main_content,
|
|
34
|
+
"timeout" => timeout,
|
|
35
|
+
"parsers" => parsers,
|
|
36
|
+
"skipTlsVerification" => skip_tls_verification,
|
|
37
|
+
"removeBase64Images" => remove_base64_images,
|
|
38
|
+
"blockAds" => block_ads,
|
|
39
|
+
"proxy" => proxy,
|
|
40
|
+
"integration" => integration,
|
|
41
|
+
"jsonOptions" => json_options.is_a?(Hash) ? json_options : json_options&.to_h,
|
|
42
|
+
}.compact
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
private
|
|
46
|
+
|
|
47
|
+
def validate!
|
|
48
|
+
if !timeout.nil? && timeout.to_i <= 0
|
|
49
|
+
raise ArgumentError, "timeout must be positive"
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
if !proxy.nil? && !proxy.to_s.empty? && !%w[auto basic].include?(proxy.to_s)
|
|
53
|
+
raise ArgumentError, "parse only supports proxy values 'auto' or 'basic'"
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
(formats || []).each do |fmt|
|
|
57
|
+
type = extract_format_type(fmt)
|
|
58
|
+
if UNSUPPORTED_FORMATS.include?(type)
|
|
59
|
+
raise ArgumentError, "parse does not support format: #{type}"
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def extract_format_type(fmt)
|
|
65
|
+
case fmt
|
|
66
|
+
when String then fmt
|
|
67
|
+
when Hash then fmt["type"] || fmt[:type]
|
|
68
|
+
else
|
|
69
|
+
fmt.respond_to?(:type) ? fmt.type : nil
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
end
|
data/lib/firecrawl/models/scrape_options.rb
CHANGED
|
@@ -8,7 +8,7 @@ module Firecrawl
|
|
|
8
8
|
formats headers include_tags exclude_tags only_main_content
|
|
9
9
|
timeout wait_for mobile parsers actions location
|
|
10
10
|
skip_tls_verification remove_base64_images block_ads proxy
|
|
11
|
-
max_age store_in_cache integration
|
|
11
|
+
max_age store_in_cache lockdown integration
|
|
12
12
|
].freeze
|
|
13
13
|
|
|
14
14
|
attr_reader(*FIELDS)
|
|
@@ -37,6 +37,7 @@ module Firecrawl
|
|
|
37
37
|
"proxy" => proxy,
|
|
38
38
|
"maxAge" => max_age,
|
|
39
39
|
"storeInCache" => store_in_cache,
|
|
40
|
+
"lockdown" => lockdown,
|
|
40
41
|
"integration" => integration,
|
|
41
42
|
}.compact
|
|
42
43
|
end
|
data/lib/firecrawl/version.rb
CHANGED
data/lib/firecrawl.rb
CHANGED
|
@@ -15,6 +15,8 @@ require_relative "firecrawl/models/map_options"
|
|
|
15
15
|
require_relative "firecrawl/models/map_data"
|
|
16
16
|
require_relative "firecrawl/models/search_options"
|
|
17
17
|
require_relative "firecrawl/models/search_data"
|
|
18
|
+
require_relative "firecrawl/models/parse_file"
|
|
19
|
+
require_relative "firecrawl/models/parse_options"
|
|
18
20
|
require_relative "firecrawl/models/agent_options"
|
|
19
21
|
require_relative "firecrawl/models/agent_response"
|
|
20
22
|
require_relative "firecrawl/models/agent_status_response"
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: firecrawl-sdk
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.0
|
|
4
|
+
version: 1.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Firecrawl
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-22 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: A type-safe Ruby client for the Firecrawl v2 API. Supports scraping,
|
|
14
14
|
crawling, batch scraping, URL mapping, web search, and AI agent operations.
|
|
@@ -38,6 +38,8 @@ files:
|
|
|
38
38
|
- lib/firecrawl/models/document.rb
|
|
39
39
|
- lib/firecrawl/models/map_data.rb
|
|
40
40
|
- lib/firecrawl/models/map_options.rb
|
|
41
|
+
- lib/firecrawl/models/parse_file.rb
|
|
42
|
+
- lib/firecrawl/models/parse_options.rb
|
|
41
43
|
- lib/firecrawl/models/scrape_options.rb
|
|
42
44
|
- lib/firecrawl/models/search_data.rb
|
|
43
45
|
- lib/firecrawl/models/search_options.rb
|