firecrawl-sdk 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7266e8ff84ad11eebc0312025933594af43172061ea8d2f5959b65d98eb34f64
4
- data.tar.gz: 48abbfb695f5f9e688e9b02fe5120cfdb6c55bc9c1c22398fd6bbb32582d606e
3
+ metadata.gz: 714d7f95b4d8a0c8d0c414445011a4ab74667dcb8b8bb061c9256db09608d11b
4
+ data.tar.gz: f154bcfb66abe769f267b5961ea79954d883895678870351bbd1c5db663595ef
5
5
  SHA512:
6
- metadata.gz: 0b69ffbc921e023aba67107a388b44aa90f1f479f9c62ad494734536beea0e831be9b6988ad50132fcfecce39250745b9e38edf37ef22ac1f7f2fecae761615b
7
- data.tar.gz: 48f082ce92fb3bc1f6c48a4cdef5b3d2f4a074185fd2e87d3f661a0983835bf60f16c085b118cd134d132afd418e8bad21b54c17faa2c507261bbe7620cf9e00
6
+ metadata.gz: 91b0c2f2d3547be3c15a75d8da88a2f59c49fc5c1fa07e19570f183216ed822f878a167e336707fffb114c06870009433207cd043f2a75514257ed6d2ca07a01
7
+ data.tar.gz: 83eff71651dfb1eb5db0d84a3258c5928633b18d2acb441c64440a19c9307b924a6458c568078a5bdf86d8d9d999cf1254424d3c28721ee8c4d4387d9c6772ac
data/README.md CHANGED
@@ -63,6 +63,29 @@ doc = client.scrape("https://example.com",
63
63
  puts doc.html
64
64
  ```
65
65
 
66
+ ### Parse
67
+
68
+ Upload a local file (`html`, `pdf`, `docx`, etc.) via multipart form data and
69
+ parse it synchronously. Parse options intentionally exclude browser-only
70
+ features such as change tracking, screenshot, branding, actions, wait_for,
71
+ location, and mobile. The `proxy` option only accepts `"auto"` or `"basic"`.
72
+
73
+ ```ruby
74
+ # From disk
75
+ file = Firecrawl::Models::ParseFile.from_path("./document.pdf")
76
+
77
+ # Or from memory
78
+ file = Firecrawl::Models::ParseFile.new(
79
+ filename: "upload.html",
80
+ content: "<html>hi</html>",
81
+ content_type: "text/html"
82
+ )
83
+
84
+ doc = client.parse(file,
85
+ Firecrawl::Models::ParseOptions.new(formats: ["markdown"]))
86
+ puts doc.markdown
87
+ ```
88
+
66
89
  ### Crawl
67
90
 
68
91
  ```ruby
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "json"
4
+
3
5
  module Firecrawl
4
6
  # Client for the Firecrawl v2 API.
5
7
  #
@@ -106,6 +108,34 @@ module Firecrawl
106
108
  @http.delete("/v2/scrape/#{job_id}/interact")
107
109
  end
108
110
 
111
+ # ================================================================
112
+ # PARSE
113
+ # ================================================================
114
+
115
# Uploads a file to the `/v2/parse` endpoint and wraps the response in a
# document model.
#
# The options are serialized to JSON and sent as the "options" form field;
# the file itself travels as the "file" part of the multipart request.
#
# @param file [Models::ParseFile] file payload to upload
# @param options [Models::ParseOptions, nil] parse configuration; an empty
#   options object is sent when nil
# @return [Models::Document]
# @raise [ArgumentError] if file is nil or is not a Models::ParseFile
def parse(file, options = nil)
  raise ArgumentError, "File is required" if file.nil?
  raise ArgumentError, "File must be a Firecrawl::Models::ParseFile" unless file.is_a?(Models::ParseFile)

  serialized_options = JSON.generate(options.nil? ? {} : options.to_h)
  response = @http.post_multipart(
    "/v2/parse",
    fields: { "options" => serialized_options },
    file_field: "file",
    filename: file.filename,
    content: file.content,
    content_type: file.content_type,
  )

  # Use the "data" entry when present, otherwise the whole response.
  Models::Document.new(response["data"] || response)
end
138
+
109
139
  # ================================================================
110
140
  # CRAWL
111
141
  # ================================================================
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "net/http"
4
4
  require "json"
5
+ require "securerandom"
5
6
  require "uri"
6
7
 
7
8
  module Firecrawl
@@ -58,9 +59,53 @@ module Firecrawl
58
59
  execute_with_retry(uri, request)
59
60
  end
60
61
 
62
# Issues a POST whose body is encoded as multipart/form-data.
#
# The body is built once; a fresh request object carrying that body is
# produced by a builder lambda, which is also handed to the retry helper as
# `request_builder`.
#
# @param path [String] API path, appended to the configured base URL
# @param fields [Hash{String=>String}] additional form fields to include
# @param file_field [String] form field name for the file part (e.g. "file")
# @param filename [String] filename to send with the file part
# @param content [String] raw bytes for the file part
# @param content_type [String, nil] optional MIME type for the file part
def post_multipart(path, fields:, file_field:, filename:, content:, content_type: nil)
  endpoint = URI("#{@base_url}#{path}")
  delimiter = "----FirecrawlBoundary#{SecureRandom.hex(16)}"
  payload = build_multipart_body(delimiter, fields, file_field, filename, content, content_type)

  make_request = lambda do
    Net::HTTP::Post.new(endpoint).tap do |req|
      req["Authorization"] = "Bearer #{@api_key}"
      req["Content-Type"] = "multipart/form-data; boundary=#{delimiter}"
      req.body = payload
    end
  end

  execute_with_retry(endpoint, make_request.call, request_builder: make_request)
end
85
+
61
86
  private
62
87
 
63
- def execute_with_retry(uri, request)
88
# Assembles a multipart/form-data body as a binary (ASCII-8BIT) string.
#
# Each entry of +fields+ becomes its own text part, followed by exactly one
# file part and the closing delimiter. Every value interpolated into a part
# header (field names, filename, content type) has CR/LF stripped or
# replaced so caller-supplied data cannot inject extra headers or parts.
#
# @param boundary [String] multipart boundary (without the leading dashes)
# @param fields [Hash] form field name => value; values are sent via #to_s
# @param file_field [String] form field name for the file part
# @param filename [String] filename advertised for the file part
# @param content [String] raw bytes for the file part
# @param content_type [String, nil] MIME type for the file part; falls back
#   to "application/octet-stream" when nil or blank
# @return [String] the encoded body, tagged ASCII-8BIT
def build_multipart_body(boundary, fields, file_field, filename, content, content_type)
  # Work in binary from the first byte: every piece is converted with #b
  # before it is appended, so mixing UTF-8 text with raw file bytes can never
  # raise Encoding::CompatibilityError.
  body = String.new(encoding: Encoding::ASCII_8BIT)
  delimiter = "--#{boundary}\r\n".b

  fields.each do |name, value|
    safe_name = name.to_s.b.gsub(/[\r\n"]/, "_")
    body << delimiter
    body << %(Content-Disposition: form-data; name="#{safe_name}"\r\n\r\n)
    body << value.to_s.b
    body << "\r\n"
  end

  safe_file_field = file_field.to_s.b.gsub(/[\r\n"]/, "_")
  safe_filename = filename.to_s.b.gsub(/[\r\n"]/, "_")

  # A content type containing CR/LF would otherwise terminate the header
  # early and let the remainder be read as another header or as body data.
  mime = content_type.to_s.b.delete("\r\n").strip
  mime = "application/octet-stream" if mime.empty?

  body << delimiter
  body << %(Content-Disposition: form-data; name="#{safe_file_field}"; filename="#{safe_filename}"\r\n)
  body << "Content-Type: #{mime}\r\n\r\n"
  body << content.to_s.b
  body << "\r\n--#{boundary}--\r\n"
  body
end
107
+
108
+ def execute_with_retry(uri, request, request_builder: nil)
64
109
  attempt = 0
65
110
  loop do
66
111
  response = perform_request(uri, request)
@@ -89,6 +134,7 @@ module Firecrawl
89
134
  if attempt < @max_retries
90
135
  attempt += 1
91
136
  sleep_with_backoff(attempt)
137
+ request = request_builder.call if request_builder
92
138
  next
93
139
  end
94
140
 
@@ -98,6 +144,7 @@ module Firecrawl
98
144
  if attempt < @max_retries
99
145
  attempt += 1
100
146
  sleep_with_backoff(attempt)
147
+ request = request_builder.call if request_builder
101
148
  retry
102
149
  end
103
150
  raise FirecrawlError.new("Request failed: #{e.message}")
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
module Firecrawl
  module Models
    # Binary upload payload for the `/v2/parse` endpoint.
    #
    # Supported file extensions: .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls
    class ParseFile
      # MIME types guessed from a lower-cased file extension. Built once and
      # frozen instead of being re-allocated on every lookup.
      CONTENT_TYPES_BY_EXTENSION = {
        ".pdf" => "application/pdf",
        ".html" => "text/html",
        ".htm" => "text/html",
        ".xhtml" => "application/xhtml+xml",
        ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".doc" => "application/msword",
        ".odt" => "application/vnd.oasis.opendocument.text",
        ".rtf" => "application/rtf",
        ".xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xls" => "application/vnd.ms-excel",
      }.freeze

      attr_reader :filename, :content, :content_type

      # Build a ParseFile directly.
      #
      # @param filename [String] filename for the upload (e.g., "document.pdf");
      #   surrounding whitespace is stripped
      # @param content [String] raw bytes for the file
      # @param content_type [String, nil] optional MIME type hint
      # @raise [ArgumentError] if filename is blank, or content is nil, empty,
      #   or not string-like
      def initialize(filename:, content:, content_type: nil)
        name = filename.to_s.strip
        raise ArgumentError, "filename is required" if name.empty?
        raise ArgumentError, "content is required" if content.nil?

        # Reject non-string payloads with a clear ArgumentError rather than
        # letting the size check below fail with NoMethodError.
        unless content.respond_to?(:bytesize)
          raise ArgumentError, "content must be a String of raw bytes"
        end
        raise ArgumentError, "content is required" if content.bytesize.zero?

        @filename = name
        @content = content.to_s
        @content_type = content_type
      end

      # Build a ParseFile by reading a file from disk.
      #
      # @param path [String] absolute or relative path to the file
      # @param filename [String, nil] optional override for the upload filename;
      #   defaults to the basename of +path+
      # @param content_type [String, nil] optional MIME type hint; guessed from
      #   the resolved filename's extension when omitted
      # @return [ParseFile]
      # @raise [ArgumentError] if path is blank or is not an existing regular file
      def self.from_path(path, filename: nil, content_type: nil)
        raise ArgumentError, "path is required" if path.nil? || path.to_s.strip.empty?
        raise ArgumentError, "file path does not exist: #{path}" unless File.file?(path)

        resolved_filename = filename || File.basename(path)
        new(
          filename: resolved_filename,
          content: File.binread(path),
          content_type: content_type || guess_content_type(resolved_filename),
        )
      end

      # Looks up a MIME type from the filename's extension (case-insensitive).
      #
      # @api private
      # @param filename [#to_s] filename whose extension is inspected
      # @return [String, nil] the MIME type, or nil for unknown extensions
      def self.guess_content_type(filename)
        CONTENT_TYPES_BY_EXTENSION[File.extname(filename.to_s).downcase]
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
module Firecrawl
  module Models
    # Options for parsing uploaded files via `/v2/parse`.
    #
    # Parse does not support browser-rendering features (actions, waitFor,
    # location, mobile) nor screenshot/branding/changeTracking formats. The
    # proxy field only accepts "auto" or "basic".
    class ParseOptions
      # Format types rejected at construction time.
      UNSUPPORTED_FORMATS = %w[changeTracking screenshot screenshot@fullPage branding].freeze

      # Keyword names accepted by the constructor; each gets a reader.
      FIELDS = %i[
        formats headers include_tags exclude_tags only_main_content
        timeout parsers skip_tls_verification remove_base64_images
        block_ads proxy integration json_options
      ].freeze

      attr_reader(*FIELDS)

      # Stores every recognised keyword (unrecognised ones are ignored) and
      # validates the result.
      #
      # @param kwargs [Hash{Symbol=>Object}] any of the names in FIELDS
      # @raise [ArgumentError] if timeout is not positive, proxy is not
      #   "auto"/"basic", or formats contains an unsupported type
      def initialize(**kwargs)
        FIELDS.each { |f| instance_variable_set(:"@#{f}", kwargs[f]) }

        validate!
      end

      # Serialises the options using the API's camelCase keys, omitting every
      # option whose value is nil.
      #
      # @return [Hash{String=>Object}]
      def to_h
        {
          "formats" => formats,
          "headers" => headers,
          "includeTags" => include_tags,
          "excludeTags" => exclude_tags,
          "onlyMainContent" => only_main_content,
          "timeout" => timeout,
          "parsers" => parsers,
          "skipTlsVerification" => skip_tls_verification,
          "removeBase64Images" => remove_base64_images,
          "blockAds" => block_ads,
          "proxy" => proxy,
          "integration" => integration,
          "jsonOptions" => json_options.is_a?(Hash) ? json_options : json_options&.to_h,
        }.compact
      end

      private

      # Enforces the restrictions described on the class.
      def validate!
        if !timeout.nil? && timeout.to_i <= 0
          raise ArgumentError, "timeout must be positive"
        end

        if !proxy.nil? && !proxy.to_s.empty? && !%w[auto basic].include?(proxy.to_s)
          raise ArgumentError, "parse only supports proxy values 'auto' or 'basic'"
        end

        (formats || []).each do |fmt|
          type = extract_format_type(fmt)
          if UNSUPPORTED_FORMATS.include?(type)
            raise ArgumentError, "parse does not support format: #{type}"
          end
        end
      end

      # Returns the format's type name as a String, or nil when it cannot be
      # determined.
      #
      # Symbols are normalised to strings (both as a bare format and as a
      # "type" value) because they serialise to the same JSON string as their
      # String counterpart and must therefore be subject to the same
      # unsupported-format check.
      def extract_format_type(fmt)
        type =
          case fmt
          when String, Symbol then fmt
          when Hash then fmt["type"] || fmt[:type]
          else fmt.respond_to?(:type) ? fmt.type : nil
          end
        type&.to_s
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Firecrawl
4
- VERSION = "1.0.0"
4
+ VERSION = "1.1.0"
5
5
  end
data/lib/firecrawl.rb CHANGED
@@ -15,6 +15,8 @@ require_relative "firecrawl/models/map_options"
15
15
  require_relative "firecrawl/models/map_data"
16
16
  require_relative "firecrawl/models/search_options"
17
17
  require_relative "firecrawl/models/search_data"
18
+ require_relative "firecrawl/models/parse_file"
19
+ require_relative "firecrawl/models/parse_options"
18
20
  require_relative "firecrawl/models/agent_options"
19
21
  require_relative "firecrawl/models/agent_response"
20
22
  require_relative "firecrawl/models/agent_status_response"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: firecrawl-sdk
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Firecrawl
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-04-16 00:00:00.000000000 Z
11
+ date: 2026-04-21 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: A type-safe Ruby client for the Firecrawl v2 API. Supports scraping,
14
14
  crawling, batch scraping, URL mapping, web search, and AI agent operations.
@@ -38,6 +38,8 @@ files:
38
38
  - lib/firecrawl/models/document.rb
39
39
  - lib/firecrawl/models/map_data.rb
40
40
  - lib/firecrawl/models/map_options.rb
41
+ - lib/firecrawl/models/parse_file.rb
42
+ - lib/firecrawl/models/parse_options.rb
41
43
  - lib/firecrawl/models/scrape_options.rb
42
44
  - lib/firecrawl/models/search_data.rb
43
45
  - lib/firecrawl/models/search_options.rb