RubyGems - firecrawl-sdk - Versions diffs - 1.0.0 → 1.1.0 - Mend

firecrawl-sdk 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +4 -4
data/README.md +23 -0
data/lib/firecrawl/client.rb +30 -0
data/lib/firecrawl/http_client.rb +48 -1
data/lib/firecrawl/models/parse_file.rb +61 -0
data/lib/firecrawl/models/parse_options.rb +74 -0
data/lib/firecrawl/version.rb +1 -1
data/lib/firecrawl.rb +2 -0
metadata +4 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 7266e8ff84ad11eebc0312025933594af43172061ea8d2f5959b65d98eb34f64
-  data.tar.gz: 48abbfb695f5f9e688e9b02fe5120cfdb6c55bc9c1c22398fd6bbb32582d606e
+  metadata.gz: 714d7f95b4d8a0c8d0c414445011a4ab74667dcb8b8bb061c9256db09608d11b
+  data.tar.gz: f154bcfb66abe769f267b5961ea79954d883895678870351bbd1c5db663595ef
 SHA512:
-  metadata.gz: 0b69ffbc921e023aba67107a388b44aa90f1f479f9c62ad494734536beea0e831be9b6988ad50132fcfecce39250745b9e38edf37ef22ac1f7f2fecae761615b
-  data.tar.gz: 48f082ce92fb3bc1f6c48a4cdef5b3d2f4a074185fd2e87d3f661a0983835bf60f16c085b118cd134d132afd418e8bad21b54c17faa2c507261bbe7620cf9e00
+  metadata.gz: 91b0c2f2d3547be3c15a75d8da88a2f59c49fc5c1fa07e19570f183216ed822f878a167e336707fffb114c06870009433207cd043f2a75514257ed6d2ca07a01
+  data.tar.gz: 83eff71651dfb1eb5db0d84a3258c5928633b18d2acb441c64440a19c9307b924a6458c568078a5bdf86d8d9d999cf1254424d3c28721ee8c4d4387d9c6772ac

data/README.md CHANGED Viewed

@@ -63,6 +63,29 @@ doc = client.scrape("https://example.com",
 puts doc.html
 ```
+### Parse
+Upload a local file (`html`, `pdf`, `docx`, etc.) via multipart form data and
+parse it synchronously. Parse options intentionally exclude browser-only
+features such as change tracking, screenshot, branding, actions, wait_for,
+location, and mobile. The `proxy` option only accepts `"auto"` or `"basic"`.
+```ruby
+# From disk
+file = Firecrawl::Models::ParseFile.from_path("./document.pdf")
+# Or from memory
+file = Firecrawl::Models::ParseFile.new(
+  filename: "upload.html",
+  content: "<html>hi</html>",
+  content_type: "text/html"
+)
+doc = client.parse(file,
+  Firecrawl::Models::ParseOptions.new(formats: ["markdown"]))
+puts doc.markdown
+```
 ### Crawl
 ```ruby

data/lib/firecrawl/client.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 # frozen_string_literal: true
+require "json"
 module Firecrawl
   # Client for the Firecrawl v2 API.
   #
@@ -106,6 +108,34 @@ module Firecrawl
       @http.delete("/v2/scrape/#{job_id}/interact")
     end
+    # ================================================================
+    # PARSE
+    # ================================================================
+    # Parses an uploaded file and returns the extracted document.
+    #
+    # The file is sent to `/v2/parse` as a multipart form upload; the parse
+    # options travel alongside it as one JSON-encoded form field.
+    #
+    # @param file [Models::ParseFile] file payload to upload
+    # @param options [Models::ParseOptions, nil] parse configuration; when nil
+    #   an empty options object (`{}`) is sent
+    # @return [Models::Document]
+    # @raise [ArgumentError] if `file` is nil or is not a Models::ParseFile
+    def parse(file, options = nil)
+      raise ArgumentError, "File is required" if file.nil?
+      unless file.is_a?(Models::ParseFile)
+        raise ArgumentError, "File must be a Firecrawl::Models::ParseFile"
+      end
+      # Only #to_h is called on the options object; nil means "no options".
+      options_hash = options.nil? ? {} : options.to_h
+      raw = @http.post_multipart(
+        "/v2/parse",
+        fields: { "options" => JSON.generate(options_hash) },
+        file_field: "file",
+        filename: file.filename,
+        content: file.content,
+        content_type: file.content_type,
+      )
+      # Use the "data" member when the response has one; otherwise the whole
+      # response is treated as the document payload.
+      data = raw["data"] || raw
+      Models::Document.new(data)
+    end
     # ================================================================
     # CRAWL
     # ================================================================

data/lib/firecrawl/http_client.rb CHANGED Viewed

@@ -2,6 +2,7 @@
 require "net/http"
 require "json"
+require "securerandom"
 require "uri"
 module Firecrawl
@@ -58,9 +59,53 @@ module Firecrawl
       execute_with_retry(uri, request)
     end
+    # Sends a POST request with a multipart/form-data body.
+    #
+    # The body is assembled once. The request object itself is created by a
+    # lambda, and that same lambda is handed to execute_with_retry as
+    # `request_builder`, which calls it to obtain a new Net::HTTP::Post before
+    # retrying.
+    #
+    # @param path [String] API path
+    # @param fields [Hash{String=>String}] additional form fields to include
+    # @param file_field [String] form field name for the file part (e.g. "file")
+    # @param filename [String] filename to send with the file part
+    # @param content [String] raw bytes for the file part
+    # @param content_type [String, nil] optional MIME type for the file part
+    # @return whatever execute_with_retry returns
+    #   NOTE(review): Client#parse indexes the result with "data", so this is
+    #   presumably the decoded JSON response -- confirm against
+    #   execute_with_retry, whose body is not fully visible here.
+    def post_multipart(path, fields:, file_field:, filename:, content:, content_type: nil)
+      uri = URI("#{@base_url}#{path}")
+      # Random boundary generated per call and shared by the body and the
+      # Content-Type header.
+      boundary = "----FirecrawlBoundary#{SecureRandom.hex(16)}"
+      body = build_multipart_body(boundary, fields, file_field, filename, content, content_type)
+      builder = lambda do
+        request = Net::HTTP::Post.new(uri)
+        request["Authorization"] = "Bearer #{@api_key}"
+        request["Content-Type"] = "multipart/form-data; boundary=#{boundary}"
+        request.body = body
+        request
+      end
+      # First attempt uses a request built here; later attempts rebuild it.
+      execute_with_retry(uri, builder.call, request_builder: builder)
+    end
     private
-    def execute_with_retry(uri, request)
+    def build_multipart_body(boundary, fields, file_field, filename, content, content_type)
+      parts = +""
+      fields.each do |name, value|
+        parts << "--#{boundary}\r\n"
+        parts << %(Content-Disposition: form-data; name="#{name}"\r\n\r\n)
+        parts << value.to_s
+        parts << "\r\n"
+      end
+      parts << "--#{boundary}\r\n"
+      safe_file_field = file_field.to_s.gsub(/[\r\n"]/, "_")
+      safe_filename = filename.to_s.gsub(/[\r\n"]/, "_")
+      parts << %(Content-Disposition: form-data; name="#{safe_file_field}"; filename="#{safe_filename}"\r\n)
+      parts << "Content-Type: #{content_type || "application/octet-stream"}\r\n\r\n"
+      parts.force_encoding(Encoding::ASCII_8BIT)
+      parts << content.to_s.dup.force_encoding(Encoding::ASCII_8BIT)
+      parts << "\r\n--#{boundary}--\r\n"
+      parts
+    end
+    def execute_with_retry(uri, request, request_builder: nil)
       attempt = 0
       loop do
         response = perform_request(uri, request)
@@ -89,6 +134,7 @@ module Firecrawl
         if attempt < @max_retries
           attempt += 1
           sleep_with_backoff(attempt)
+          request = request_builder.call if request_builder
           next
         end
@@ -98,6 +144,7 @@ module Firecrawl
         if attempt < @max_retries
           attempt += 1
           sleep_with_backoff(attempt)
+          request = request_builder.call if request_builder
           retry
         end
         raise FirecrawlError.new("Request failed: #{e.message}")

data/lib/firecrawl/models/parse_file.rb ADDED Viewed

@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+module Firecrawl
+  module Models
+    # Binary upload payload for the `/v2/parse` endpoint.
+    #
+    # Supported file extensions: .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls
+    class ParseFile
+      attr_reader :filename, :content, :content_type
+      # Build a ParseFile directly.
+      #
+      # @param filename [String] filename for the upload (e.g., "document.pdf")
+      # @param content [String] raw bytes for the file
+      # @param content_type [String, nil] optional MIME type hint
+      def initialize(filename:, content:, content_type: nil)
+        raise ArgumentError, "filename is required" if filename.nil? || filename.to_s.strip.empty?
+        raise ArgumentError, "content is required" if content.nil? || content.bytesize.zero?
+        @filename = filename.to_s.strip
+        @content = content.to_s
+        @content_type = content_type
+      end
+      # Build a ParseFile by reading a file from disk.
+      #
+      # @param path [String] absolute or relative path to the file
+      # @param filename [String, nil] optional override for the upload filename
+      # @param content_type [String, nil] optional MIME type hint
+      # @return [ParseFile]
+      def self.from_path(path, filename: nil, content_type: nil)
+        raise ArgumentError, "path is required" if path.nil? || path.to_s.strip.empty?
+        unless File.file?(path)
+          raise ArgumentError, "file path does not exist: #{path}"
+        end
+        content = File.binread(path)
+        resolved_filename = filename || File.basename(path)
+        resolved_content_type = content_type || guess_content_type(resolved_filename)
+        new(filename: resolved_filename, content: content, content_type: resolved_content_type)
+      end
+      # @api private
+      def self.guess_content_type(filename)
+        ext = File.extname(filename).downcase
+        {
+          ".pdf" => "application/pdf",
+          ".html" => "text/html",
+          ".htm" => "text/html",
+          ".xhtml" => "application/xhtml+xml",
+          ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+          ".doc" => "application/msword",
+          ".odt" => "application/vnd.oasis.opendocument.text",
+          ".rtf" => "application/rtf",
+          ".xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+          ".xls" => "application/vnd.ms-excel",
+        }[ext]
+      end
+    end
+  end
+end

data/lib/firecrawl/models/parse_options.rb ADDED Viewed

@@ -0,0 +1,74 @@
+# frozen_string_literal: true
+module Firecrawl
+  module Models
+    # Options for parsing uploaded files via `/v2/parse`.
+    #
+    # Parse does not support browser-rendering features (actions, waitFor,
+    # location, mobile) nor screenshot/branding/changeTracking formats. The
+    # proxy field only accepts "auto" or "basic".
+    class ParseOptions
+      # Format type names that validate! rejects.
+      UNSUPPORTED_FORMATS = %w[changeTracking screenshot screenshot@fullPage branding].freeze
+      # Option names read by the constructor; each one also gets a reader.
+      FIELDS = %i[
+        formats headers include_tags exclude_tags only_main_content
+        timeout parsers skip_tls_verification remove_base64_images
+        block_ads proxy integration json_options
+      ].freeze
+      attr_reader(*FIELDS)
+      # Stores every option listed in FIELDS (nil when not given), then
+      # validates the result.
+      #
+      # NOTE(review): only keys listed in FIELDS are read from kwargs, so a
+      # misspelled or unsupported keyword is ignored without an error.
+      #
+      # @param kwargs [Hash] option values keyed by the symbols in FIELDS
+      # @raise [ArgumentError] if validation fails (see validate!)
+      def initialize(**kwargs)
+        FIELDS.each { |f| instance_variable_set(:"@#{f}", kwargs[f]) }
+        validate!
+      end
+      # Converts the options to a Hash with camelCase string keys.
+      #
+      # @return [Hash{String=>Object}] only the options that are not nil
+      def to_h
+        {
+          "formats" => formats,
+          "headers" => headers,
+          "includeTags" => include_tags,
+          "excludeTags" => exclude_tags,
+          "onlyMainContent" => only_main_content,
+          "timeout" => timeout,
+          "parsers" => parsers,
+          "skipTlsVerification" => skip_tls_verification,
+          "removeBase64Images" => remove_base64_images,
+          "blockAds" => block_ads,
+          "proxy" => proxy,
+          "integration" => integration,
+          # A Hash is passed through as-is; any other non-nil object is
+          # converted with #to_h.
+          "jsonOptions" => json_options.is_a?(Hash) ? json_options : json_options&.to_h,
+        }.compact
+      end
+      private
+      # Checks timeout, proxy and formats.
+      #
+      # @raise [ArgumentError] if timeout is given and its integer value is not
+      #   above zero, if proxy is a non-empty value other than "auto" or
+      #   "basic", or if any format's type is listed in UNSUPPORTED_FORMATS
+      def validate!
+        if !timeout.nil? && timeout.to_i <= 0
+          raise ArgumentError, "timeout must be positive"
+        end
+        # NOTE(review): an empty-string proxy passes this check and is still
+        # emitted by to_h -- confirm that is intended.
+        if !proxy.nil? && !proxy.to_s.empty? && !%w[auto basic].include?(proxy.to_s)
+          raise ArgumentError, "parse only supports proxy values 'auto' or 'basic'"
+        end
+        (formats || []).each do |fmt|
+          type = extract_format_type(fmt)
+          if UNSUPPORTED_FORMATS.include?(type)
+            raise ArgumentError, "parse does not support format: #{type}"
+          end
+        end
+      end
+      # Returns the type name of one entry of `formats`.
+      #
+      # @param fmt [String, Hash, Object] a String is its own type; a Hash is
+      #   read at "type" and then :type; any other object is asked for #type
+      #   when it responds to it
+      # @return [Object, nil] the type, or nil when none can be determined
+      def extract_format_type(fmt)
+        case fmt
+        when String then fmt
+        when Hash then fmt["type"] || fmt[:type]
+        else
+          fmt.respond_to?(:type) ? fmt.type : nil
+        end
+      end
+    end
+  end
+end

data/lib/firecrawl/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Firecrawl
-  VERSION = "1.0.0"
+  VERSION = "1.1.0"
 end

data/lib/firecrawl.rb CHANGED Viewed

@@ -15,6 +15,8 @@ require_relative "firecrawl/models/map_options"
 require_relative "firecrawl/models/map_data"
 require_relative "firecrawl/models/search_options"
 require_relative "firecrawl/models/search_data"
+require_relative "firecrawl/models/parse_file"
+require_relative "firecrawl/models/parse_options"
 require_relative "firecrawl/models/agent_options"
 require_relative "firecrawl/models/agent_response"
 require_relative "firecrawl/models/agent_status_response"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: firecrawl-sdk
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.0
 platform: ruby
 authors:
 - Firecrawl
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2026-04-16 00:00:00.000000000 Z
+date: 2026-04-21 00:00:00.000000000 Z
 dependencies: []
 description: A type-safe Ruby client for the Firecrawl v2 API. Supports scraping,
   crawling, batch scraping, URL mapping, web search, and AI agent operations.
@@ -38,6 +38,8 @@ files:
 - lib/firecrawl/models/document.rb
 - lib/firecrawl/models/map_data.rb
 - lib/firecrawl/models/map_options.rb
+- lib/firecrawl/models/parse_file.rb
+- lib/firecrawl/models/parse_options.rb
 - lib/firecrawl/models/scrape_options.rb
 - lib/firecrawl/models/search_data.rb
 - lib/firecrawl/models/search_options.rb