rubycrawl 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +263 -311
- data/lib/rubycrawl/browser/extraction.rb +106 -0
- data/lib/rubycrawl/browser.rb +106 -0
- data/lib/rubycrawl/errors.rb +1 -1
- data/lib/rubycrawl/helpers.rb +9 -41
- data/lib/rubycrawl/markdown_converter.rb +5 -5
- data/lib/rubycrawl/result.rb +55 -25
- data/lib/rubycrawl/site_crawler.rb +46 -20
- data/lib/rubycrawl/tasks/install.rake +17 -56
- data/lib/rubycrawl/url_normalizer.rb +5 -1
- data/lib/rubycrawl/version.rb +1 -1
- data/lib/rubycrawl.rb +37 -66
- data/rubycrawl.gemspec +5 -5
- metadata +20 -6
- data/Gemfile +0 -11
- data/lib/rubycrawl/service_client.rb +0 -86
data/lib/rubycrawl.rb
CHANGED
|
@@ -3,25 +3,26 @@
|
|
|
3
3
|
require_relative 'rubycrawl/version'
|
|
4
4
|
require_relative 'rubycrawl/errors'
|
|
5
5
|
require_relative 'rubycrawl/helpers'
|
|
6
|
-
require_relative 'rubycrawl/service_client'
|
|
6
|
+
require_relative 'rubycrawl/browser'
|
|
7
7
|
require_relative 'rubycrawl/url_normalizer'
|
|
8
8
|
require_relative 'rubycrawl/markdown_converter'
|
|
9
9
|
require_relative 'rubycrawl/result'
|
|
10
10
|
require_relative 'rubycrawl/site_crawler'
|
|
11
11
|
require_relative 'rubycrawl/railtie' if defined?(Rails)
|
|
12
12
|
|
|
13
|
-
# RubyCrawl
|
|
13
|
+
# RubyCrawl — pure Ruby web crawler with full JavaScript rendering via Ferrum.
|
|
14
14
|
class RubyCrawl
|
|
15
15
|
include Helpers
|
|
16
16
|
|
|
17
|
-
DEFAULT_HOST = '127.0.0.1'
|
|
18
|
-
DEFAULT_PORT = 3344
|
|
19
|
-
|
|
20
17
|
class << self
|
|
21
18
|
def client
|
|
22
19
|
@client ||= new
|
|
23
20
|
end
|
|
24
21
|
|
|
22
|
+
# Crawl a single URL and return a Result.
|
|
23
|
+
# @param url [String]
|
|
24
|
+
# @param options [Hash] wait_until:, block_resources:, max_attempts:
|
|
25
|
+
# @return [RubyCrawl::Result]
|
|
25
26
|
def crawl(url, **options)
|
|
26
27
|
client.crawl(url, **options)
|
|
27
28
|
end
|
|
@@ -34,12 +35,12 @@ class RubyCrawl
|
|
|
34
35
|
# @param max_depth [Integer] Maximum link depth from start URL (default: 3)
|
|
35
36
|
# @param same_host_only [Boolean] Only follow links on the same host (default: true)
|
|
36
37
|
# @yield [page] Yields each page result as it is crawled
|
|
37
|
-
# @yieldparam page [SiteCrawler::PageResult]
|
|
38
|
+
# @yieldparam page [SiteCrawler::PageResult]
|
|
38
39
|
# @return [Integer] Number of pages crawled
|
|
39
40
|
#
|
|
40
|
-
# @example
|
|
41
|
+
# @example
|
|
41
42
|
# RubyCrawl.crawl_site("https://example.com", max_pages: 100) do |page|
|
|
42
|
-
# Page.create!(url: page.url,
|
|
43
|
+
# Page.create!(url: page.url, content: page.clean_text, depth: page.depth)
|
|
43
44
|
# end
|
|
44
45
|
def crawl_site(url, ...)
|
|
45
46
|
client.crawl_site(url, ...)
|
|
@@ -52,90 +53,60 @@ class RubyCrawl
|
|
|
52
53
|
|
|
53
54
|
def initialize(**options)
|
|
54
55
|
load_options(options)
|
|
55
|
-
|
|
56
|
+
@browser = Browser.new(
|
|
57
|
+
timeout: @timeout,
|
|
58
|
+
headless: @headless,
|
|
59
|
+
browser_options: @browser_options
|
|
60
|
+
)
|
|
56
61
|
end
|
|
57
62
|
|
|
58
|
-
def crawl(url, wait_until: @wait_until, block_resources: @block_resources,
|
|
63
|
+
def crawl(url, wait_until: @wait_until, block_resources: @block_resources, max_attempts: @max_attempts)
|
|
59
64
|
validate_url!(url)
|
|
60
|
-
|
|
61
|
-
with_retries(
|
|
62
|
-
|
|
63
|
-
response = @service_client.post_json('/crawl', payload)
|
|
64
|
-
raise_node_error!(response)
|
|
65
|
-
build_result(response)
|
|
65
|
+
validate_wait_until!(wait_until)
|
|
66
|
+
with_retries(max_attempts) do
|
|
67
|
+
@browser.crawl(url, wait_until: wait_until, block_resources: block_resources)
|
|
66
68
|
end
|
|
67
69
|
end
|
|
68
70
|
|
|
69
|
-
# Crawl multiple pages starting from a URL, following links.
|
|
70
|
-
# @see RubyCrawl.crawl_site
|
|
71
71
|
def crawl_site(url, **options, &block)
|
|
72
|
-
@service_client.ensure_running
|
|
73
72
|
crawler_options = build_crawler_options(options)
|
|
74
|
-
|
|
75
|
-
crawler.crawl(url, &block)
|
|
73
|
+
SiteCrawler.new(self, crawler_options).crawl(url, &block)
|
|
76
74
|
end
|
|
77
75
|
|
|
78
76
|
private
|
|
79
77
|
|
|
80
|
-
def
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
78
|
+
def load_options(options)
|
|
79
|
+
@wait_until = options.fetch(:wait_until, nil)
|
|
80
|
+
@block_resources = options.fetch(:block_resources, nil)
|
|
81
|
+
@max_attempts = options.fetch(:max_attempts, 3)
|
|
82
|
+
@timeout = options.fetch(:timeout, 30)
|
|
83
|
+
@headless = options.fetch(:headless, true)
|
|
84
|
+
@browser_options = options.fetch(:browser_options, {})
|
|
86
85
|
end
|
|
87
86
|
|
|
88
|
-
def with_retries(
|
|
87
|
+
def with_retries(max_attempts)
|
|
89
88
|
attempt = 0
|
|
90
89
|
begin
|
|
91
90
|
yield
|
|
92
91
|
rescue ServiceError, TimeoutError => e
|
|
93
92
|
attempt += 1
|
|
94
|
-
raise unless attempt <
|
|
93
|
+
raise unless attempt < max_attempts
|
|
95
94
|
|
|
96
|
-
|
|
95
|
+
backoff = 2**attempt
|
|
96
|
+
warn "[rubycrawl] Attempt #{attempt + 1}/#{max_attempts} failed, retrying in #{backoff}s: #{e.message}"
|
|
97
|
+
sleep(backoff)
|
|
97
98
|
retry
|
|
98
99
|
end
|
|
99
100
|
end
|
|
100
101
|
|
|
101
|
-
def load_options(options)
|
|
102
|
-
@host = options.fetch(:host, DEFAULT_HOST)
|
|
103
|
-
@port = Integer(options.fetch(:port, DEFAULT_PORT))
|
|
104
|
-
@node_dir = options.fetch(:node_dir, default_node_dir)
|
|
105
|
-
@node_bin = options.fetch(:node_bin, ENV.fetch('RUBYCRAWL_NODE_BIN', nil)) || 'node'
|
|
106
|
-
@node_log = options.fetch(:node_log, ENV.fetch('RUBYCRAWL_NODE_LOG', nil))
|
|
107
|
-
@wait_until = options.fetch(:wait_until, nil)
|
|
108
|
-
@block_resources = options.fetch(:block_resources, nil)
|
|
109
|
-
@max_retries = options.fetch(:max_retries, 3)
|
|
110
|
-
end
|
|
111
|
-
|
|
112
|
-
def build_service_client
|
|
113
|
-
@service_client = ServiceClient.new(
|
|
114
|
-
host: @host,
|
|
115
|
-
port: @port,
|
|
116
|
-
node_dir: @node_dir,
|
|
117
|
-
node_bin: @node_bin,
|
|
118
|
-
node_log: @node_log
|
|
119
|
-
)
|
|
120
|
-
end
|
|
121
|
-
|
|
122
|
-
def retry_with_backoff(attempt, retries, error)
|
|
123
|
-
backoff_seconds = 2**attempt
|
|
124
|
-
warn "[rubycrawl] Retry #{attempt}/#{retries - 1} after #{backoff_seconds}s: #{error.message}"
|
|
125
|
-
sleep(backoff_seconds)
|
|
126
|
-
end
|
|
127
|
-
|
|
128
102
|
def build_crawler_options(options)
|
|
129
103
|
{
|
|
130
|
-
max_pages:
|
|
131
|
-
max_depth:
|
|
132
|
-
same_host_only:
|
|
133
|
-
wait_until:
|
|
134
|
-
block_resources: options.fetch(:block_resources, @block_resources)
|
|
104
|
+
max_pages: options.fetch(:max_pages, 50),
|
|
105
|
+
max_depth: options.fetch(:max_depth, 3),
|
|
106
|
+
same_host_only: options.fetch(:same_host_only, true),
|
|
107
|
+
wait_until: options.fetch(:wait_until, @wait_until),
|
|
108
|
+
block_resources: options.fetch(:block_resources, @block_resources),
|
|
109
|
+
max_attempts: options.fetch(:max_attempts, @max_attempts)
|
|
135
110
|
}
|
|
136
111
|
end
|
|
137
|
-
|
|
138
|
-
def default_node_dir
|
|
139
|
-
File.expand_path('../node', __dir__)
|
|
140
|
-
end
|
|
141
112
|
end
|
data/rubycrawl.gemspec
CHANGED
|
@@ -8,21 +8,21 @@ Gem::Specification.new do |spec|
|
|
|
8
8
|
spec.authors = ['RubyCrawl contributors']
|
|
9
9
|
spec.email = ['ganesh.navale@zohomail.in']
|
|
10
10
|
|
|
11
|
-
spec.summary = '
|
|
12
|
-
spec.description = '
|
|
11
|
+
spec.summary = 'Pure Ruby web crawler with full JavaScript rendering'
|
|
12
|
+
spec.description = 'rubycrawl uses Ferrum (Chrome DevTools Protocol) for JS rendering.'
|
|
13
13
|
spec.homepage = 'https://github.com/craft-wise/rubycrawl'
|
|
14
14
|
spec.license = 'MIT'
|
|
15
15
|
|
|
16
16
|
spec.required_ruby_version = '>= 3.0'
|
|
17
17
|
|
|
18
|
-
spec.files
|
|
19
|
-
spec.files += %w[README.md LICENSE
|
|
18
|
+
spec.files = Dir.glob('{lib}/**/*', File::FNM_DOTMATCH).reject { |f| File.directory?(f) }
|
|
19
|
+
spec.files += %w[README.md LICENSE Rakefile rubycrawl.gemspec .rspec]
|
|
20
20
|
|
|
21
21
|
spec.bindir = 'bin'
|
|
22
22
|
spec.executables = []
|
|
23
23
|
spec.require_paths = ['lib']
|
|
24
24
|
|
|
25
|
-
|
|
25
|
+
spec.add_dependency 'ferrum', '~> 0.15'
|
|
26
26
|
spec.add_dependency 'reverse_markdown', '~> 2.1'
|
|
27
27
|
|
|
28
28
|
spec.metadata['rubygems_mfa_required'] = 'true'
|
metadata
CHANGED
|
@@ -1,15 +1,29 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: rubycrawl
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.3
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- RubyCrawl contributors
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-03-16 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: ferrum
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0.15'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0.15'
|
|
13
27
|
- !ruby/object:Gem::Dependency
|
|
14
28
|
name: reverse_markdown
|
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -24,7 +38,7 @@ dependencies:
|
|
|
24
38
|
- - "~>"
|
|
25
39
|
- !ruby/object:Gem::Version
|
|
26
40
|
version: '2.1'
|
|
27
|
-
description:
|
|
41
|
+
description: rubycrawl uses Ferrum (Chrome DevTools Protocol) for JS rendering.
|
|
28
42
|
email:
|
|
29
43
|
- ganesh.navale@zohomail.in
|
|
30
44
|
executables: []
|
|
@@ -32,17 +46,17 @@ extensions: []
|
|
|
32
46
|
extra_rdoc_files: []
|
|
33
47
|
files:
|
|
34
48
|
- ".rspec"
|
|
35
|
-
- Gemfile
|
|
36
49
|
- LICENSE
|
|
37
50
|
- README.md
|
|
38
51
|
- Rakefile
|
|
39
52
|
- lib/rubycrawl.rb
|
|
53
|
+
- lib/rubycrawl/browser.rb
|
|
54
|
+
- lib/rubycrawl/browser/extraction.rb
|
|
40
55
|
- lib/rubycrawl/errors.rb
|
|
41
56
|
- lib/rubycrawl/helpers.rb
|
|
42
57
|
- lib/rubycrawl/markdown_converter.rb
|
|
43
58
|
- lib/rubycrawl/railtie.rb
|
|
44
59
|
- lib/rubycrawl/result.rb
|
|
45
|
-
- lib/rubycrawl/service_client.rb
|
|
46
60
|
- lib/rubycrawl/site_crawler.rb
|
|
47
61
|
- lib/rubycrawl/tasks/install.rake
|
|
48
62
|
- lib/rubycrawl/url_normalizer.rb
|
|
@@ -71,5 +85,5 @@ requirements: []
|
|
|
71
85
|
rubygems_version: 3.5.22
|
|
72
86
|
signing_key:
|
|
73
87
|
specification_version: 4
|
|
74
|
-
summary:
|
|
88
|
+
summary: Pure Ruby web crawler with full JavaScript rendering
|
|
75
89
|
test_files: []
|
data/lib/rubycrawl/service_client.rb
DELETED
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'json'
|
|
4
|
-
require 'net/http'
|
|
5
|
-
require 'uri'
|
|
6
|
-
|
|
7
|
-
class RubyCrawl
|
|
8
|
-
# Handles node service lifecycle and HTTP requests.
|
|
9
|
-
class ServiceClient
|
|
10
|
-
def initialize(host:, port:, node_dir:, node_bin:, node_log:)
|
|
11
|
-
@host = host
|
|
12
|
-
@port = Integer(port)
|
|
13
|
-
@node_dir = node_dir
|
|
14
|
-
@node_bin = node_bin
|
|
15
|
-
@node_log = node_log
|
|
16
|
-
@node_pid = nil
|
|
17
|
-
end
|
|
18
|
-
|
|
19
|
-
def ensure_running
|
|
20
|
-
return if healthy?
|
|
21
|
-
|
|
22
|
-
start_service
|
|
23
|
-
wait_until_healthy
|
|
24
|
-
end
|
|
25
|
-
|
|
26
|
-
def post_json(path, body)
|
|
27
|
-
uri = URI("http://#{@host}:#{@port}#{path}")
|
|
28
|
-
request = build_request(uri, body)
|
|
29
|
-
response = perform_request(uri, request)
|
|
30
|
-
JSON.parse(response.body)
|
|
31
|
-
rescue JSON::ParserError => e
|
|
32
|
-
raise ServiceError, "Node service returned invalid JSON: #{e.message}"
|
|
33
|
-
rescue Errno::ECONNREFUSED, Errno::ECONNRESET => e
|
|
34
|
-
raise ServiceError, "Cannot connect to node service at #{uri}: #{e.message}"
|
|
35
|
-
rescue Net::OpenTimeout, Net::ReadTimeout => e
|
|
36
|
-
raise TimeoutError, "Request to node service timed out: #{e.message}"
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
private
|
|
40
|
-
|
|
41
|
-
def build_request(uri, body)
|
|
42
|
-
request = Net::HTTP::Post.new(uri)
|
|
43
|
-
request['Content-Type'] = 'application/json'
|
|
44
|
-
request.body = JSON.generate(body)
|
|
45
|
-
request
|
|
46
|
-
end
|
|
47
|
-
|
|
48
|
-
def perform_request(uri, request)
|
|
49
|
-
Net::HTTP.start(uri.host, uri.port, open_timeout: 5, read_timeout: 30) do |http|
|
|
50
|
-
http.request(request)
|
|
51
|
-
end
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
def start_service
|
|
55
|
-
raise ServiceError, "rubycrawl node service directory not found: #{@node_dir}" unless Dir.exist?(@node_dir)
|
|
56
|
-
|
|
57
|
-
env = { 'RUBYCRAWL_NODE_PORT' => @port.to_s }
|
|
58
|
-
out = @node_log ? File.open(@node_log, 'a') : File::NULL
|
|
59
|
-
err = @node_log ? out : File::NULL
|
|
60
|
-
@node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: out, err: err)
|
|
61
|
-
Process.detach(@node_pid)
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
def wait_until_healthy(timeout: 5)
|
|
65
|
-
deadline = Time.now + timeout
|
|
66
|
-
until Time.now > deadline
|
|
67
|
-
return true if healthy?
|
|
68
|
-
|
|
69
|
-
sleep 0.2
|
|
70
|
-
end
|
|
71
|
-
|
|
72
|
-
raise ServiceError, "rubycrawl node service failed to start within #{timeout}s. " \
|
|
73
|
-
"Check logs at #{@node_log || 'RUBYCRAWL_NODE_LOG'}"
|
|
74
|
-
end
|
|
75
|
-
|
|
76
|
-
def healthy?
|
|
77
|
-
uri = URI("http://#{@host}:#{@port}/health")
|
|
78
|
-
response = Net::HTTP.start(uri.host, uri.port, open_timeout: 1, read_timeout: 1) do |http|
|
|
79
|
-
http.get(uri.request_uri)
|
|
80
|
-
end
|
|
81
|
-
response.is_a?(Net::HTTPSuccess)
|
|
82
|
-
rescue StandardError
|
|
83
|
-
false
|
|
84
|
-
end
|
|
85
|
-
end
|
|
86
|
-
end
|