RubyGems - rubycrawl - Versions diffs - 0.1.4 → 0.2.0 - Mend

rubycrawl 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

checksums.yaml +4 -4
data/README.md +167 -432
data/lib/rubycrawl/browser/extraction.rb +106 -0
data/lib/rubycrawl/browser.rb +106 -0
data/lib/rubycrawl/errors.rb +1 -1
data/lib/rubycrawl/helpers.rb +8 -44
data/lib/rubycrawl/markdown_converter.rb +2 -2
data/lib/rubycrawl/result.rb +49 -18
data/lib/rubycrawl/site_crawler.rb +40 -22
data/lib/rubycrawl/tasks/install.rake +17 -56
data/lib/rubycrawl/url_normalizer.rb +5 -1
data/lib/rubycrawl/version.rb +1 -1
data/lib/rubycrawl.rb +35 -90
data/rubycrawl.gemspec +3 -4
metadata +19 -10
data/lib/rubycrawl/service_client.rb +0 -108
data/node/.gitignore +0 -2
data/node/.npmrc +0 -1
data/node/README.md +0 -19
data/node/package-lock.json +0 -72
data/node/package.json +0 -14
data/node/src/index.js +0 -389

data/lib/rubycrawl.rb CHANGED Viewed

@@ -3,25 +3,26 @@
 require_relative 'rubycrawl/version'
 require_relative 'rubycrawl/errors'
 require_relative 'rubycrawl/helpers'
-require_relative 'rubycrawl/service_client'
+require_relative 'rubycrawl/browser'
 require_relative 'rubycrawl/url_normalizer'
 require_relative 'rubycrawl/markdown_converter'
 require_relative 'rubycrawl/result'
 require_relative 'rubycrawl/site_crawler'
 require_relative 'rubycrawl/railtie' if defined?(Rails)
-# RubyCrawl provides a simple interface for crawling pages via a local Playwright service.
+# RubyCrawl — pure Ruby web crawler with full JavaScript rendering via Ferrum.
 class RubyCrawl
   include Helpers
-  DEFAULT_HOST = '127.0.0.1'
-  DEFAULT_PORT = 3344
   class << self
     def client
       @client ||= new
     end
+    # Crawl a single URL and return a Result.
+    # @param url [String]
+    # @param options [Hash] wait_until:, block_resources:, max_attempts:
+    # @return [RubyCrawl::Result]
     def crawl(url, **options)
       client.crawl(url, **options)
     end
@@ -34,29 +35,17 @@ class RubyCrawl
     # @param max_depth [Integer] Maximum link depth from start URL (default: 3)
     # @param same_host_only [Boolean] Only follow links on the same host (default: true)
     # @yield [page] Yields each page result as it is crawled
-    # @yieldparam page [SiteCrawler::PageResult] The crawled page result
+    # @yieldparam page [SiteCrawler::PageResult]
     # @return [Integer] Number of pages crawled
     #
-    # @example Save pages to database
+    # @example
     #   RubyCrawl.crawl_site("https://example.com", max_pages: 100) do |page|
-    #     Page.create!(url: page.url, html: page.html, depth: page.depth)
+    #     Page.create!(url: page.url, content: page.clean_text, depth: page.depth)
     #   end
     def crawl_site(url, ...)
       client.crawl_site(url, ...)
     end
-    # Create a session for reusing browser context across multiple crawls.
-    # @return [String] session_id
-    def create_session
-      client.create_session
-    end
-    # Destroy a session and close its browser context.
-    # @param session_id [String]
-    def destroy_session(session_id)
-      client.destroy_session(session_id)
-    end
     def configure(**options)
       @client = new(**options)
     end
@@ -64,104 +53,60 @@ class RubyCrawl
   def initialize(**options)
     load_options(options)
-    build_service_client
+    @browser = Browser.new(
+      timeout:         @timeout,
+      headless:        @headless,
+      browser_options: @browser_options
+    )
   end
-  def crawl(url, wait_until: @wait_until, block_resources: @block_resources, max_attempts: @max_attempts, session_id: nil)
+  def crawl(url, wait_until: @wait_until, block_resources: @block_resources, max_attempts: @max_attempts)
     validate_url!(url)
-    @service_client.ensure_running
+    validate_wait_until!(wait_until)
     with_retries(max_attempts) do
-      payload = build_payload(url, wait_until, block_resources, session_id)
-      response = @service_client.post_json('/crawl', payload)
-      raise_node_error!(response)
-      build_result(response)
+      @browser.crawl(url, wait_until: wait_until, block_resources: block_resources)
     end
   end
-  # Create a session for reusing browser context.
-  # @return [String] session_id
-  def create_session
-    @service_client.ensure_running
-    @service_client.create_session
-  end
-  # Destroy a session.
-  # @param session_id [String]
-  def destroy_session(session_id)
-    @service_client.destroy_session(session_id)
-  end
-  # Crawl multiple pages starting from a URL, following links.
-  # @see RubyCrawl.crawl_site
   def crawl_site(url, **options, &block)
-    @service_client.ensure_running
     crawler_options = build_crawler_options(options)
-    crawler = SiteCrawler.new(self, crawler_options)
-    crawler.crawl(url, &block)
+    SiteCrawler.new(self, crawler_options).crawl(url, &block)
   end
   private
-  def raise_node_error!(response)
-    return unless response.is_a?(Hash) && response['error']
-    error_code = response['error']
-    error_message = response['message'] || error_code
-    raise error_class_for(error_code), error_message_for(error_code, error_message)
+  def load_options(options)
+    @wait_until      = options.fetch(:wait_until, nil)
+    @block_resources = options.fetch(:block_resources, nil)
+    @max_attempts    = options.fetch(:max_attempts, 3)
+    @timeout         = options.fetch(:timeout, 30)
+    @headless        = options.fetch(:headless, true)
+    @browser_options = options.fetch(:browser_options, {})
   end
-  def with_retries(retries)
+  def with_retries(max_attempts)
     attempt = 0
     begin
       yield
     rescue ServiceError, TimeoutError => e
       attempt += 1
-      raise unless attempt < retries
+      raise unless attempt < max_attempts
-      retry_with_backoff(attempt, retries, e)
+      backoff = 2**attempt
+      warn "[rubycrawl] Attempt #{attempt + 1}/#{max_attempts} failed, retrying in #{backoff}s: #{e.message}"
+      sleep(backoff)
       retry
     end
   end
-  def load_options(options)
-    @host = options.fetch(:host, DEFAULT_HOST)
-    @port = Integer(options.fetch(:port, DEFAULT_PORT))
-    @node_dir = options.fetch(:node_dir, default_node_dir)
-    @node_bin = options.fetch(:node_bin, ENV.fetch('RUBYCRAWL_NODE_BIN', nil)) || 'node'
-    @node_log = options.fetch(:node_log, ENV.fetch('RUBYCRAWL_NODE_LOG', nil))
-    @wait_until = options.fetch(:wait_until, nil)
-    @block_resources = options.fetch(:block_resources, nil)
-    @max_attempts = options.fetch(:max_attempts, 3)
-  end
-  def build_service_client
-    @service_client = ServiceClient.new(
-      host: @host,
-      port: @port,
-      node_dir: @node_dir,
-      node_bin: @node_bin,
-      node_log: @node_log
-    )
-  end
-  def retry_with_backoff(attempt, max_attempts, error)
-    backoff_seconds = 2**attempt
-    warn "[rubycrawl] Attempt #{attempt + 1}/#{max_attempts} failed, retrying in #{backoff_seconds}s: #{error.message}"
-    sleep(backoff_seconds)
-  end
   def build_crawler_options(options)
     {
-      max_pages: options.fetch(:max_pages, 50),
-      max_depth: options.fetch(:max_depth, 3),
-      same_host_only: options.fetch(:same_host_only, true),
-      wait_until: options.fetch(:wait_until, @wait_until),
+      max_pages:       options.fetch(:max_pages, 50),
+      max_depth:       options.fetch(:max_depth, 3),
+      same_host_only:  options.fetch(:same_host_only, true),
+      wait_until:      options.fetch(:wait_until, @wait_until),
       block_resources: options.fetch(:block_resources, @block_resources),
-      max_attempts: options.fetch(:max_attempts, @max_attempts)
+      max_attempts:    options.fetch(:max_attempts, @max_attempts)
     }
   end
-  def default_node_dir
-    File.expand_path('../node', __dir__)
-  end
 end

data/rubycrawl.gemspec CHANGED Viewed

@@ -8,22 +8,21 @@ Gem::Specification.new do |spec|
   spec.authors = ['RubyCrawl contributors']
   spec.email = ['ganesh.navale@zohomail.in']
-  spec.summary = 'Playwright-based web crawler for Ruby'
-  spec.description = 'A Ruby-first web crawler that orchestrates a local Playwright service.'
+  spec.summary = 'Pure Ruby web crawler with full JavaScript rendering'
+  spec.description = 'rubycrawl uses Ferrum (Chrome DevTools Protocol) for JS rendering.'
   spec.homepage = 'https://github.com/craft-wise/rubycrawl'
   spec.license = 'MIT'
   spec.required_ruby_version = '>= 3.0'
   spec.files  = Dir.glob('{lib}/**/*', File::FNM_DOTMATCH).reject { |f| File.directory?(f) }
-  spec.files += Dir.glob('node/**/*', File::FNM_DOTMATCH).reject { |f| File.directory?(f) || f.include?('node_modules') }
   spec.files += %w[README.md LICENSE Rakefile rubycrawl.gemspec .rspec]
   spec.bindir = 'bin'
   spec.executables = []
   spec.require_paths = ['lib']
-  # For HTML to Markdown conversion
+  spec.add_dependency 'ferrum',           '~> 0.15'
   spec.add_dependency 'reverse_markdown', '~> 2.1'
   spec.metadata['rubygems_mfa_required'] = 'true'

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rubycrawl
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.2.0
 platform: ruby
 authors:
 - RubyCrawl contributors
@@ -10,6 +10,20 @@ bindir: bin
 cert_chain: []
 date: 2026-03-16 00:00:00.000000000 Z
 dependencies:
+- !ruby/object:Gem::Dependency
+  name: ferrum
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.15'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.15'
 - !ruby/object:Gem::Dependency
   name: reverse_markdown
   requirement: !ruby/object:Gem::Requirement
@@ -24,7 +38,7 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '2.1'
-description: A Ruby-first web crawler that orchestrates a local Playwright service.
+description: rubycrawl uses Ferrum (Chrome DevTools Protocol) for JS rendering.
 email:
 - ganesh.navale@zohomail.in
 executables: []
@@ -36,22 +50,17 @@ files:
 - README.md
 - Rakefile
 - lib/rubycrawl.rb
+- lib/rubycrawl/browser.rb
+- lib/rubycrawl/browser/extraction.rb
 - lib/rubycrawl/errors.rb
 - lib/rubycrawl/helpers.rb
 - lib/rubycrawl/markdown_converter.rb
 - lib/rubycrawl/railtie.rb
 - lib/rubycrawl/result.rb
-- lib/rubycrawl/service_client.rb
 - lib/rubycrawl/site_crawler.rb
 - lib/rubycrawl/tasks/install.rake
 - lib/rubycrawl/url_normalizer.rb
 - lib/rubycrawl/version.rb
-- node/.gitignore
-- node/.npmrc
-- node/README.md
-- node/package-lock.json
-- node/package.json
-- node/src/index.js
 - rubycrawl.gemspec
 homepage: https://github.com/craft-wise/rubycrawl
 licenses:
@@ -76,5 +85,5 @@ requirements: []
 rubygems_version: 3.5.22
 signing_key:
 specification_version: 4
-summary: Playwright-based web crawler for Ruby
+summary: Pure Ruby web crawler with full JavaScript rendering
 test_files: []

data/lib/rubycrawl/service_client.rb DELETED Viewed

@@ -1,108 +0,0 @@
-# frozen_string_literal: true
-require 'json'
-require 'net/http'
-require 'uri'
-class RubyCrawl
-  # Handles node service lifecycle and HTTP requests.
-  class ServiceClient
-    def initialize(host:, port:, node_dir:, node_bin:, node_log:)
-      @host = host
-      @port = Integer(port)
-      @node_dir = node_dir
-      @node_bin = node_bin
-      @node_log = node_log
-      @node_pid = nil
-    end
-    def ensure_running
-      return if healthy?
-      start_service
-      wait_until_healthy
-    end
-    def post_json(path, body)
-      uri = URI("http://#{@host}:#{@port}#{path}")
-      request = build_request(uri, body)
-      response = perform_request(uri, request)
-      JSON.parse(response.body)
-    rescue JSON::ParserError => e
-      raise ServiceError, "Node service returned invalid JSON: #{e.message}"
-    rescue Errno::ECONNREFUSED, Errno::ECONNRESET => e
-      raise ServiceError, "Cannot connect to node service at #{uri}: #{e.message}"
-    rescue Net::OpenTimeout, Net::ReadTimeout => e
-      raise TimeoutError, "Request to node service timed out: #{e.message}"
-    end
-    # Create a session for reusing browser context across multiple crawls.
-    # @return [String] session_id
-    def create_session
-      response = post_json('/session/create', {})
-      raise ServiceError, "Failed to create session: #{response['error']}" if response['error']
-      response['session_id']
-    end
-    # Destroy a session and close its browser context.
-    # @param session_id [String]
-    def destroy_session(session_id)
-      post_json('/session/destroy', { session_id: session_id })
-    rescue StandardError
-      # Ignore errors on destroy - context may already be closed
-      nil
-    end
-    private
-    def build_request(uri, body)
-      request = Net::HTTP::Post.new(uri)
-      request['Content-Type'] = 'application/json'
-      request.body = JSON.generate(body)
-      request
-    end
-    def perform_request(uri, request)
-      Net::HTTP.start(uri.host, uri.port, open_timeout: 5, read_timeout: 30) do |http|
-        http.request(request)
-      end
-    end
-    def start_service
-      raise ServiceError, "rubycrawl node service directory not found: #{@node_dir}" unless Dir.exist?(@node_dir)
-      env = { 'RUBYCRAWL_NODE_PORT' => @port.to_s }
-      if @node_log
-        out = File.open(@node_log, 'a')
-        @node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: out, err: out)
-        out.close
-      else
-        @node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: File::NULL, err: File::NULL)
-      end
-      Process.detach(@node_pid)
-    end
-    def wait_until_healthy(timeout: 5)
-      deadline = Time.now + timeout
-      until Time.now > deadline
-        return true if healthy?
-        sleep 0.2
-      end
-      raise ServiceError, "rubycrawl node service failed to start within #{timeout}s. " \
-                          "Check logs at #{@node_log || 'RUBYCRAWL_NODE_LOG'}"
-    end
-    def healthy?
-      uri = URI("http://#{@host}:#{@port}/health")
-      response = Net::HTTP.start(uri.host, uri.port, open_timeout: 1, read_timeout: 1) do |http|
-        http.get(uri.request_uri)
-      end
-      response.is_a?(Net::HTTPSuccess)
-    rescue StandardError
-      false
-    end
-  end
-end

data/node/.gitignore DELETED Viewed

@@ -1,2 +0,0 @@
-/node_modules
-/.env

data/node/.npmrc DELETED Viewed

@@ -1 +0,0 @@
-fund=false

data/node/README.md DELETED Viewed

@@ -1,19 +0,0 @@
-# rubycrawl Node Service
-Local Playwright-backed HTTP service used by the Ruby gem.
-## Run
-```
-npm install
-npm start
-```
-## Environment
-Create a `.env` file (or copy from `.env.example`) if you need custom settings.
-## Endpoints
-- `POST /crawl` JSON body: `{ "url": "https://example.com" }`
-- `GET /health`

data/node/package-lock.json DELETED Viewed

@@ -1,72 +0,0 @@
-{
-  "name": "rubycrawl-node",
-  "version": "0.1.0",
-  "lockfileVersion": 3,
-  "requires": true,
-  "packages": {
-    "": {
-      "name": "rubycrawl-node",
-      "version": "0.1.0",
-      "dependencies": {
-        "dotenv": "^16.4.5",
-        "playwright": "^1.41.0"
-      }
-    },
-    "node_modules/dotenv": {
-      "version": "16.6.1",
-      "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
-      "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==",
-      "license": "BSD-2-Clause",
-      "engines": {
-        "node": ">=12"
-      },
-      "funding": {
-        "url": "https://dotenvx.com"
-      }
-    },
-    "node_modules/fsevents": {
-      "version": "2.3.2",
-      "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
-      "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
-      "hasInstallScript": true,
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "darwin"
-      ],
-      "engines": {
-        "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
-      }
-    },
-    "node_modules/playwright": {
-      "version": "1.58.0",
-      "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.58.0.tgz",
-      "integrity": "sha512-2SVA0sbPktiIY/MCOPX8e86ehA/e+tDNq+e5Y8qjKYti2Z/JG7xnronT/TXTIkKbYGWlCbuucZ6dziEgkoEjQQ==",
-      "license": "Apache-2.0",
-      "dependencies": {
-        "playwright-core": "1.58.0"
-      },
-      "bin": {
-        "playwright": "cli.js"
-      },
-      "engines": {
-        "node": ">=18"
-      },
-      "optionalDependencies": {
-        "fsevents": "2.3.2"
-      }
-    },
-    "node_modules/playwright-core": {
-      "version": "1.58.0",
-      "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.58.0.tgz",
-      "integrity": "sha512-aaoB1RWrdNi3//rOeKuMiS65UCcgOVljU46At6eFcOFPFHWtd2weHRRow6z/n+Lec0Lvu0k9ZPKJSjPugikirw==",
-      "license": "Apache-2.0",
-      "bin": {
-        "playwright-core": "cli.js"
-      },
-      "engines": {
-        "node": ">=18"
-      }
-    }
-  }
-}

data/node/package.json DELETED Viewed

@@ -1,14 +0,0 @@
-{
-  "name": "rubycrawl-node",
-  "version": "0.1.0",
-  "private": true,
-  "type": "module",
-  "main": "src/index.js",
-  "scripts": {
-    "start": "node src/index.js"
-  },
-  "dependencies": {
-    "dotenv": "^16.4.5",
-    "playwright": "^1.41.0"
-  }
-}