rubycrawl 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +167 -432
- data/lib/rubycrawl/browser/extraction.rb +106 -0
- data/lib/rubycrawl/browser.rb +106 -0
- data/lib/rubycrawl/errors.rb +1 -1
- data/lib/rubycrawl/helpers.rb +8 -44
- data/lib/rubycrawl/markdown_converter.rb +2 -2
- data/lib/rubycrawl/result.rb +49 -18
- data/lib/rubycrawl/site_crawler.rb +40 -22
- data/lib/rubycrawl/tasks/install.rake +17 -56
- data/lib/rubycrawl/url_normalizer.rb +5 -1
- data/lib/rubycrawl/version.rb +1 -1
- data/lib/rubycrawl.rb +35 -90
- data/rubycrawl.gemspec +3 -4
- metadata +19 -10
- data/lib/rubycrawl/service_client.rb +0 -108
- data/node/.gitignore +0 -2
- data/node/.npmrc +0 -1
- data/node/README.md +0 -19
- data/node/package-lock.json +0 -72
- data/node/package.json +0 -14
- data/node/src/index.js +0 -389
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Explicit dependency: NOISE_SELECTORS.to_json below runs at load time.
# Previously this file relied on ferrum having loaded json transitively.
require 'json'

class RubyCrawl
  class Browser
    # JavaScript extraction constants, evaluated inside Chromium via page.evaluate().
    # Ported verbatim from node/src/index.js — logic is unchanged.
    # NOISE_SELECTORS is interpolated directly into EXTRACT_CONTENT_JS (no need to
    # pass as a JS argument as the Node version did).
    module Extraction
      # All constants are IIFEs — Ferrum's page.evaluate() evaluates an expression,
      # it does NOT call function definitions. Wrapping as (() => { ... })() ensures
      # the function is immediately invoked and its return value is captured.
      #
      # NOTE: `${...}` inside the heredocs is JavaScript template interpolation;
      # Ruby only interpolates `#{'#{...}'}`, so the JS embeds safely as-is.
      # All heredocs are explicitly frozen for consistency (the frozen_string_literal
      # magic comment is not relied on for heredoc bodies).
      EXTRACT_METADATA_JS = <<~JS.freeze
        (() => {
          const getMeta = (name) => {
            const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
            return meta?.getAttribute("content") || null;
          };
          const getLink = (rel) => {
            const link = document.querySelector(`link[rel="${rel}"]`);
            return link?.getAttribute("href") || null;
          };
          return {
            title: document.title || null,
            description: getMeta("description") || getMeta("og:description") || null,
            keywords: getMeta("keywords"),
            author: getMeta("author"),
            og_title: getMeta("og:title"),
            og_description: getMeta("og:description"),
            og_image: getMeta("og:image"),
            og_url: getMeta("og:url"),
            og_type: getMeta("og:type"),
            twitter_card: getMeta("twitter:card"),
            twitter_title: getMeta("twitter:title"),
            twitter_description: getMeta("twitter:description"),
            twitter_image: getMeta("twitter:image"),
            canonical: getLink("canonical"),
            lang: document.documentElement.lang || null,
            charset: document.characterSet || null,
          };
        })()
      JS

      # Every <a href> on the page with its text, title and rel attributes.
      EXTRACT_LINKS_JS = <<~JS.freeze
        (() => Array.from(document.querySelectorAll("a[href]")).map(link => ({
          url: link.href,
          text: (link.textContent || "").trim(),
          title: link.getAttribute("title") || null,
          rel: link.getAttribute("rel") || null,
        })))()
      JS

      # Visible body text exactly as the browser renders it.
      EXTRACT_RAW_TEXT_JS = <<~JS.freeze
        (() => (document.body?.innerText || "").trim())()
      JS

      # Semantic noise selectors — covers standard HTML5 elements and ARIA roles.
      # Interpolated directly into EXTRACT_CONTENT_JS as a string literal.
      NOISE_SELECTORS = [
        'nav', 'header', 'footer', 'aside',
        '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
        '[role="complementary"]', '[role="dialog"]', '[role="tooltip"]',
        '[role="alert"]', '[aria-hidden="true"]',
        'script', 'style', 'noscript', 'iframe'
      ].join(', ').freeze

      # Removes semantic noise (nav/header/footer/aside + ARIA roles) and high
      # link-density containers, then returns both clean plain text and clean HTML.
      # DOM mutations are reversed after extraction so the page is unchanged.
      EXTRACT_CONTENT_JS = <<~JS.freeze
        (() => {
          const noiseSelectors = #{NOISE_SELECTORS.to_json};
          function linkDensity(el) {
            const total = (el.innerText || "").trim().length;
            if (!total) return 1;
            const linked = Array.from(el.querySelectorAll("a"))
              .reduce((sum, a) => sum + (a.innerText || "").trim().length, 0);
            return linked / total;
          }
          const removed = [];
          function stash(el) {
            if (el.parentNode) {
              removed.push({ el, parent: el.parentNode, next: el.nextSibling });
              el.parentNode.removeChild(el);
            }
          }
          document.body.querySelectorAll(noiseSelectors).forEach(stash);
          const blockTags = new Set(["script", "style", "noscript", "link", "meta"]);
          const topChildren = Array.from(document.body.children)
            .filter(el => !blockTags.has(el.tagName.toLowerCase()));
          const roots = topChildren.length === 1
            ? [document.body, topChildren[0]] : [document.body];
          for (const root of roots) {
            for (const el of Array.from(root.children)) {
              const text = (el.innerText || "").trim();
              if (text.length >= 20 && linkDensity(el) > 0.5) stash(el);
            }
          }
          const cleanHtml = document.body.innerHTML;
          removed.reverse().forEach(({ el, parent, next }) => parent.insertBefore(el, next));
          return { cleanHtml };
        })()
      JS
    end
  end
end
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'ferrum'
|
|
4
|
+
require_relative 'result'
|
|
5
|
+
require_relative 'errors'
|
|
6
|
+
require_relative 'browser/extraction'
|
|
7
|
+
|
|
8
|
+
class RubyCrawl
  # Wraps Ferrum to provide a simple crawl interface.
  # Each crawl gets its own isolated page (new_context: true => own cookies/storage).
  # The Chrome process is launched once, lazily, and reused across crawls.
  class Browser
    # Resource types aborted when block_resources is enabled (speed optimisation).
    BLOCKED_RESOURCE_TYPES = %w[image media font stylesheet].freeze

    # @param timeout [Integer] navigation timeout in seconds, passed to Ferrum
    # @param headless [Boolean] run Chrome without a visible window
    # @param browser_options [Hash] extra Chrome CLI switches forwarded to Ferrum
    def initialize(timeout: 30, headless: true, browser_options: {})
      @timeout = timeout
      @headless = headless
      @browser_options = browser_options
      @browser = nil
      @mutex = Mutex.new
    end

    # Crawl a URL and return a RubyCrawl::Result.
    #
    # @param url [String]
    # @param wait_until [String, Symbol, nil] "load", "domcontentloaded", "networkidle", "commit"
    # @param block_resources [Boolean] block images/fonts/CSS/media for speed
    # @return [RubyCrawl::Result]
    # @raise [RubyCrawl::TimeoutError] when navigation exceeds the timeout
    # @raise [RubyCrawl::NavigationError] when navigation fails with an error status
    # @raise [RubyCrawl::ServiceError] for any other browser-level failure
    def crawl(url, wait_until: nil, block_resources: true)
      page = nil
      # FIX: create_page now runs inside the rescued body, so a ::Ferrum::Error
      # raised while opening the page is wrapped like every other browser
      # failure (previously it escaped to the caller as a raw Ferrum exception).
      page = lazy_browser.create_page(new_context: true)
      setup_resource_blocking(page) if block_resources
      navigate(page, url, wait_until.to_s)
      extract(page)
    rescue ::Ferrum::TimeoutError => e
      raise RubyCrawl::TimeoutError, "Navigation timed out: #{e.message}"
    rescue ::Ferrum::StatusError => e
      raise RubyCrawl::NavigationError, "Navigation failed: #{e.message}"
    rescue ::Ferrum::Error => e
      raise RubyCrawl::ServiceError, "Browser error: #{e.message}"
    ensure
      begin
        page&.close
      rescue StandardError
        nil # never let page teardown mask the primary error
      end
    end

    private

    # Lazy-initialise the Ferrum::Browser singleton.
    # Mutex prevents double-init in threaded environments (Puma).
    # Chrome launches after fork in forking servers (Unicorn) since @browser is nil in each worker.
    def lazy_browser
      @mutex.synchronize { @browser ||= launch_browser }
    rescue ::Ferrum::Error => e
      raise RubyCrawl::ServiceError, "Failed to launch browser: #{e.message}"
    end

    # Launch Chrome via Ferrum and register a best-effort shutdown hook.
    def launch_browser
      b = Ferrum::Browser.new(
        headless: @headless,
        timeout: @timeout,
        browser_options: @browser_options
      )
      at_exit do
        b.quit
      rescue StandardError
        nil # process is exiting anyway
      end
      b
    end

    # Abort requests for heavy resource types; let everything else continue.
    def setup_resource_blocking(page)
      page.network.intercept
      page.on(:request) do |request|
        BLOCKED_RESOURCE_TYPES.include?(request.resource_type) ? request.abort : request.continue
      end
    end

    def navigate(page, url, wait_until)
      page.go_to(url)
      # go_to waits for load by default. networkidle needs an extra wait.
      page.network.wait_for_idle(connections: 0, duration: 0.5) if wait_until == 'networkidle'
    end

    # Pull html, links, text and metadata out of the loaded page into a Result.
    def extract(page)
      html = page.body
      final_url = page.current_url
      metadata = page.evaluate(Extraction::EXTRACT_METADATA_JS)
      links = page.evaluate(Extraction::EXTRACT_LINKS_JS)
      raw_text = page.evaluate(Extraction::EXTRACT_RAW_TEXT_JS)
      content = page.evaluate(Extraction::EXTRACT_CONTENT_JS)

      Result.new(
        html: html,
        raw_text: raw_text.to_s,
        clean_html: content['cleanHtml'].to_s,
        links: Array(links),
        metadata: { 'final_url' => final_url }.merge(metadata || {})
      )
    end
  end
end
|
data/lib/rubycrawl/errors.rb
CHANGED
|
@@ -4,7 +4,7 @@ class RubyCrawl
|
|
|
4
4
|
# Base error class for all RubyCrawl errors
|
|
5
5
|
class Error < StandardError; end
|
|
6
6
|
|
|
7
|
-
# Raised when the
|
|
7
|
+
# Raised when the browser fails to launch or is unavailable
|
|
8
8
|
class ServiceError < Error; end
|
|
9
9
|
|
|
10
10
|
# Raised when page navigation fails (timeout, DNS, SSL, etc.)
|
data/lib/rubycrawl/helpers.rb
CHANGED
|
@@ -3,8 +3,10 @@
|
|
|
3
3
|
require 'uri'
|
|
4
4
|
|
|
5
5
|
class RubyCrawl
|
|
6
|
-
#
|
|
6
|
+
# Validation helpers mixed into RubyCrawl.
|
|
7
7
|
module Helpers
|
|
8
|
+
VALID_WAIT_UNTIL = %w[load domcontentloaded networkidle commit].freeze
|
|
9
|
+
|
|
8
10
|
private
|
|
9
11
|
|
|
10
12
|
def validate_url!(url)
|
|
@@ -21,50 +23,12 @@ class RubyCrawl
|
|
|
21
23
|
raise ConfigurationError, "Invalid URL: #{e.message}"
|
|
22
24
|
end
|
|
23
25
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
if wait_until && !VALID_WAIT_UNTIL.include?(wait_until.to_s)
|
|
28
|
-
raise ConfigurationError,
|
|
29
|
-
"Invalid wait_until: #{wait_until.inspect}. Must be one of: #{VALID_WAIT_UNTIL.join(', ')}"
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
payload = { url: url }
|
|
33
|
-
payload[:wait_until] = wait_until if wait_until
|
|
34
|
-
payload[:block_resources] = block_resources unless block_resources.nil?
|
|
35
|
-
payload[:session_id] = session_id if session_id
|
|
36
|
-
payload
|
|
37
|
-
end
|
|
26
|
+
# Ensures +wait_until+ is nil/absent or one of the supported readiness events.
#
# @param wait_until [String, Symbol, nil]
# @return [nil] when the value is acceptable
# @raise [ConfigurationError] for any unrecognised value
def validate_wait_until!(wait_until)
  return if !wait_until || VALID_WAIT_UNTIL.include?(wait_until.to_s)

  message = "Invalid wait_until: #{wait_until.inspect}. " \
            "Must be one of: #{VALID_WAIT_UNTIL.join(', ')}"
  raise ConfigurationError, message
end
|
|
69
33
|
end
|
|
70
34
|
end
|
data/lib/rubycrawl/result.rb
CHANGED
|
@@ -1,34 +1,47 @@
|
|
|
1
1
|
# frozen_string_literal: true

require 'cgi'

class RubyCrawl
  # Immutable result object returned from every crawl.
  # clean_text and clean_markdown are both derived lazily from clean_html so
  # they have consistent content coverage (including hidden/collapsed elements).
  class Result
    attr_reader :raw_text, :clean_html, :html, :links, :metadata

    # @param raw_text [String] visible page text as the browser rendered it
    # @param clean_html [String] noise-stripped body HTML ("" when unavailable)
    # @param html [String] full original page HTML
    # @param links [Array] extracted anchors
    # @param metadata [Hash] page metadata, including 'final_url'
    def initialize(raw_text:, clean_html:, html:, links:, metadata:)
      @raw_text = raw_text
      @clean_html = clean_html
      @html = html
      @links = links
      @metadata = metadata
    end

    # Plain text derived from noise-stripped HTML (falls back to the full HTML
    # when no clean HTML was captured). Captures hidden/collapsed content
    # (accordions, tabs) that innerText misses. Lazy — computed on first access.
    #
    # @return [String]
    def clean_text
      @clean_text ||= begin
        markup = clean_html.empty? ? html : clean_html
        html_to_text(markup)
      end
    end

    # Markdown derived from noise-stripped HTML (same fallback as clean_text).
    # Preserves document structure (headings, lists, links).
    # Lazy — computed on first access.
    #
    # @return [String]
    def clean_markdown
      @clean_markdown ||= begin
        markup = clean_html.empty? ? html : clean_html
        MarkdownConverter.convert(markup, base_url: final_url)
      end
    end

    # The final URL after redirects.
    # @return [String, nil]
    def final_url
      metadata['final_url']
    end

    # Whether clean_markdown has already been computed.
    # @return [Boolean]
    def clean_markdown?
      !@clean_markdown.nil?
    end

    # Hash form of the result. Lazy fields appear as nil until first accessed.
    def to_h
      {
        raw_text: raw_text,
        clean_text: @clean_text,
        clean_html: clean_html,
        html: html,
        links: links,
        metadata: metadata,
        clean_markdown: @clean_markdown
      }
    end

    private

    # Convert HTML to plain text without any external dependencies.
    # Block-level elements (p, div, h1-h6, li, br, etc.) become newlines
    # so paragraph structure is preserved. HTML entities are unescaped.
    def html_to_text(markup)
      with_breaks = markup.gsub(%r{</?(p|div|h[1-6]|li|br|tr|section|article|blockquote|pre)[^>]*>}i, "\n")
      tagless = with_breaks.gsub(/<[^>]+>/, '')
      decoded = CGI.unescapeHTML(tagless)
      decoded
        .gsub(/[ \t]+/, ' ')
        .gsub(/ *\n */, "\n")
        .gsub(/\n{3,}/, "\n\n")
        .strip
    end
  end
end
|
|
@@ -7,20 +7,30 @@ class RubyCrawl
|
|
|
7
7
|
class SiteCrawler
|
|
8
8
|
# Page result yielded to the block with lazy clean_markdown.
|
|
9
9
|
class PageResult
|
|
10
|
-
attr_reader :url, :html, :links, :metadata, :depth
|
|
11
|
-
|
|
12
|
-
def initialize(url:, html:, links:, metadata:, depth:)
|
|
13
|
-
@url
|
|
14
|
-
@html
|
|
15
|
-
@
|
|
16
|
-
@
|
|
17
|
-
@
|
|
10
|
+
attr_reader :url, :html, :raw_text, :clean_html, :links, :metadata, :depth
|
|
11
|
+
|
|
12
|
+
def initialize(url:, html:, raw_text:, clean_html:, links:, metadata:, depth:)
|
|
13
|
+
@url = url
|
|
14
|
+
@html = html
|
|
15
|
+
@raw_text = raw_text
|
|
16
|
+
@clean_html = clean_html
|
|
17
|
+
@links = links
|
|
18
|
+
@metadata = metadata
|
|
19
|
+
@depth = depth
|
|
18
20
|
end
|
|
19
21
|
|
|
20
|
-
#
|
|
21
|
-
|
|
22
|
+
# Plain text derived from noise-stripped HTML. Lazy — same as Result#clean_text.
|
|
23
|
+
def clean_text
|
|
24
|
+
@clean_text ||= Result.new(
|
|
25
|
+
html: html, raw_text: raw_text, clean_html: clean_html,
|
|
26
|
+
links: links, metadata: metadata
|
|
27
|
+
).clean_text
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# Markdown derived from noise-stripped HTML. Lazy — same as Result#clean_markdown.
|
|
22
31
|
def clean_markdown
|
|
23
|
-
|
|
32
|
+
source = clean_html.empty? ? html : clean_html
|
|
33
|
+
@clean_markdown ||= MarkdownConverter.convert(source, base_url: final_url)
|
|
24
34
|
end
|
|
25
35
|
|
|
26
36
|
# The final URL after redirects.
|
|
@@ -39,7 +49,6 @@ class RubyCrawl
|
|
|
39
49
|
@max_attempts = options.fetch(:max_attempts, nil)
|
|
40
50
|
@visited = Set.new
|
|
41
51
|
@queue = []
|
|
42
|
-
@session_id = nil
|
|
43
52
|
end
|
|
44
53
|
|
|
45
54
|
def crawl(start_url, &block)
|
|
@@ -49,11 +58,8 @@ class RubyCrawl
|
|
|
49
58
|
raise ConfigurationError, "Invalid start URL: #{start_url}" unless normalized
|
|
50
59
|
|
|
51
60
|
@base_url = normalized
|
|
52
|
-
@session_id = @client.create_session
|
|
53
61
|
enqueue(normalized, 0)
|
|
54
62
|
process_queue(&block)
|
|
55
|
-
ensure
|
|
56
|
-
@client.destroy_session(@session_id) if @session_id
|
|
57
63
|
end
|
|
58
64
|
|
|
59
65
|
private
|
|
@@ -78,12 +84,22 @@ class RubyCrawl
|
|
|
78
84
|
def process_page(url, depth)
|
|
79
85
|
@visited.add(url)
|
|
80
86
|
result = crawl_page(url, depth)
|
|
81
|
-
|
|
87
|
+
return unless result
|
|
88
|
+
|
|
89
|
+
# Mark final_url visited to prevent re-crawling after redirects
|
|
90
|
+
# e.g. axonchat.ai → www.axonchat.ai should not crawl www again.
|
|
91
|
+
final = UrlNormalizer.normalize(result.final_url)
|
|
92
|
+
@visited.add(final) if final
|
|
93
|
+
|
|
94
|
+
# Update base_url on first crawl so same_host checks use the canonical host.
|
|
95
|
+
@base_url = final if depth.zero? && final
|
|
96
|
+
|
|
97
|
+
enqueue_links(result.links, depth + 1) if depth < @max_depth
|
|
82
98
|
result
|
|
83
99
|
end
|
|
84
100
|
|
|
85
101
|
def crawl_page(url, depth)
|
|
86
|
-
opts = { wait_until: @wait_until, block_resources: @block_resources
|
|
102
|
+
opts = { wait_until: @wait_until, block_resources: @block_resources }
|
|
87
103
|
opts[:max_attempts] = @max_attempts if @max_attempts
|
|
88
104
|
result = @client.crawl(url, **opts)
|
|
89
105
|
build_page_result(url, depth, result)
|
|
@@ -94,11 +110,13 @@ class RubyCrawl
|
|
|
94
110
|
|
|
95
111
|
def build_page_result(url, depth, result)
|
|
96
112
|
PageResult.new(
|
|
97
|
-
url:
|
|
98
|
-
html:
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
113
|
+
url: url,
|
|
114
|
+
html: result.html,
|
|
115
|
+
raw_text: result.raw_text,
|
|
116
|
+
clean_html: result.clean_html,
|
|
117
|
+
links: extract_urls(result.links),
|
|
118
|
+
metadata: result.metadata,
|
|
119
|
+
depth: depth
|
|
102
120
|
)
|
|
103
121
|
end
|
|
104
122
|
|
|
@@ -1,85 +1,46 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
# rubocop:disable Metrics/BlockLength
|
|
4
3
|
namespace :rubycrawl do
  desc 'Check system dependencies and generate Rails initializer'
  task :install do
    require 'fileutils'

    # Ferrum manages Chrome automatically, but warn if not found in common locations.
    # FIX: multi-arg Kernel#system avoids spawning a shell for the lookup
    # (no string interpolation into a shell command).
    # NOTE(review): `which` assumes a POSIX environment; Windows users rely on
    # Ferrum's own binary detection.
    chrome_found = %w[google-chrome chromium-browser chromium].any? do |cmd|
      system('which', cmd, out: File::NULL, err: File::NULL)
    end

    unless chrome_found
      warn '[rubycrawl] Chrome/Chromium not found in PATH. Ferrum will attempt to locate it automatically.'
      warn '[rubycrawl] macOS: brew install --cask google-chrome'
      warn '[rubycrawl] Ubuntu: sudo apt-get install -y chromium-browser'
      warn '[rubycrawl] See README for Docker examples.'
    end

    if defined?(Rails)
      initializer_path = Rails.root.join('config', 'initializers', 'rubycrawl.rb')
      if File.exist?(initializer_path)
        puts "[rubycrawl] Initializer already exists at #{initializer_path}"
      else
        content = <<~RUBY
          # frozen_string_literal: true

          # RubyCrawl Configuration
          RubyCrawl.configure(
            # wait_until: "load", # "load", "domcontentloaded", "networkidle"
            # block_resources: true, # block images/fonts/CSS/media for speed
            # max_attempts: 3, # retry count with exponential backoff
            # timeout: 30, # browser navigation timeout in seconds
            # headless: true, # set false to see the browser (debugging)
          )
        RUBY

        FileUtils.mkdir_p(File.dirname(initializer_path))
        File.write(initializer_path, content)
        puts "[rubycrawl] Created initializer at #{initializer_path}"
      end
    else
      puts '[rubycrawl] Rails not detected. Skipping initializer creation.'
    end
  end
end
|
|
85
|
-
# rubocop:enable Metrics/BlockLength
|
|
@@ -29,11 +29,15 @@ class RubyCrawl
|
|
|
29
29
|
# True when both URLs point at the same logical host.
# Hosts are compared case-insensitively with a leading "www." ignored,
# so example.com and www.EXAMPLE.com count as the same site.
#
# @return [Boolean] false when either URL cannot be parsed
def same_host?(url, base_url)
  left = URI.parse(url).host
  right = URI.parse(base_url).host
  canonical_host(left) == canonical_host(right)
rescue URI::InvalidURIError
  false
end

# Normalises a hostname for comparison: lowercase, without a "www." prefix.
# Returns nil for a nil host (e.g. mailto: links).
def canonical_host(host)
  return nil if host.nil?

  host.downcase.delete_prefix('www.')
end
|
|
40
|
+
|
|
37
41
|
def parse_uri(url, base_url)
|
|
38
42
|
uri = URI.parse(url)
|
|
39
43
|
return uri if uri.absolute?
|
data/lib/rubycrawl/version.rb
CHANGED