RubyGems - spidy - Versions diffs - 0.4.0 → 1.0.0 - Mend

spidy 0.4.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

checksums.yaml +4 -4
data/.rubocop.yml +53 -8
data/Gemfile +18 -4
data/Gemfile.lock +71 -19
data/README.md +95 -16
data/Rakefile +0 -2
data/example/check_ferrum.rb +114 -0
data/example/check_lightpanda.rb +59 -0
data/example/connect_test.rb +48 -0
data/example/lightpanda_links.rb +80 -0
data/example/master_detail.rb +1 -3
data/example/proxy.rb +0 -2
data/example/retry.rb +0 -2
data/example/run_with_lightpanda.rb +25 -0
data/example/simple_test.rb +53 -0
data/example/test_lightpanda.rb +86 -0
data/example/wikip.rb +2 -4
data/exe/spidy +0 -2
data/lib/spidy/binder/error.rb +0 -2
data/lib/spidy/binder/html.rb +0 -2
data/lib/spidy/binder/json.rb +0 -2
data/lib/spidy/binder/xml.rb +0 -2
data/lib/spidy/binder.rb +0 -2
data/lib/spidy/command_line.rb +4 -6
data/lib/spidy/connector/direct.rb +0 -2
data/lib/spidy/connector/html.rb +0 -2
data/lib/spidy/connector/json.rb +2 -4
data/lib/spidy/connector/lightpanda.rb +161 -0
data/lib/spidy/connector/xml.rb +4 -6
data/lib/spidy/connector.rb +7 -8
data/lib/spidy/console.rb +0 -2
data/lib/spidy/definition.rb +2 -4
data/lib/spidy/definition_file.rb +0 -2
data/lib/spidy/definition_object.rb +0 -2
data/lib/spidy/shell.rb +0 -2
data/lib/spidy/spider.rb +2 -4
data/lib/spidy/version.rb +1 -3
data/lib/spidy.rb +3 -5
data/spidy.gemspec +4 -15
metadata +11 -102
data/.rubocop_todo.yml +0 -13

data/example/lightpanda_links.rb ADDED Viewed

@@ -0,0 +1,80 @@
+#!/usr/bin/env ruby
+# Example definition: emits every usable link found on a page fetched through
+# the :lightpanda connector. Set DEBUG to trace progress on stdout.
+Spidy.define do
+  # Wait time chosen to avoid timeouts
+  wait_time 10
+  # Present a standard desktop browser user agent
+  user_agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
+             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+  spider(as: :lightpanda) do |yielder, connector, start_url|
+    # Calls each attempt in order and returns the value of the first one that
+    # does not raise; returns +fallback+ when every attempt raises.
+    first_success = lambda do |fallback, attempts|
+      outcome = fallback
+      attempts.each do |attempt|
+        outcome = attempt.call
+        break
+      rescue StandardError
+        next
+      end
+      outcome
+    end
+    puts "Extracting links from: #{start_url}" if ENV['DEBUG']
+    connector.call(start_url) do |page|
+      anchors = page.search('a')
+      puts "Found #{anchors.size} links on page" if ENV['DEBUG']
+      anchors.each do |anchor|
+        # Hash access first, then #attr, then #attribute; nil when all fail
+        href = first_success.call(nil, [
+                                    -> { anchor['href'] },
+                                    -> { anchor.attr('href') },
+                                    -> { anchor.attribute('href')&.to_s }
+                                  ])
+        # Skip missing, empty or placeholder links
+        next if href.nil? || href.empty? || href == '#'
+        # Link text is only used for the debug trace below
+        label = first_success.call('(Could not extract text)', [
+                                     -> { anchor.text.to_s.strip },
+                                     -> { anchor.inner_text.to_s.strip },
+                                     -> { anchor.content.to_s.strip }
+                                   ])
+        # Keep the trace readable: cap long text at 47 characters plus "..."
+        label = "#{label[0...47]}..." if label.length > 50
+        label = '(No text)' if label.nil? || label.empty?
+        puts "  - #{label}: #{href}" if ENV['DEBUG']
+        # Hand the link to the consumer
+        yielder.call(href)
+      end
+    rescue StandardError => e
+      puts "Error processing links: #{e.message}" if ENV['DEBUG']
+      raise
+    end
+  rescue StandardError => e
+    puts "Error in spider: #{e.message}" if ENV['DEBUG']
+    raise
+  end
+end

data/example/master_detail.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 Spidy.define do
   url_to_params = lambda { |url|
     uri = URI.parse(url)
@@ -9,7 +7,7 @@ Spidy.define do
   master_page = proc { |url, &yielder|
     params = url_to_params.call(url)
-    page = params&.dig('page')&.to_i || 0
+    page = params&.dig('page').to_i
     limit_page = 3
     per_page = 25

data/example/proxy.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 Spidy.define do
   user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
   socks_proxy '127.0.0.1', 9050

data/example/retry.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 Spidy.define do
   spider(as: :json) do |yielder, connector|
     connector.call('https://httpbin.org/status/500') do |json|

data/example/run_with_lightpanda.rb ADDED Viewed

@@ -0,0 +1,25 @@
+#!/usr/bin/env ruby
+# This is a wrapper script that ensures Lightpanda is running before executing Spidy commands
+require_relative 'check_lightpanda'
+# Check if Lightpanda is running, start it if not
+unless lightpanda_running?
+  puts 'Lightpanda is not running. Starting it now...'
+  unless start_lightpanda
+    puts 'Failed to start Lightpanda. Exiting.'
+    exit 1
+  end
+end
+# Execute the Spidy command with debug mode enabled
+ENV['DEBUG'] = 'true'
+url = ARGV[0] || 'https://example.com'
+definition = ARGV[1] || 'example/lightpanda_links.rb'
+puts "Running Spidy with URL: #{url} and definition: #{definition}"
+cmd = "echo '#{url}' | spidy each #{definition}"
+puts "Executing: #{cmd}"
+exec(cmd)

data/example/simple_test.rb ADDED Viewed

@@ -0,0 +1,53 @@
+#!/usr/bin/env ruby
+# Simple Ferrum usage test: opens a demo shop page in a headless browser and
+# reports the page title and how many "Add to cart" buttons were found.
+begin
+  require 'ferrum'
+  puts 'Successfully loaded Ferrum'
+rescue LoadError => e
+  puts "Ferrum is not installed: #{e.message}"
+  puts "Run 'gem install ferrum' to install it"
+  exit 1
+end
+puts 'Simple Browser Test'
+puts '==================='
+# Declared outside the begin block so the ensure clause can always reach it
+browser = nil
+begin
+  # Initialize Ferrum browser (headless mode)
+  browser = Ferrum::Browser.new(headless: true)
+  # Access URL
+  url = 'https://react-shopping-cart-67954.firebaseapp.com/'
+  puts "Accessing: #{url}"
+  browser.goto(url)
+  # Wait for page to load
+  sleep 2
+  # Get page title
+  title = browser.title
+  puts "Page title: #{title}"
+  # Search for "Add to cart" buttons
+  cart_buttons = browser.css('button.sc-124al1g-0')
+  if cart_buttons.any?
+    puts "\nFound #{cart_buttons.size} 'Add to cart' buttons"
+  else
+    puts "\nNo 'Add to cart' buttons found."
+    # Try to get all buttons for debugging
+    all_buttons = browser.css('button')
+    puts "Found #{all_buttons.size} buttons on the page."
+  end
+rescue StandardError => e
+  puts "Error: #{e.message}"
+  puts e.backtrace.join("\n")
+ensure
+  # Shut the browser down even when a step above raised, so a failed run
+  # does not leave a browser process behind
+  browser&.quit
+end
+puts "\nTest completed"

data/example/test_lightpanda.rb ADDED Viewed

@@ -0,0 +1,86 @@
+#!/usr/bin/env ruby
+# Manual check of the Lightpanda connector: probes the CDP endpoint on
+# 127.0.0.1:9222, scrapes one page and prints its title and first links.
+require_relative '../lib/spidy'
+# Turn on debug output
+ENV['DEBUG'] = 'true'
+# Scraper definition: fetch through :lightpanda, bind the result as HTML
+scraper = Spidy.define do
+  user_agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
+             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+  # Spider: hand the fetched page straight to the yielder
+  spider(as: :lightpanda) do |yielder, connector, url|
+    puts "Processing page with Chrome/Lightpanda: #{url}"
+    connector.call(url) { |page| yielder.call(page) }
+  end
+  # Shape of the extracted object
+  define(as: :html) do
+    let(:title, 'title')
+    # First five anchors as { text:, href: } hashes
+    let(:links) do |doc|
+      doc.css('a').take(5).map do |anchor|
+        { text: anchor.text.to_s.strip, href: anchor['href'].to_s }
+      end
+    end
+  end
+end
+# Page to scrape
+target_url = 'https://example.com'
+# Command suggested to the user when the endpoint is unreachable
+serve_command = '/Users/aileron/bin/lightpanda serve --host 127.0.0.1 --port 9222'
+puts '=== Testing Chrome/Lightpanda Connection ==='
+puts "URL: #{target_url}\n\n"
+begin
+  # Probe the version endpoint to see whether Lightpanda is up
+  require 'net/http'
+  version_uri = URI('http://127.0.0.1:9222/json/version')
+  response = Net::HTTP.new(version_uri.host, version_uri.port).get(version_uri.path)
+  if response.code == '200'
+    puts 'Lightpanda is running at 127.0.0.1:9222'
+    puts "Version info: #{response.body[0..100]}...\n\n"
+  else
+    puts "Warning: Couldn't connect to Lightpanda at 127.0.0.1:9222"
+    puts "Make sure it's running with: #{serve_command}\n\n"
+  end
+rescue StandardError => e
+  puts "Error checking Lightpanda: #{e.message}"
+  puts "Make sure Lightpanda is running with: #{serve_command}\n\n"
+end
+begin
+  # Run the scraper and print what it extracted
+  result = scraper.call(target_url)
+  puts "Page Title: #{result.title}"
+  if result.links&.any?
+    puts "\nFound #{result.links.size} links:"
+    result.links.each.with_index(1) do |link, position|
+      puts "  #{position}. #{link[:text]} - URL: #{link[:href]}"
+    end
+  else
+    puts "\nNo links found."
+  end
+rescue StandardError => e
+  puts "Error: #{e.message}"
+  puts e.backtrace.join("\n")
+end
+puts "\n=== Test completed ==="

data/example/wikip.rb CHANGED Viewed

@@ -1,8 +1,6 @@
-# frozen_string_literal: true
 Spidy.define do
-  def self.infobox_scrape(params, &block)
-    call(params.html.at('.infobox'), name: :infobox, &block)
+  def self.infobox_scrape(params, &)
+    call(params.html.at('.infobox'), name: :infobox, &)
   end
   define(as: :html) do

data/exe/spidy CHANGED Viewed

@@ -1,6 +1,4 @@
 #!/usr/bin/env ruby
-# frozen_string_literal: true
 require 'spidy'
 if ARGV[1].blank?

data/lib/spidy/binder/error.rb CHANGED Viewed

@@ -1,4 +1,2 @@
-# frozen_string_literal: true
 class Spidy::Binder::Error < StandardError
 end

data/lib/spidy/binder/html.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # Bind html and convert to object
 #

data/lib/spidy/binder/json.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # Bind json and convert to object
 #

data/lib/spidy/binder/xml.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # Bind xml and convert to object
 #

data/lib/spidy/binder.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # Bind resource received from the connection to the result object
 #

data/lib/spidy/command_line.rb CHANGED Viewed

@@ -1,14 +1,12 @@
-# frozen_string_literal: true
 #
 # spidy shell interface
 #
 class Spidy::CommandLine
   delegate :spidy, to: :@definition_file
-  class_attribute :output, default: (proc { |result| $stdout.puts(result.to_s) })
-  class_attribute :error_handler, default: (proc { |e, url|
-                                              warn({ url: url, message: e.message, backtrace: e.backtrace }.to_json)
-                                            })
+  class_attribute :output, default: proc { |result| $stdout.puts(result.to_s) }
+  class_attribute :error_handler, default: proc { |e, url|
+    warn({ url: url, message: e.message, backtrace: e.backtrace }.to_json)
+  }
   def eval_call(script)
     @definition_file.spidy.instance_eval(script)

data/lib/spidy/connector/direct.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # Direct resource ( not network resource  )
 #

data/lib/spidy/connector/html.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # Mechanize wrapper
 #

data/lib/spidy/connector/json.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # OpenURI to JSON.parse
 #
@@ -12,9 +10,9 @@ class Spidy::Connector::Json
     @user_agent = user_agent
   end
-  def call(url, &block)
+  def call(url, &)
     fail 'url is not specified' if url.blank?
-    connect(url, &block)
+    connect(url, &)
   end
   def connect(url)

data/lib/spidy/connector/lightpanda.rb ADDED Viewed

@@ -0,0 +1,161 @@
+#
+# Lightpanda connector for JavaScript-rendered pages via CDP
+# Using Ferrum for direct CDP connection
+#
+class Spidy::Connector::Lightpanda
+  include Spidy::Connector::StaticAccessor
+  attr_reader :user_agent, :host, :port
+  DEFAULT_HOST = '127.0.0.1'.freeze
+  DEFAULT_PORT = 9222
+  def initialize(user_agent:, host: nil, port: nil)
+    begin
+      require 'ferrum'
+    rescue LoadError
+      raise 'Ferrum gem is required. Please install with: gem install ferrum'
+    end
+    @user_agent = user_agent
+    @host = host || ENV['LIGHTPANDA_HOST'] || DEFAULT_HOST
+    @port = port || ENV['LIGHTPANDA_PORT'] || DEFAULT_PORT
+  end
+  def call(url)
+    fail 'url is not specified' if url.blank?
+    # Clean the URL by removing any whitespace or newlines
+    clean_url = url.to_s.strip
+    puts "Processing URL: #{clean_url}" if ENV['DEBUG']
+    # Create a page-like object similar to Mechanize
+    page = fetch_with_ferrum(clean_url)
+    # Apply yielder to the page
+    yield(page)
+  end
+  def refresh!
+    # No special refresh actions needed for Lightpanda
+  end
+  private
+  # Try to wait for network to be idle
+  def wait_for_network_idle(browser)
+    puts 'Waiting for network idle...' if ENV['DEBUG']
+    begin
+      # Try with timeout parameter
+      browser.network.wait_for_idle(timeout: 15)
+    rescue ArgumentError
+      begin
+        # Try without timeout parameter
+        browser.network.wait_for_idle
+      rescue StandardError => e
+        # If wait_for_idle fails, fall back to a simple sleep
+        puts "Warning: Could not wait for network idle: #{e.message}" if ENV['DEBUG']
+        sleep 5 # Simple fallback
+      end
+    end
+  end
+  # Navigate to URL and get page content
+  def navigate_and_get_content(browser, url)
+    # Navigate to the URL
+    puts "Navigating to: #{url}" if ENV['DEBUG']
+    browser.goto(url)
+    # Wait for network idle
+    wait_for_network_idle(browser)
+    # Get the content
+    puts 'Getting page content...' if ENV['DEBUG']
+    browser.body
+  end
+  # Create browser options
+  def create_browser_options
+    options = {
+      headless: true,         # Run in headless mode
+      timeout: 20,            # Increase timeout
+      process_timeout: 20,    # Process timeout
+      window_size: [1280, 800] # Set window size
+    }
+    # Add Chrome path if available
+    options[:browser_path] = ENV['CHROME_PATH'] if ENV['CHROME_PATH'] && File.exist?(ENV['CHROME_PATH'])
+    options
+  end
+  # Connect to CDP server with Ferrum and fetch the page
+  def fetch_with_ferrum(url) # rubocop:todo Metrics/PerceivedComplexity, Metrics/CyclomaticComplexity
+    puts 'Using direct Chrome/Chromium instead of Lightpanda' if ENV['DEBUG']
+    browser = nil
+    html_content = nil
+    begin
+      # Create Ferrum browser
+      browser = Ferrum::Browser.new(create_browser_options)
+      # Skip user agent setting - not supported in this Ferrum version
+      if @user_agent.present? && ENV.fetch('DEBUG', nil)
+        puts 'User agent setting will be skipped - not supported in your Ferrum version'
+      end
+      # Navigate and get content
+      html_content = navigate_and_get_content(browser, url)
+    rescue StandardError => e
+      puts "Error during page navigation: #{e.class} - #{e.message}" if ENV['DEBUG']
+      raise e
+    ensure
+      # Clean up - ensure browser is always closed, even if an error occurred
+      browser&.quit if defined?(browser) && browser
+    end
+    # Create a Mechanize-like page object
+    LightpandaPage.new(url, html_content)
+  end
+  # Page-like object that mimics the Mechanize::Page interface
+  class LightpandaPage
+    attr_reader :uri, :body, :title, :code, :response_code
+    def initialize(url, html_content)
+      @uri = url
+      @body = html_content
+      @doc = Nokogiri::HTML(html_content)
+      @title = @doc.title
+      @code = '200'
+      @response_code = '200'
+    end
+    # Common methods from Mechanize::Page that might be used in the application
+    def search(*)
+      @doc.search(*)
+    end
+    def at(*)
+      @doc.at(*)
+    end
+    def css(*)
+      @doc.css(*)
+    end
+    def xpath(*)
+      @doc.xpath(*)
+    end
+    def encoding
+      @doc.encoding
+    end
+    def try(*args)
+      send(*args) if respond_to?(args.first)
+    end
+  end
+end

data/lib/spidy/connector/xml.rb CHANGED Viewed

@@ -1,20 +1,18 @@
-# frozen_string_literal: true
 #
 # xml
 #
 class Spidy::Connector::Xml
   include Spidy::Connector::StaticAccessor
-  def call(url, &block)
+  def call(url, &)
     fail 'URL is undefined' if url.blank?
-    connect(url, &block)
+    connect(url, &)
   end
-  def connect(url, &block)
+  def connect(url)
     OpenURI.open_uri(url, 'User-Agent' => @user_agent) do |body|
-      block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
+      yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
     end
   rescue OpenURI::HTTPError => e
     raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])

data/lib/spidy/connector.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # This class is responsible for actually making a network connection and downloading hypertext
 #
@@ -9,6 +7,7 @@ module Spidy::Connector
   autoload :Html
   autoload :Json
   autoload :Xml
+  autoload :Lightpanda
   DEFAULT_WAIT_TIME = 5
@@ -35,9 +34,9 @@ module Spidy::Connector
   module StaticAccessor
     extend ActiveSupport::Concern
     class_methods do
-      def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &block)
+      def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &)
         ::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(
-          url, &block
+          url, &
         )
       end
     end
@@ -75,9 +74,9 @@ module Spidy::Connector
       connect(url, &block)
     end
-    def connect(url, retry_attempt_count: @retry_attempt_count, &block)
+    def connect(url, retry_attempt_count: @retry_attempt_count, &)
       logger.call('connnector.get': url, 'connnector.accessed': Time.current)
-      origin_connector.call(url, &block)
+      origin_connector.call(url, &)
     rescue Spidy::Connector::Retry => e
       logger.call('retry.accessed': Time.current,
                   'retry.uri': url,
@@ -105,9 +104,9 @@ module Spidy::Connector
       @socks_proxy = socks_proxy
     end
-    def call(url, &block)
+    def call(url, &)
       Socksify.proxy(socks_proxy[:host], socks_proxy[:port]) do
-        connector.call(url, &block)
+        connector.call(url, &)
       end
     end

data/lib/spidy/console.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # spidy console
 #

data/lib/spidy/definition.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # Class representing a website defined by DSL
 #
@@ -42,12 +40,12 @@ module Spidy::Definition
     end
   end
-  def spider(name = :default, connector: nil, as: nil, &define_block)
+  def spider(name = :default, connector: nil, as: nil)
     @namespace ||= {}
     connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
                                                       socks_proxy: @socks_proxy)
     @namespace[:"#{name}_spider"] = proc do |source, &yielder|
-      define_block.call(yielder, connector, source)
+      yield(yielder, connector, source)
     end
   end

data/lib/spidy/definition_file.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # spidy interface binding
 #

data/lib/spidy/definition_object.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # An object that represents the scraper defined by define block.
 #

data/lib/spidy/shell.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # spidy Shell
 #

data/lib/spidy/spider.rb CHANGED Viewed

@@ -1,11 +1,9 @@
-# frozen_string_literal: true
 #
 # Spider
 #
 class Spidy::Spider
-  def initialize(&block)
-    define_singleton_method(:bind, &block)
+  def initialize(&)
+    define_singleton_method(:bind, &)
   end
   def call(resource)