RubyGems - spidy - Versions diffs - 0.3.12 → 1.0.0 - Mend

spidy 0.3.12 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

checksums.yaml +4 -4
data/.rubocop.yml +53 -8
data/.ruby-version +1 -1
data/CLAUDE.md +28 -0
data/Gemfile +20 -2
data/Gemfile.lock +178 -69
data/README.md +96 -17
data/Rakefile +0 -2
data/bin/console +2 -3
data/example/check_ferrum.rb +114 -0
data/example/check_lightpanda.rb +59 -0
data/example/connect_test.rb +48 -0
data/example/lightpanda_links.rb +80 -0
data/example/master_detail.rb +1 -3
data/example/proxy.rb +0 -2
data/example/retry.rb +0 -2
data/example/run_with_lightpanda.rb +25 -0
data/example/simple_test.rb +53 -0
data/example/test_lightpanda.rb +86 -0
data/example/wikip.rb +2 -4
data/exe/spidy +0 -3
data/lib/spidy/binder/error.rb +0 -2
data/lib/spidy/binder/html.rb +0 -2
data/lib/spidy/binder/json.rb +0 -2
data/lib/spidy/binder/xml.rb +0 -2
data/lib/spidy/binder.rb +0 -2
data/lib/spidy/command_line.rb +4 -6
data/lib/spidy/connector/direct.rb +0 -2
data/lib/spidy/connector/html.rb +0 -2
data/lib/spidy/connector/json.rb +2 -4
data/lib/spidy/connector/lightpanda.rb +161 -0
data/lib/spidy/connector/xml.rb +4 -6
data/lib/spidy/connector.rb +7 -8
data/lib/spidy/console.rb +0 -2
data/lib/spidy/definition.rb +2 -4
data/lib/spidy/definition_file.rb +0 -2
data/lib/spidy/definition_object.rb +0 -2
data/lib/spidy/shell.rb +6 -3
data/lib/spidy/spider.rb +2 -4
data/lib/spidy/version.rb +1 -3
data/lib/spidy.rb +3 -5
data/spidy.gemspec +4 -17
metadata +16 -138
data/.rubocop_todo.yml +0 -13

data/example/check_ferrum.rb ADDED Viewed

@@ -0,0 +1,114 @@
+#!/usr/bin/env ruby
+# This script checks what Ferrum API methods are available
+begin
+  require 'ferrum'
+  puts 'Ferrum gem is loaded!'
+  # Check Ferrum version
+  puts "Ferrum version: #{begin
+    Ferrum::VERSION
+  rescue StandardError
+    'unknown'
+  end}"
+  # Try to create a browser instance
+  puts "\nTrying to create a browser instance..."
+  # Find Chrome executable path
+  def find_chrome_path
+    # Common locations on macOS
+    macos_paths = [
+      '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
+      '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
+      '/Applications/Chromium.app/Contents/MacOS/Chromium',
+      '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'
+    ]
+    # Check macOS paths
+    macos_paths.each do |path|
+      return path if File.exist?(path)
+    end
+    # Try to locate Chrome using 'which' command
+    %w[google-chrome chromium chromium-browser].each do |browser|
+      path = `which #{browser} 2>/dev/null`.strip
+      return path if path != '' && File.exist?(path)
+    end
+    nil
+  end
+  # Get Chrome path
+  chrome_path = ENV['CHROME_PATH'] || find_chrome_path
+  if chrome_path
+    puts "Using Chrome executable: #{chrome_path}"
+  else
+    puts 'No Chrome executable found. Using default.'
+  end
+  # Create browser with options
+  options = {
+    headless: true,
+    window_size: [1280, 800]
+  }
+  # Add Chrome path if available
+  options[:browser_path] = chrome_path if chrome_path
+  browser = Ferrum::Browser.new(options)
+  puts 'Browser instance created successfully!'
+  # Check available methods on browser
+  puts "\nAvailable methods on browser object:"
+  browser_methods = (browser.methods - Object.methods).sort
+  puts browser_methods.join(', ')
+  # Check if headers method exists
+  puts "\nDoes browser respond to 'headers='? #{browser.respond_to?(:headers=)}"
+  # Check available methods on browser.network
+  if browser.respond_to?(:network)
+    puts "\nAvailable methods on browser.network object:"
+    network_methods = (browser.network.methods - Object.methods).sort
+    puts network_methods.join(', ')
+    # Check if wait_for_idle method exists and what parameters it accepts
+    if browser.network.respond_to?(:wait_for_idle)
+      puts "\nExamine wait_for_idle method:"
+      begin
+        # Try with timeout parameter
+        browser.network.wait_for_idle(timeout: 1)
+        puts 'wait_for_idle accepts timeout parameter'
+      rescue ArgumentError => e
+        puts "wait_for_idle does not accept timeout parameter: #{e.message}"
+      rescue StandardError => e
+        puts "Error calling wait_for_idle with timeout: #{e.message}"
+      end
+    else
+      puts "\nwait_for_idle method not available on network object"
+    end
+  else
+    puts "\nnetwork method not available on browser object"
+  end
+  # Test goto method
+  puts "\nTesting navigation with goto method:"
+  begin
+    browser.goto('https://example.com')
+    puts 'Navigation successful!'
+    puts "Page title: #{browser.title}"
+  rescue StandardError => e
+    puts "Error during navigation: #{e.message}"
+  end
+  # Clean up
+  browser.quit
+  puts "\nBrowser closed successfully"
+rescue LoadError => e
+  puts "Error: Ferrum gem is not installed: #{e.message}"
+  puts 'Install it with: gem install ferrum'
+rescue StandardError => e
+  puts "Error: #{e.message}"
+  puts e.backtrace.join("\n")
+end

data/example/check_lightpanda.rb ADDED Viewed

@@ -0,0 +1,59 @@
+#!/usr/bin/env ruby
+# Script to check if Lightpanda is running and start it if needed
+require 'net/http'
+def lightpanda_running?(host = '127.0.0.1', port = 9222)
+  uri = URI("http://#{host}:#{port}/json/version")
+  response = Net::HTTP.get_response(uri)
+  response.is_a?(Net::HTTPSuccess)
+rescue StandardError
+  false
+end
+def start_lightpanda(host = '127.0.0.1', port = 9222)
+  puts 'Starting Lightpanda...'
+  # Build the command to start Lightpanda in the background
+  cmd = "/Users/aileron/bin/lightpanda serve --host #{host} --port #{port} > /tmp/lightpanda.log 2>&1 &"
+  # Execute the command
+  result = system(cmd)
+  if result
+    puts "Lightpanda started! Service should be available at http://#{host}:#{port}"
+    # Wait for it to be ready
+    10.times do
+      if lightpanda_running?(host, port)
+        puts 'Lightpanda is now running and accepting connections!'
+        return true
+      end
+      puts 'Waiting for Lightpanda to start...'
+      sleep 1
+    end
+    puts "Lightpanda might have started but isn't responding yet."
+    puts 'Check /tmp/lightpanda.log for details.'
+  else
+    puts 'Failed to start Lightpanda. Make sure the path is correct: /Users/aileron/bin/lightpanda'
+  end
+  false
+end
+# Main script
+host = '127.0.0.1'
+port = 9222
+if lightpanda_running?(host, port)
+  puts "✅ Lightpanda is already running at http://#{host}:#{port}"
+else
+  puts "❌ Lightpanda is not running at http://#{host}:#{port}"
+  if ARGV.include?('--start') || ARGV.include?('-s')
+    start_lightpanda(host, port)
+  else
+    puts 'Run this script with --start or -s option to start Lightpanda automatically:'
+    puts "  #{$PROGRAM_NAME} --start"
+  end
+end

data/example/connect_test.rb ADDED Viewed

@@ -0,0 +1,48 @@
+#!/usr/bin/env ruby
+# Test connecting to existing Chrome instance at 127.0.0.1:9222
+#
+# Attaches Ferrum to an already-running remote-debugging endpoint, loads
+# https://example.com and prints the page title. Exits with status 1 when the
+# ferrum gem is not installed.
+begin
+  require 'ferrum'
+  puts 'Successfully loaded Ferrum'
+rescue LoadError => e
+  puts "Ferrum is not installed: #{e.message}"
+  puts "Run 'gem install ferrum' to install it"
+  exit 1
+end
+puts 'Testing connection to Chrome at 127.0.0.1:9222'
+puts '=============================================='
+begin
+  # Connect to the remote Chrome instance
+  # Note: We're setting process: false to prevent launching a new browser
+  browser = Ferrum::Browser.new(
+    url: 'http://127.0.0.1:9222',
+    process: false
+  )
+  begin
+    # Access a test URL
+    url = 'https://example.com'
+    puts "Accessing: #{url}"
+    browser.goto(url)
+    # Get page title
+    title = browser.title
+    puts "Page title: #{title}"
+  ensure
+    # Clean up browser connection (but don't close Chrome).
+    # Done in ensure so the connection is released even when navigation or
+    # the title lookup raised; a failure of quit itself is still reported by
+    # the rescue below.
+    browser.quit
+  end
+  puts "\nSuccess! Connected to Chrome at 127.0.0.1:9222"
+rescue StandardError => e
+  puts "Error: #{e.message}"
+  puts e.backtrace.join("\n")
+  puts "\nTroubleshooting tips:"
+  puts '1. Make sure Chrome is running with remote debugging enabled'
+  puts '2. Verify the command: /Users/aileron/bin/lightpanda serve --host 127.0.0.1 --port 9222'
+  puts '3. Check if you can access http://127.0.0.1:9222/json/version in your browser'
+end
+puts "\nTest completed"

data/example/lightpanda_links.rb ADDED Viewed

@@ -0,0 +1,80 @@
+#!/usr/bin/env ruby
+# Enhanced Lightpanda link extractor
+#
+# Spidy definition that loads the start URL through the :lightpanda connector
+# and yields the href of every usable <a> element found on the page. Set DEBUG
+# in the environment to get progress output on stdout.
+Spidy.define do
+  # Set wait time to avoid timeouts
+  wait_time 10
+  # Set a standard user agent
+  user_agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
+             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+  # Calls each reader in order and returns the value of the first one that does
+  # not raise; returns +fallback+ when every reader raises. Several accessor
+  # spellings are tried because the element class handed over by the connector
+  # is not fixed by this file.
+  first_success = lambda do |fallback, *readers|
+    readers.each do |reader|
+      return reader.call
+    rescue StandardError
+      next
+    end
+    fallback
+  end
+  spider(as: :lightpanda) do |yielder, connector, start_url|
+    puts "Extracting links from: #{start_url}" if ENV['DEBUG']
+    connector.call(start_url) do |page|
+      links = page.search('a')
+      puts "Found #{links.size} links on page" if ENV['DEBUG']
+      links.each do |a|
+        # Try different ways to get the href attribute; nil means none worked
+        href = first_success.call(
+          nil,
+          -> { a['href'] },
+          -> { a.attr('href') },
+          -> { a.attribute('href')&.to_s }
+        )
+        # Skip empty or invalid links
+        next if href.nil? || href.empty? || href == '#'
+        # The link text is only ever printed, so extract it in debug mode only
+        if ENV['DEBUG']
+          link_text = first_success.call(
+            '(Could not extract text)',
+            -> { a.text.to_s.strip },
+            -> { a.inner_text.to_s.strip },
+            -> { a.content.to_s.strip }
+          )
+          # Check for missing text before measuring it, then truncate long text
+          if link_text.nil? || link_text.empty?
+            link_text = '(No text)'
+          elsif link_text.length > 50
+            link_text = "#{link_text[0...47]}..."
+          end
+          puts "  - #{link_text}: #{href}"
+        end
+        # Yield the link
+        yielder.call(href)
+      end
+    rescue StandardError => e
+      puts "Error processing links: #{e.message}" if ENV['DEBUG']
+      # Bare raise re-raises the current exception unchanged
+      raise
+    end
+  rescue StandardError => e
+    puts "Error in spider: #{e.message}" if ENV['DEBUG']
+    raise
+  end
+end

data/example/master_detail.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 Spidy.define do
   url_to_params = lambda { |url|
     uri = URI.parse(url)
@@ -9,7 +7,7 @@ Spidy.define do
   master_page = proc { |url, &yielder|
     params = url_to_params.call(url)
-    page = params&.dig('page')&.to_i || 0
+    page = params&.dig('page').to_i
     limit_page = 3
     per_page = 25

data/example/proxy.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 Spidy.define do
   user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
   socks_proxy '127.0.0.1', 9050

data/example/retry.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 Spidy.define do
   spider(as: :json) do |yielder, connector|
     connector.call('https://httpbin.org/status/500') do |json|

data/example/run_with_lightpanda.rb ADDED Viewed

@@ -0,0 +1,25 @@
+#!/usr/bin/env ruby
+# This is a wrapper script that ensures Lightpanda is running before executing Spidy commands
+require_relative 'check_lightpanda'
+# Check if Lightpanda is running, start it if not
+unless lightpanda_running?
+  puts 'Lightpanda is not running. Starting it now...'
+  unless start_lightpanda
+    puts 'Failed to start Lightpanda. Exiting.'
+    exit 1
+  end
+end
+# Execute the Spidy command with debug mode enabled
+ENV['DEBUG'] = 'true'
+url = ARGV[0] || 'https://example.com'
+definition = ARGV[1] || 'example/lightpanda_links.rb'
+puts "Running Spidy with URL: #{url} and definition: #{definition}"
+cmd = "echo '#{url}' | spidy each #{definition}"
+puts "Executing: #{cmd}"
+exec(cmd)

data/example/simple_test.rb ADDED Viewed

@@ -0,0 +1,53 @@
+#!/usr/bin/env ruby
+# Simple Ferrum usage test
+#
+# Opens the react-shopping-cart demo page in a headless browser via Ferrum,
+# prints the page title and counts the "Add to cart" buttons. Exits with
+# status 1 when the ferrum gem is not installed.
+begin
+  require 'ferrum'
+  puts 'Successfully loaded Ferrum'
+rescue LoadError => e
+  puts "Ferrum is not installed: #{e.message}"
+  puts "Run 'gem install ferrum' to install it"
+  exit 1
+end
+puts 'Simple Browser Test'
+puts '==================='
+begin
+  # Initialize Ferrum browser (headless mode)
+  browser = Ferrum::Browser.new(headless: true)
+  begin
+    # Access URL
+    url = 'https://react-shopping-cart-67954.firebaseapp.com/'
+    puts "Accessing: #{url}"
+    browser.goto(url)
+    # Wait for page to load
+    sleep 2
+    # Get page title
+    title = browser.title
+    puts "Page title: #{title}"
+    # Search for "Add to cart" buttons
+    cart_buttons = browser.css('button.sc-124al1g-0')
+    if cart_buttons.any?
+      puts "\nFound #{cart_buttons.size} 'Add to cart' buttons"
+    else
+      puts "\nNo 'Add to cart' buttons found."
+      # Try to get all buttons for debugging
+      all_buttons = browser.css('button')
+      puts "Found #{all_buttons.size} buttons on the page."
+    end
+  ensure
+    # Clean up browser. Done in ensure so the browser is shut down even when
+    # navigation or one of the queries above raised; a failure of quit itself
+    # is still reported by the rescue below.
+    browser.quit
+  end
+rescue StandardError => e
+  puts "Error: #{e.message}"
+  puts e.backtrace.join("\n")
+end
+puts "\nTest completed"

data/example/test_lightpanda.rb ADDED Viewed

@@ -0,0 +1,86 @@
+#!/usr/bin/env ruby
+# This script tests the fixed Lightpanda connector
+#
+# Flow: build a Spidy definition that uses the :lightpanda connector, probe
+# http://127.0.0.1:9222/json/version to report whether a server is listening,
+# then run the definition against https://example.com and print the title and
+# up to five links. Failures in either step are printed, not raised.
+require_relative '../lib/spidy'
+# Enable debug output
+ENV['DEBUG'] = 'true'
+# Define a scraper using the lightpanda connector
+scraper = Spidy.define do
+  # Define user agent
+  user_agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
+             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+  # Define a spider that uses the lightpanda connector
+  spider(as: :lightpanda) do |yielder, connector, url|
+    puts "Processing page with Chrome/Lightpanda: #{url}"
+    connector.call(url) do |page|
+      # Hand the fetched page on unchanged
+      yielder.call(page)
+    end
+  end
+  # Define the object structure we want to extract
+  define(as: :html) do
+    let(:title, 'title')
+    # Example: extract links
+    let(:links) do |doc|
+      # Only the first five anchors are kept; each becomes a {text:, href:} hash
+      doc.css('a').take(5).map do |link|
+        {
+          text: link.text.to_s.strip,
+          href: link['href'].to_s
+        }
+      end
+    end
+  end
+end
+# URL to scrape
+url = 'https://example.com'
+puts '=== Testing Chrome/Lightpanda Connection ==='
+puts "URL: #{url}\n\n"
+begin
+  # First, check if Lightpanda is running
+  require 'net/http'
+  version_url = URI('http://127.0.0.1:9222/json/version')
+  http = Net::HTTP.new(version_url.host, version_url.port)
+  response = http.get(version_url.path)
+  if response.code == '200'
+    puts 'Lightpanda is running at 127.0.0.1:9222'
+    # Print at most the first 101 characters of the version payload
+    puts "Version info: #{response.body[0..100]}...\n\n"
+  else
+    puts "Warning: Couldn't connect to Lightpanda at 127.0.0.1:9222"
+    puts "Make sure it's running with: /Users/aileron/bin/lightpanda serve --host 127.0.0.1 --port 9222\n\n"
+  end
+rescue StandardError => e
+  # The probe is advisory only: report the problem and still attempt the scrape
+  puts "Error checking Lightpanda: #{e.message}"
+  puts "Make sure Lightpanda is running with: /Users/aileron/bin/lightpanda serve --host 127.0.0.1 --port 9222\n\n"
+end
+begin
+  # Execute the scraper
+  # NOTE(review): assumes scraper.call(url) returns one object exposing
+  # #title and #links as declared above - confirm against Spidy's API
+  result = scraper.call(url)
+  # Display the results
+  puts "Page Title: #{result.title}"
+  if result.links&.any?
+    puts "\nFound #{result.links.size} links:"
+    result.links.each_with_index do |link, index|
+      puts "  #{index + 1}. #{link[:text]} - URL: #{link[:href]}"
+    end
+  else
+    puts "\nNo links found."
+  end
+rescue StandardError => e
+  puts "Error: #{e.message}"
+  puts e.backtrace.join("\n")
+end
+puts "\n=== Test completed ==="

data/example/wikip.rb CHANGED Viewed

@@ -1,8 +1,6 @@
-# frozen_string_literal: true
 Spidy.define do
-  def self.infobox_scrape(params, &block)
-    call(params.html.at('.infobox'), name: :infobox, &block)
+  def self.infobox_scrape(params, &)
+    call(params.html.at('.infobox'), name: :infobox, &)
   end
   define(as: :html) do

data/exe/spidy CHANGED Viewed

@@ -1,8 +1,5 @@
 #!/usr/bin/env ruby
-# frozen_string_literal: true
 require 'spidy'
-require 'pry'
 if ARGV[1].blank?
   case ARGV[0]

data/lib/spidy/binder/error.rb CHANGED Viewed

@@ -1,4 +1,2 @@
-# frozen_string_literal: true
 class Spidy::Binder::Error < StandardError
 end

data/lib/spidy/binder/html.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # Bind html and convert to object
 #

data/lib/spidy/binder/json.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # Bind json and convert to object
 #

data/lib/spidy/binder/xml.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # Bind xml and convert to object
 #

data/lib/spidy/binder.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # Bind resource received from the connection to the result object
 #

data/lib/spidy/command_line.rb CHANGED Viewed

@@ -1,14 +1,12 @@
-# frozen_string_literal: true
 #
 # spidy shell interface
 #
 class Spidy::CommandLine
   delegate :spidy, to: :@definition_file
-  class_attribute :output, default: (proc { |result| $stdout.puts(result.to_s) })
-  class_attribute :error_handler, default: (proc { |e, url|
-                                              warn({ url: url, message: e.message, backtrace: e.backtrace }.to_json)
-                                            })
+  class_attribute :output, default: proc { |result| $stdout.puts(result.to_s) }
+  class_attribute :error_handler, default: proc { |e, url|
+    warn({ url: url, message: e.message, backtrace: e.backtrace }.to_json)
+  }
   def eval_call(script)
     @definition_file.spidy.instance_eval(script)

data/lib/spidy/connector/direct.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # Direct resource ( not network resource  )
 #

data/lib/spidy/connector/html.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # Mechanize wrapper
 #

data/lib/spidy/connector/json.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 #
 # OpenURI to JSON.parse
 #
@@ -12,9 +10,9 @@ class Spidy::Connector::Json
     @user_agent = user_agent
   end
-  def call(url, &block)
+  def call(url, &)
     fail 'url is not specified' if url.blank?
-    connect(url, &block)
+    connect(url, &)
   end
   def connect(url)