spidy 0.4.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,80 @@
#!/usr/bin/env ruby
# Enhanced Lightpanda link extractor
#
# Spidy definition that loads the start URL through the :lightpanda connector
# and yields the href of every usable <a> element on the page. When the DEBUG
# environment variable is set, progress and each link are traced on stdout.

Spidy.define do
  # Set wait time to avoid timeouts
  wait_time 10

  # Set a standard user agent
  user_agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

  spider(as: :lightpanda) do |yielder, connector, start_url|
    debug = ENV['DEBUG']
    puts "Extracting links from: #{start_url}" if debug

    # Different ways to read the href attribute, tried in this order.
    href_readers = [
      ->(node) { node['href'] },
      ->(node) { node.attr('href') },
      ->(node) { node.attribute('href')&.to_s }
    ]

    # Different ways to read the visible text of a link, tried in this order.
    text_readers = [
      ->(node) { node.text.to_s.strip },
      ->(node) { node.inner_text.to_s.strip },
      ->(node) { node.content.to_s.strip }
    ]

    # Calls each reader with +node+ and returns the first result obtained
    # without an exception (even if that result is nil); returns +fallback+
    # only when every reader raises.
    first_readable = lambda do |readers, node, fallback|
      readers.each do |reader|
        return reader.call(node)
      rescue StandardError
        next
      end
      fallback
    end

    connector.call(start_url) do |page|
      links = page.search('a')
      puts "Found #{links.size} links on page" if debug

      links.each do |a|
        href = first_readable.call(href_readers, a, nil)

        # Skip empty or invalid links (also covers "no reader worked")
        next if href.nil? || href.empty? || href == '#'

        # The link text is only used for the debug trace, so only build it then.
        if debug
          link_text = first_readable.call(text_readers, a, '(Could not extract text)')
          # Truncate long text
          link_text = "#{link_text[0...47]}..." if link_text.length > 50
          link_text = '(No text)' if link_text.empty?
          puts " - #{link_text}: #{href}"
        end

        # Yield the link
        yielder.call(href)
      end
    rescue StandardError => e
      puts "Error processing links: #{e.message}" if debug
      raise
    end
  rescue StandardError => e
    puts "Error in spider: #{e.message}" if debug
    raise
  end
end
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  Spidy.define do
4
2
  url_to_params = lambda { |url|
5
3
  uri = URI.parse(url)
@@ -9,7 +7,7 @@ Spidy.define do
9
7
 
10
8
  master_page = proc { |url, &yielder|
11
9
  params = url_to_params.call(url)
12
- page = params&.dig('page')&.to_i || 0
10
+ page = params&.dig('page').to_i
13
11
 
14
12
  limit_page = 3
15
13
  per_page = 25
data/example/proxy.rb CHANGED
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  Spidy.define do
4
2
  user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
5
3
  socks_proxy '127.0.0.1', 9050
data/example/retry.rb CHANGED
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  Spidy.define do
4
2
  spider(as: :json) do |yielder, connector|
5
3
  connector.call('https://httpbin.org/status/500') do |json|
@@ -0,0 +1,25 @@
#!/usr/bin/env ruby

# This is a wrapper script that ensures Lightpanda is running before executing Spidy commands.
#
# Arguments (both optional):
#   ARGV[0] - URL fed to spidy on stdin (default: https://example.com)
#   ARGV[1] - path of the Spidy definition file (default: example/lightpanda_links.rb)
#
# lightpanda_running? and start_lightpanda are not defined here; presumably
# they come from check_lightpanda (verify against that file).

require 'shellwords'
require_relative 'check_lightpanda'

# Check if Lightpanda is running, start it if not
unless lightpanda_running?
  puts 'Lightpanda is not running. Starting it now...'
  unless start_lightpanda
    puts 'Failed to start Lightpanda. Exiting.'
    exit 1
  end
end

# Execute the Spidy command with debug mode enabled
ENV['DEBUG'] = 'true'
url = ARGV[0] || 'https://example.com'
definition = ARGV[1] || 'example/lightpanda_links.rb'

puts "Running Spidy with URL: #{url} and definition: #{definition}"

# Both values come from the command line and end up in a shell pipeline, so
# they are escaped: a quote or `;` in an argument must not be interpreted by
# the shell.
cmd = "echo #{Shellwords.escape(url)} | spidy each #{Shellwords.escape(definition)}"

puts "Executing: #{cmd}"
# exec replaces this process with the shell running the pipeline.
exec(cmd)
@@ -0,0 +1,53 @@
#!/usr/bin/env ruby

# Simple Ferrum usage test
#
# Opens a sample page in a headless browser, prints its title and counts the
# "Add to cart" buttons. Exits with status 1 when the ferrum gem is missing;
# any other error is printed with its backtrace and the script still finishes.

begin
  require 'ferrum'
  puts 'Successfully loaded Ferrum'
rescue LoadError => e
  puts "Ferrum is not installed: #{e.message}"
  puts "Run 'gem install ferrum' to install it"
  exit 1
end

puts 'Simple Browser Test'
puts '==================='

begin
  # Initialize Ferrum browser (headless mode)
  browser = Ferrum::Browser.new(headless: true)

  begin
    # Access URL
    url = 'https://react-shopping-cart-67954.firebaseapp.com/'
    puts "Accessing: #{url}"
    browser.goto(url)

    # Wait for page to load
    sleep 2

    # Get page title
    title = browser.title
    puts "Page title: #{title}"

    # Search for "Add to cart" buttons
    cart_buttons = browser.css('button.sc-124al1g-0')

    if cart_buttons.any?
      puts "\nFound #{cart_buttons.size} 'Add to cart' buttons"
    else
      puts "\nNo 'Add to cart' buttons found."

      # Try to get all buttons for debugging
      all_buttons = browser.css('button')
      puts "Found #{all_buttons.size} buttons on the page."
    end
  ensure
    # Clean up browser even when navigation or a query above raised, so a
    # failed run does not leave a browser process behind. Errors from quit
    # itself are still reported by the rescue below.
    browser.quit
  end
rescue StandardError => e
  puts "Error: #{e.message}"
  puts e.backtrace.join("\n")
end

puts "\nTest completed"
@@ -0,0 +1,86 @@
#!/usr/bin/env ruby

# This script tests the fixed Lightpanda connector. It
#   1. requests /json/version from the local CDP endpoint and reports whether
#      it answered with HTTP 200, then
#   2. runs a Spidy scraper built on the :lightpanda connector against a
#      sample URL and prints the extracted title and links.
# Failures in either step are printed; the script always runs to the end.

require 'net/http'
require_relative '../lib/spidy'

# Enable debug output
ENV['DEBUG'] = 'true'

# Define a scraper using the lightpanda connector
scraper = Spidy.define do
  # Define user agent
  user_agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

  # Define a spider that uses the lightpanda connector
  spider(as: :lightpanda) do |yielder, connector, url|
    puts "Processing page with Chrome/Lightpanda: #{url}"
    connector.call(url) do |page|
      yielder.call(page)
    end
  end

  # Define the object structure we want to extract
  define(as: :html) do
    let(:title, 'title')

    # Example: extract links (first five anchors only)
    let(:links) do |doc|
      doc.css('a').take(5).map do |link|
        {
          text: link.text.to_s.strip,
          href: link['href'].to_s
        }
      end
    end
  end
end

# URL to scrape
url = 'https://example.com'

# Endpoint probed before scraping, and the command suggested when it is not
# reachable. The hint names the bare `lightpanda` executable rather than a
# path that only exists on one developer's machine.
cdp_host = '127.0.0.1'
cdp_port = 9222
start_hint = "lightpanda serve --host #{cdp_host} --port #{cdp_port}"

puts '=== Testing Chrome/Lightpanda Connection ==='
puts "URL: #{url}\n\n"

begin
  # First, check if Lightpanda is running. Short timeouts keep the check from
  # stalling the script when nothing answers on the port.
  response = Net::HTTP.start(cdp_host, cdp_port, open_timeout: 5, read_timeout: 5) do |http|
    http.get('/json/version')
  end

  if response.code == '200'
    puts "Lightpanda is running at #{cdp_host}:#{cdp_port}"
    puts "Version info: #{response.body[0..100]}...\n\n"
  else
    puts "Warning: Couldn't connect to Lightpanda at #{cdp_host}:#{cdp_port}"
    puts "Make sure it's running with: #{start_hint}\n\n"
  end
rescue StandardError => e
  puts "Error checking Lightpanda: #{e.message}"
  puts "Make sure Lightpanda is running with: #{start_hint}\n\n"
end

begin
  # Execute the scraper
  result = scraper.call(url)

  # Display the results
  puts "Page Title: #{result.title}"

  if result.links&.any?
    puts "\nFound #{result.links.size} links:"

    result.links.each_with_index do |link, index|
      puts " #{index + 1}. #{link[:text]} - URL: #{link[:href]}"
    end
  else
    puts "\nNo links found."
  end
rescue StandardError => e
  puts "Error: #{e.message}"
  puts e.backtrace.join("\n")
end

puts "\n=== Test completed ==="
data/example/wikip.rb CHANGED
@@ -1,8 +1,6 @@
1
- # frozen_string_literal: true
2
-
3
1
  Spidy.define do
4
- def self.infobox_scrape(params, &block)
5
- call(params.html.at('.infobox'), name: :infobox, &block)
2
+ def self.infobox_scrape(params, &)
3
+ call(params.html.at('.infobox'), name: :infobox, &)
6
4
  end
7
5
 
8
6
  define(as: :html) do
data/exe/spidy CHANGED
@@ -1,6 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
2
  require 'spidy'
5
3
 
6
4
  if ARGV[1].blank?
@@ -1,4 +1,2 @@
1
- # frozen_string_literal: true
2
-
3
1
  class Spidy::Binder::Error < StandardError
4
2
  end
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Bind html and convert to object
5
3
  #
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Bind json and convert to object
5
3
  #
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Bind xml and convert to object
5
3
  #
data/lib/spidy/binder.rb CHANGED
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Bind resource received from the connection to the result object
5
3
  #
@@ -1,14 +1,12 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # spidy shell interface
5
3
  #
6
4
  class Spidy::CommandLine
7
5
  delegate :spidy, to: :@definition_file
8
- class_attribute :output, default: (proc { |result| $stdout.puts(result.to_s) })
9
- class_attribute :error_handler, default: (proc { |e, url|
10
- warn({ url: url, message: e.message, backtrace: e.backtrace }.to_json)
11
- })
6
+ class_attribute :output, default: proc { |result| $stdout.puts(result.to_s) }
7
+ class_attribute :error_handler, default: proc { |e, url|
8
+ warn({ url: url, message: e.message, backtrace: e.backtrace }.to_json)
9
+ }
12
10
 
13
11
  def eval_call(script)
14
12
  @definition_file.spidy.instance_eval(script)
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Direct resource ( not network resource )
5
3
  #
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Mechanize wrapper
5
3
  #
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # OpenURI to JSON.parse
5
3
  #
@@ -12,9 +10,9 @@ class Spidy::Connector::Json
12
10
  @user_agent = user_agent
13
11
  end
14
12
 
15
- def call(url, &block)
13
+ def call(url, &)
16
14
  fail 'url is not specified' if url.blank?
17
- connect(url, &block)
15
+ connect(url, &)
18
16
  end
19
17
 
20
18
  def connect(url)
@@ -0,0 +1,161 @@
#
# Lightpanda connector for JavaScript-rendered pages via CDP
# Using Ferrum for direct CDP connection
#
# NOTE(review): +host+ and +port+ are stored on the instance, but nothing in
# this class hands them to Ferrum: fetch_with_ferrum builds the browser from
# create_browser_options alone. Confirm whether attaching to an already
# running Lightpanda server is still intended.
#
class Spidy::Connector::Lightpanda
  include Spidy::Connector::StaticAccessor

  attr_reader :user_agent, :host, :port

  DEFAULT_HOST = '127.0.0.1'.freeze
  DEFAULT_PORT = 9222

  # @param user_agent [String] user agent kept on the instance (it is not
  #   applied to the browser; see fetch_with_ferrum)
  # @param host [String, nil] falls back to ENV['LIGHTPANDA_HOST'], then DEFAULT_HOST
  # @param port [Integer, String, nil] falls back to ENV['LIGHTPANDA_PORT'], then DEFAULT_PORT
  # @raise [RuntimeError] when the ferrum gem cannot be loaded
  def initialize(user_agent:, host: nil, port: nil)
    # Loaded lazily so the gem is only needed when this connector is used.
    begin
      require 'ferrum'
    rescue LoadError
      raise 'Ferrum gem is required. Please install with: gem install ferrum'
    end

    @user_agent = user_agent
    @host = host || ENV['LIGHTPANDA_HOST'] || DEFAULT_HOST
    # Values from ENV are Strings while DEFAULT_PORT is an Integer; normalise
    # so #port always returns an Integer. A value that does not parse as an
    # integer falls back to DEFAULT_PORT.
    @port = Integer(port || ENV['LIGHTPANDA_PORT'] || DEFAULT_PORT, exception: false) || DEFAULT_PORT
  end

  # Fetches +url+ in a browser and yields a LightpandaPage to the block.
  #
  # @param url [#to_s] address to load; surrounding whitespace is stripped
  # @yieldparam page [LightpandaPage]
  # @return whatever the block returns
  # @raise [RuntimeError] when +url+ is blank
  def call(url)
    fail 'url is not specified' if url.blank?

    # Clean the URL by removing any whitespace or newlines
    clean_url = url.to_s.strip

    puts "Processing URL: #{clean_url}" if ENV['DEBUG']

    # Create a page-like object similar to Mechanize and hand it to the block
    yield(fetch_with_ferrum(clean_url))
  end

  def refresh!
    # No special refresh actions needed for Lightpanda
  end

  private

  # Try to wait for network to be idle.
  #
  # Only an ArgumentError from the call with +timeout:+ triggers the retry
  # without arguments; any other error from that first call propagates to the
  # caller. A failure of the retry is logged (in DEBUG) and replaced by a
  # fixed sleep.
  def wait_for_network_idle(browser)
    puts 'Waiting for network idle...' if ENV['DEBUG']

    begin
      # Try with timeout parameter
      browser.network.wait_for_idle(timeout: 15)
    rescue ArgumentError
      begin
        # Try without timeout parameter
        browser.network.wait_for_idle
      rescue StandardError => e
        # If wait_for_idle fails, fall back to a simple sleep
        puts "Warning: Could not wait for network idle: #{e.message}" if ENV['DEBUG']
        sleep 5 # Simple fallback
      end
    end
  end

  # Navigate to URL and get page content.
  #
  # @return the value of +browser.body+ after navigation and the idle wait
  def navigate_and_get_content(browser, url)
    # Navigate to the URL
    puts "Navigating to: #{url}" if ENV['DEBUG']
    browser.goto(url)

    # Wait for network idle
    wait_for_network_idle(browser)

    # Get the content
    puts 'Getting page content...' if ENV['DEBUG']
    browser.body
  end

  # Create browser options.
  #
  # @return [Hash] options passed to Ferrum::Browser.new; includes
  #   :browser_path only when ENV['CHROME_PATH'] names an existing file
  def create_browser_options
    options = {
      headless: true, # Run in headless mode
      timeout: 20, # Increase timeout
      process_timeout: 20, # Process timeout
      window_size: [1280, 800] # Set window size
    }

    # Add Chrome path if available
    chrome_path = ENV['CHROME_PATH']
    options[:browser_path] = chrome_path if chrome_path && File.exist?(chrome_path)

    options
  end

  # Start a Ferrum browser, load +url+ and wrap the resulting HTML.
  #
  # The browser is always quit, whether or not navigation succeeded. Errors
  # are logged in DEBUG mode and re-raised unchanged.
  #
  # @return [LightpandaPage]
  def fetch_with_ferrum(url) # rubocop:todo Metrics/PerceivedComplexity, Metrics/CyclomaticComplexity
    puts 'Using direct Chrome/Chromium instead of Lightpanda' if ENV['DEBUG']
    browser = nil
    html_content = nil

    begin
      # Create Ferrum browser
      browser = Ferrum::Browser.new(create_browser_options)

      # Skip user agent setting - not supported in this Ferrum version
      if @user_agent.present? && ENV.fetch('DEBUG', nil)
        puts 'User agent setting will be skipped - not supported in your Ferrum version'
      end

      # Navigate and get content
      html_content = navigate_and_get_content(browser, url)
    rescue StandardError => e
      puts "Error during page navigation: #{e.class} - #{e.message}" if ENV['DEBUG']
      raise
    ensure
      # Clean up - browser is still nil when Ferrum::Browser.new itself failed
      browser&.quit
    end

    # Create a Mechanize-like page object
    LightpandaPage.new(url, html_content)
  end

  # Page-like object that mimics the Mechanize::Page interface.
  #
  # +code+ and +response_code+ are always the literal '200'; no status is
  # taken from the browser.
  class LightpandaPage
    attr_reader :uri, :body, :title, :code, :response_code

    # @param url [String] address the content was loaded from (exposed as #uri)
    # @param html_content [String] markup parsed with Nokogiri::HTML
    def initialize(url, html_content)
      @uri = url
      @body = html_content
      @doc = Nokogiri::HTML(html_content)
      @title = @doc.title
      @code = '200'
      @response_code = '200'
    end

    # Common methods from Mechanize::Page that might be used in the application;
    # each one forwards its arguments to the parsed document.
    def search(*)
      @doc.search(*)
    end

    def at(*)
      @doc.at(*)
    end

    def css(*)
      @doc.css(*)
    end

    def xpath(*)
      @doc.xpath(*)
    end

    def encoding
      @doc.encoding
    end

    # Calls the method named by the first argument with the remaining
    # arguments when this object responds to it; returns nil otherwise.
    def try(*args)
      send(*args) if respond_to?(args.first)
    end
  end
end
@@ -1,20 +1,18 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # xml
5
3
  #
6
4
  class Spidy::Connector::Xml
7
5
  include Spidy::Connector::StaticAccessor
8
6
 
9
- def call(url, &block)
7
+ def call(url, &)
10
8
  fail 'URL is undefined' if url.blank?
11
9
 
12
- connect(url, &block)
10
+ connect(url, &)
13
11
  end
14
12
 
15
- def connect(url, &block)
13
+ def connect(url)
16
14
  OpenURI.open_uri(url, 'User-Agent' => @user_agent) do |body|
17
- block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
15
+ yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
18
16
  end
19
17
  rescue OpenURI::HTTPError => e
20
18
  raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # This class is responsible for actually making a network connection and downloading hypertext
5
3
  #
@@ -9,6 +7,7 @@ module Spidy::Connector
9
7
  autoload :Html
10
8
  autoload :Json
11
9
  autoload :Xml
10
+ autoload :Lightpanda
12
11
 
13
12
  DEFAULT_WAIT_TIME = 5
14
13
 
@@ -35,9 +34,9 @@ module Spidy::Connector
35
34
  module StaticAccessor
36
35
  extend ActiveSupport::Concern
37
36
  class_methods do
38
- def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &block)
37
+ def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &)
39
38
  ::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(
40
- url, &block
39
+ url, &
41
40
  )
42
41
  end
43
42
  end
@@ -75,9 +74,9 @@ module Spidy::Connector
75
74
  connect(url, &block)
76
75
  end
77
76
 
78
- def connect(url, retry_attempt_count: @retry_attempt_count, &block)
77
+ def connect(url, retry_attempt_count: @retry_attempt_count, &)
79
78
  logger.call('connnector.get': url, 'connnector.accessed': Time.current)
80
- origin_connector.call(url, &block)
79
+ origin_connector.call(url, &)
81
80
  rescue Spidy::Connector::Retry => e
82
81
  logger.call('retry.accessed': Time.current,
83
82
  'retry.uri': url,
@@ -105,9 +104,9 @@ module Spidy::Connector
105
104
  @socks_proxy = socks_proxy
106
105
  end
107
106
 
108
- def call(url, &block)
107
+ def call(url, &)
109
108
  Socksify.proxy(socks_proxy[:host], socks_proxy[:port]) do
110
- connector.call(url, &block)
109
+ connector.call(url, &)
111
110
  end
112
111
  end
113
112
 
data/lib/spidy/console.rb CHANGED
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # spidy console
5
3
  #
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Class representing a website defined by DSL
5
3
  #
@@ -42,12 +40,12 @@ module Spidy::Definition
42
40
  end
43
41
  end
44
42
 
45
- def spider(name = :default, connector: nil, as: nil, &define_block)
43
+ def spider(name = :default, connector: nil, as: nil)
46
44
  @namespace ||= {}
47
45
  connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
48
46
  socks_proxy: @socks_proxy)
49
47
  @namespace[:"#{name}_spider"] = proc do |source, &yielder|
50
- define_block.call(yielder, connector, source)
48
+ yield(yielder, connector, source)
51
49
  end
52
50
  end
53
51
 
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # spidy interface binding
5
3
  #
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # An object that represents the scraper defined by define block.
5
3
  #
data/lib/spidy/shell.rb CHANGED
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # spidy Shell
5
3
  #
data/lib/spidy/spider.rb CHANGED
@@ -1,11 +1,9 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Spider
5
3
  #
6
4
  class Spidy::Spider
7
- def initialize(&block)
8
- define_singleton_method(:bind, &block)
5
+ def initialize(&)
6
+ define_singleton_method(:bind, &)
9
7
  end
10
8
 
11
9
  def call(resource)