spidy 0.3.12 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # This script checks what Ferrum API methods are available
4
# Locates a Chromium-family browser executable on this machine.
#
# Checks well-known macOS application bundle paths first, then asks the shell
# (`which`) for a few common binary names.
#
# @return [String, nil] path of the first executable found, or nil if none
def find_chrome_path
  macos_paths = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'
  ]

  bundled = macos_paths.find { |path| File.exist?(path) }
  return bundled if bundled

  # The names are fixed literals, so interpolating them into the shell is safe.
  %w[google-chrome chromium chromium-browser].each do |browser_name|
    path = `which #{browser_name} 2>/dev/null`.strip
    return path if !path.empty? && File.exist?(path)
  end

  nil
end

# Probe script: loads Ferrum, starts a headless browser and prints which API
# methods are available on the browser and its network object.
browser = nil
begin
  require 'ferrum'
  puts 'Ferrum gem is loaded!'

  # Check Ferrum version
  puts "Ferrum version: #{defined?(Ferrum::VERSION) ? Ferrum::VERSION : 'unknown'}"

  # Try to create a browser instance
  puts "\nTrying to create a browser instance..."

  # An explicit CHROME_PATH wins over auto-detection.
  chrome_path = ENV['CHROME_PATH'] || find_chrome_path
  if chrome_path
    puts "Using Chrome executable: #{chrome_path}"
  else
    puts 'No Chrome executable found. Using default.'
  end

  # Create browser with options
  options = {
    headless: true,
    window_size: [1280, 800]
  }
  options[:browser_path] = chrome_path if chrome_path

  browser = Ferrum::Browser.new(options)
  puts 'Browser instance created successfully!'

  # Subtract Object's *instance* methods: Object.methods would also remove
  # names that merely collide with Class/Module methods and hide real API.
  puts "\nAvailable methods on browser object:"
  browser_methods = (browser.methods - Object.instance_methods).sort
  puts browser_methods.join(', ')

  # Check if headers method exists
  puts "\nDoes browser respond to 'headers='? #{browser.respond_to?(:headers=)}"

  # Check available methods on browser.network
  if browser.respond_to?(:network)
    puts "\nAvailable methods on browser.network object:"
    network_methods = (browser.network.methods - Object.instance_methods).sort
    puts network_methods.join(', ')

    # Check if wait_for_idle method exists and what parameters it accepts
    if browser.network.respond_to?(:wait_for_idle)
      puts "\nExamine wait_for_idle method:"
      begin
        # Try with timeout parameter
        browser.network.wait_for_idle(timeout: 1)
        puts 'wait_for_idle accepts timeout parameter'
      rescue ArgumentError => e
        puts "wait_for_idle does not accept timeout parameter: #{e.message}"
      rescue StandardError => e
        puts "Error calling wait_for_idle with timeout: #{e.message}"
      end
    else
      puts "\nwait_for_idle method not available on network object"
    end
  else
    puts "\nnetwork method not available on browser object"
  end

  # Test goto method
  puts "\nTesting navigation with goto method:"
  begin
    browser.goto('https://example.com')
    puts 'Navigation successful!'
    puts "Page title: #{browser.title}"
  rescue StandardError => e
    puts "Error during navigation: #{e.message}"
  end
rescue LoadError => e
  # LoadError is not a StandardError, so it needs its own clause.
  puts "Error: Ferrum gem is not installed: #{e.message}"
  puts 'Install it with: gem install ferrum'
rescue StandardError => e
  puts "Error: #{e.message}"
  puts e.backtrace.join("\n")
ensure
  # Clean up on every path so a failure above cannot leak the browser process.
  if browser
    begin
      browser.quit
      puts "\nBrowser closed successfully"
    rescue StandardError => e
      puts "Error: #{e.message}"
    end
  end
end
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Script to check if Lightpanda is running and start it if needed
4
+ require 'net/http'
5
+
6
# Reports whether an HTTP endpoint answers GET /json/version on host:port.
#
# Any 2xx response counts as "running". Every StandardError (connection
# refused, timeout, unresolvable host, ...) is treated as "not running".
#
# @param host [String] address to probe
# @param port [Integer] TCP port to probe
# @param timeout [Numeric] seconds allowed for connecting and for reading;
#   without it the defaults let one unanswered probe block for about a minute
# @return [Boolean] true when the endpoint answered with a 2xx status
def lightpanda_running?(host = '127.0.0.1', port = 9222, timeout: 2)
  uri = URI("http://#{host}:#{port}/json/version")
  response = Net::HTTP.start(uri.host, uri.port, open_timeout: timeout, read_timeout: timeout) do |http|
    http.get(uri.request_uri)
  end
  response.is_a?(Net::HTTPSuccess)
rescue StandardError
  false
end
13
+
14
# Launches `lightpanda serve` in the background and waits for it to answer.
#
# The binary is taken from the LIGHTPANDA_PATH environment variable and falls
# back to the original hard-coded location. The process is spawned directly
# (no shell), so a missing or non-executable binary is reported immediately
# instead of being hidden behind a backgrounding `&`.
#
# @param host [String] address passed to `--host`
# @param port [Integer] port passed to `--port`
# @return [Boolean] true once the service answers, false if it could not be
#   launched or did not answer within about 10 seconds
def start_lightpanda(host = '127.0.0.1', port = 9222)
  puts 'Starting Lightpanda...'

  binary = ENV.fetch('LIGHTPANDA_PATH', '/Users/aileron/bin/lightpanda')

  begin
    # stdout and stderr both go to the log file, as with `> log 2>&1`.
    pid = Process.spawn(binary, 'serve', '--host', host.to_s, '--port', port.to_s,
                        %i[out err] => '/tmp/lightpanda.log')
    # Detach so the child is reaped without this script waiting on it.
    Process.detach(pid)
  rescue SystemCallError
    puts "Failed to start Lightpanda. Make sure the path is correct: #{binary}"
    return false
  end

  puts "Lightpanda started! Service should be available at http://#{host}:#{port}"

  # Wait for it to be ready
  10.times do
    if lightpanda_running?(host, port)
      puts 'Lightpanda is now running and accepting connections!'
      return true
    end
    puts 'Waiting for Lightpanda to start...'
    sleep 1
  end

  puts "Lightpanda might have started but isn't responding yet."
  puts 'Check /tmp/lightpanda.log for details.'
  false
end
43
+
44
# Main script
#
# Runs only when this file is executed directly. Loading the file just for its
# helper methods must not print status lines or interpret the caller's ARGV.
if __FILE__ == $PROGRAM_NAME
  host = '127.0.0.1'
  port = 9222

  if lightpanda_running?(host, port)
    puts "✅ Lightpanda is already running at http://#{host}:#{port}"
  else
    puts "❌ Lightpanda is not running at http://#{host}:#{port}"

    if ARGV.include?('--start') || ARGV.include?('-s')
      start_lightpanda(host, port)
    else
      puts 'Run this script with --start or -s option to start Lightpanda automatically:'
      puts " #{$PROGRAM_NAME} --start"
    end
  end
end
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Test connecting to existing Chrome instance at 127.0.0.1:9222
4
+
5
# Load Ferrum up front; without it nothing below can run.
begin
  require 'ferrum'
  puts 'Successfully loaded Ferrum'
rescue LoadError => e
  puts "Ferrum is not installed: #{e.message}"
  puts "Run 'gem install ferrum' to install it"
  exit 1
end

puts 'Testing connection to Chrome at 127.0.0.1:9222'
puts '=============================================='

browser = nil
begin
  # Connect to the remote Chrome instance
  # Note: We're setting process: false to prevent launching a new browser
  browser = Ferrum::Browser.new(
    url: 'http://127.0.0.1:9222',
    process: false
  )

  # Access a test URL
  url = 'https://example.com'
  puts "Accessing: #{url}"
  browser.goto(url)

  # Get page title
  title = browser.title
  puts "Page title: #{title}"

  # Clean up browser connection (but don't close Chrome)
  browser.quit
  browser = nil # already released; nothing left for the ensure clause

  puts "\nSuccess! Connected to Chrome at 127.0.0.1:9222"
rescue StandardError => e
  puts "Error: #{e.message}"
  puts e.backtrace.join("\n")

  puts "\nTroubleshooting tips:"
  puts '1. Make sure Chrome is running with remote debugging enabled'
  puts '2. Verify the command: /Users/aileron/bin/lightpanda serve --host 127.0.0.1 --port 9222'
  puts '3. Check if you can access http://127.0.0.1:9222/json/version in your browser'
ensure
  # If something failed after connecting, still release the connection.
  # Best effort only: the original error has already been reported above.
  begin
    browser&.quit
  rescue StandardError
    nil
  end
end

puts "\nTest completed"
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/env ruby
2
+ # Enhanced Lightpanda link extractor
3
+
4
# Spidy definition that yields the href of every usable <a> element found on
# the start URL, fetched through the :lightpanda connector. Set the DEBUG
# environment variable to get progress output on stdout.
Spidy.define do
  # Set wait time to avoid timeouts
  wait_time 10

  # Set a standard user agent
  user_agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

  spider(as: :lightpanda) do |yielder, connector, start_url|
    puts "Extracting links from: #{start_url}" if ENV['DEBUG']

    connector.call(start_url) do |page|
      links = page.search('a')
      puts "Found #{links.size} links on page" if ENV['DEBUG']

      links.each do |a|
        # Try the accessors in order; the first one that does not raise wins.
        # If all three raise, give up on this link.
        href = begin
          a['href']
        rescue StandardError
          begin
            a.attr('href')
          rescue StandardError
            begin
              a.attribute('href')&.to_s
            rescue StandardError
              next
            end
          end
        end

        # Skip empty or invalid links
        next if href.nil? || href.empty? || href == '#'

        # The link text is only used in the debug line, so only build it then.
        if ENV['DEBUG']
          link_text = begin
            a.text.to_s.strip
          rescue StandardError
            begin
              a.inner_text.to_s.strip
            rescue StandardError
              begin
                a.content.to_s.strip
              rescue StandardError
                '(Could not extract text)'
              end
            end
          end

          # Guard against nil/empty before measuring the length, then truncate.
          link_text = '(No text)' if link_text.nil? || link_text.empty?
          link_text = "#{link_text[0...47]}..." if link_text.length > 50

          puts " - #{link_text}: #{href}"
        end

        # Yield the link
        yielder.call(href)
      end
    rescue StandardError => e
      puts "Error processing links: #{e.message}" if ENV['DEBUG']
      raise
    end
  rescue StandardError => e
    puts "Error in spider: #{e.message}" if ENV['DEBUG']
    raise
  end
end
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  Spidy.define do
4
2
  url_to_params = lambda { |url|
5
3
  uri = URI.parse(url)
@@ -9,7 +7,7 @@ Spidy.define do
9
7
 
10
8
  master_page = proc { |url, &yielder|
11
9
  params = url_to_params.call(url)
12
- page = params&.dig('page')&.to_i || 0
10
+ page = params&.dig('page').to_i
13
11
 
14
12
  limit_page = 3
15
13
  per_page = 25
data/example/proxy.rb CHANGED
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  Spidy.define do
4
2
  user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
5
3
  socks_proxy '127.0.0.1', 9050
data/example/retry.rb CHANGED
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  Spidy.define do
4
2
  spider(as: :json) do |yielder, connector|
5
3
  connector.call('https://httpbin.org/status/500') do |json|
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # This is a wrapper script that ensures Lightpanda is running before executing Spidy commands
4
+
5
require 'shellwords'
require_relative 'check_lightpanda'

# Check if Lightpanda is running, start it if not
unless lightpanda_running?
  puts 'Lightpanda is not running. Starting it now...'
  unless start_lightpanda
    puts 'Failed to start Lightpanda. Exiting.'
    exit 1
  end
end

# Execute the Spidy command with debug mode enabled
ENV['DEBUG'] = 'true'
url = ARGV[0] || 'https://example.com'
definition = ARGV[1] || 'example/lightpanda_links.rb'

puts "Running Spidy with URL: #{url} and definition: #{definition}"

# Both values come from the command line, so escape them before they reach the
# shell; otherwise quotes or metacharacters break or hijack the pipeline.
cmd = "echo #{Shellwords.escape(url)} | spidy each #{Shellwords.escape(definition)}"

puts "Executing: #{cmd}"
# exec replaces this process with the shell pipeline; nothing after it runs.
exec(cmd)
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Simple Ferrum usage test
4
+
5
# Load Ferrum up front; without it nothing below can run.
begin
  require 'ferrum'
  puts 'Successfully loaded Ferrum'
rescue LoadError => e
  puts "Ferrum is not installed: #{e.message}"
  puts "Run 'gem install ferrum' to install it"
  exit 1
end

puts 'Simple Browser Test'
puts '==================='

browser = nil
begin
  # Initialize Ferrum browser (headless mode)
  browser = Ferrum::Browser.new(headless: true)

  # Access URL
  url = 'https://react-shopping-cart-67954.firebaseapp.com/'
  puts "Accessing: #{url}"
  browser.goto(url)

  # Wait for page to load
  sleep 2

  # Get page title
  title = browser.title
  puts "Page title: #{title}"

  # Search for "Add to cart" buttons
  cart_buttons = browser.css('button.sc-124al1g-0')

  if cart_buttons.any?
    puts "\nFound #{cart_buttons.size} 'Add to cart' buttons"
  else
    puts "\nNo 'Add to cart' buttons found."

    # Try to get all buttons for debugging
    all_buttons = browser.css('button')
    puts "Found #{all_buttons.size} buttons on the page."
  end

  # Clean up browser
  browser.quit
  browser = nil # already closed; nothing left for the ensure clause
rescue StandardError => e
  puts "Error: #{e.message}"
  puts e.backtrace.join("\n")
ensure
  # If something failed after launch, still shut the browser down so the
  # headless process is not leaked. Best effort: the error is already reported.
  begin
    browser&.quit
  rescue StandardError
    nil
  end
end

puts "\nTest completed"
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # This script tests the fixed Lightpanda connector
4
+
5
+ require_relative '../lib/spidy'
6
+
7
+ # Enable debug output
8
+ ENV['DEBUG'] = 'true'
9
+
10
+ # Define a scraper using the lightpanda connector
11
+ scraper = Spidy.define do
12
+ # Define user agent
13
+ user_agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
14
+ 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
15
+
16
+ # Define a spider that uses the lightpanda connector
17
+ spider(as: :lightpanda) do |yielder, connector, url|
18
+ puts "Processing page with Chrome/Lightpanda: #{url}"
19
+ connector.call(url) do |page|
20
+ yielder.call(page)
21
+ end
22
+ end
23
+
24
+ # Define the object structure we want to extract
25
+ define(as: :html) do
26
+ let(:title, 'title')
27
+
28
+ # Example: extract links
29
+ let(:links) do |doc|
30
+ doc.css('a').take(5).map do |link|
31
+ {
32
+ text: link.text.to_s.strip,
33
+ href: link['href'].to_s
34
+ }
35
+ end
36
+ end
37
+ end
38
+ end
39
+
40
+ # URL to scrape
41
# Target page for this test run
url = 'https://example.com'

puts '=== Testing Chrome/Lightpanda Connection ==='
puts "URL: #{url}\n\n"

# Pre-flight: query the version endpoint on 127.0.0.1:9222 and report whether
# it answered with status 200. Failures are only reported, never raised.
begin
  require 'net/http'
  endpoint = URI('http://127.0.0.1:9222/json/version')
  response = Net::HTTP.new(endpoint.host, endpoint.port).get(endpoint.path)

  if response.code != '200'
    puts "Warning: Couldn't connect to Lightpanda at 127.0.0.1:9222"
    puts "Make sure it's running with: /Users/aileron/bin/lightpanda serve --host 127.0.0.1 --port 9222\n\n"
  else
    puts 'Lightpanda is running at 127.0.0.1:9222'
    puts "Version info: #{response.body[0..100]}...\n\n"
  end
rescue StandardError => e
  puts "Error checking Lightpanda: #{e.message}"
  puts "Make sure Lightpanda is running with: /Users/aileron/bin/lightpanda serve --host 127.0.0.1 --port 9222\n\n"
end
64
+
65
# Run the scraper against the target URL and print the title plus a numbered
# list of the extracted links. Any StandardError is reported with its backtrace.
begin
  result = scraper.call(url)

  puts "Page Title: #{result.title}"

  if result.links&.any?
    puts "\nFound #{result.links.size} links:"

    result.links.each_with_index do |entry, position|
      puts " #{position.succ}. #{entry[:text]} - URL: #{entry[:href]}"
    end
  else
    puts "\nNo links found."
  end
rescue StandardError => e
  puts "Error: #{e.message}"
  puts e.backtrace.join("\n")
end

puts "\n=== Test completed ==="
data/example/wikip.rb CHANGED
@@ -1,8 +1,6 @@
1
- # frozen_string_literal: true
2
-
3
1
  Spidy.define do
4
- def self.infobox_scrape(params, &block)
5
- call(params.html.at('.infobox'), name: :infobox, &block)
2
+ def self.infobox_scrape(params, &)
3
+ call(params.html.at('.infobox'), name: :infobox, &)
6
4
  end
7
5
 
8
6
  define(as: :html) do
data/exe/spidy CHANGED
@@ -1,8 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
2
  require 'spidy'
5
- require 'pry'
6
3
 
7
4
  if ARGV[1].blank?
8
5
  case ARGV[0]
@@ -1,4 +1,2 @@
1
- # frozen_string_literal: true
2
-
3
1
  class Spidy::Binder::Error < StandardError
4
2
  end
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Bind html and convert to object
5
3
  #
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Bind json and convert to object
5
3
  #
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Bind xml and convert to object
5
3
  #
data/lib/spidy/binder.rb CHANGED
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Bind resource received from the connection to the result object
5
3
  #
@@ -1,14 +1,12 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # spidy shell interface
5
3
  #
6
4
  class Spidy::CommandLine
7
5
  delegate :spidy, to: :@definition_file
8
- class_attribute :output, default: (proc { |result| $stdout.puts(result.to_s) })
9
- class_attribute :error_handler, default: (proc { |e, url|
10
- warn({ url: url, message: e.message, backtrace: e.backtrace }.to_json)
11
- })
6
+ class_attribute :output, default: proc { |result| $stdout.puts(result.to_s) }
7
+ class_attribute :error_handler, default: proc { |e, url|
8
+ warn({ url: url, message: e.message, backtrace: e.backtrace }.to_json)
9
+ }
12
10
 
13
11
  def eval_call(script)
14
12
  @definition_file.spidy.instance_eval(script)
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Direct resource ( not network resource )
5
3
  #
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Mechanize wrapper
5
3
  #
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # OpenURI to JSON.parse
5
3
  #
@@ -12,9 +10,9 @@ class Spidy::Connector::Json
12
10
  @user_agent = user_agent
13
11
  end
14
12
 
15
- def call(url, &block)
13
+ def call(url, &)
16
14
  fail 'url is not specified' if url.blank?
17
- connect(url, &block)
15
+ connect(url, &)
18
16
  end
19
17
 
20
18
  def connect(url)