spidy 0.4.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +53 -8
- data/Gemfile +18 -4
- data/Gemfile.lock +71 -19
- data/README.md +95 -16
- data/Rakefile +0 -2
- data/example/check_ferrum.rb +114 -0
- data/example/check_lightpanda.rb +59 -0
- data/example/connect_test.rb +48 -0
- data/example/lightpanda_links.rb +80 -0
- data/example/master_detail.rb +1 -3
- data/example/proxy.rb +0 -2
- data/example/retry.rb +0 -2
- data/example/run_with_lightpanda.rb +25 -0
- data/example/simple_test.rb +53 -0
- data/example/test_lightpanda.rb +86 -0
- data/example/wikip.rb +2 -4
- data/exe/spidy +0 -2
- data/lib/spidy/binder/error.rb +0 -2
- data/lib/spidy/binder/html.rb +0 -2
- data/lib/spidy/binder/json.rb +0 -2
- data/lib/spidy/binder/xml.rb +0 -2
- data/lib/spidy/binder.rb +0 -2
- data/lib/spidy/command_line.rb +4 -6
- data/lib/spidy/connector/direct.rb +0 -2
- data/lib/spidy/connector/html.rb +0 -2
- data/lib/spidy/connector/json.rb +2 -4
- data/lib/spidy/connector/lightpanda.rb +161 -0
- data/lib/spidy/connector/xml.rb +4 -6
- data/lib/spidy/connector.rb +7 -8
- data/lib/spidy/console.rb +0 -2
- data/lib/spidy/definition.rb +2 -4
- data/lib/spidy/definition_file.rb +0 -2
- data/lib/spidy/definition_object.rb +0 -2
- data/lib/spidy/shell.rb +0 -2
- data/lib/spidy/spider.rb +2 -4
- data/lib/spidy/version.rb +1 -3
- data/lib/spidy.rb +3 -5
- data/spidy.gemspec +4 -15
- metadata +11 -102
- data/.rubocop_todo.yml +0 -13
@@ -0,0 +1,80 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Enhanced Lightpanda link extractor
#
# Spidy definition that fetches the start URL through the :lightpanda
# connector and yields the href of every <a> element that has a usable one.
# Set DEBUG in the environment to trace progress on stdout.

Spidy.define do
  # Set wait time to avoid timeouts
  wait_time 10

  # Set a standard user agent
  user_agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

  # Calls each reader with +node+ in order and returns the value of the first
  # one that does not raise (that value may itself be nil). Returns +fallback+
  # when every reader raises.
  first_successful = lambda do |node, readers, fallback|
    readers.each do |reader|
      return reader.call(node)
    rescue StandardError
      next
    end
    fallback
  end

  # Different ways to get the href attribute, tried in this order
  href_readers = [
    ->(a) { a['href'] },
    ->(a) { a.attr('href') },
    ->(a) { a.attribute('href')&.to_s }
  ]

  # Different ways to get the link text, tried in this order
  text_readers = [
    ->(a) { a.text.to_s.strip },
    ->(a) { a.inner_text.to_s.strip },
    ->(a) { a.content.to_s.strip }
  ]

  spider(as: :lightpanda) do |yielder, connector, start_url|
    puts "Extracting links from: #{start_url}" if ENV['DEBUG']

    connector.call(start_url) do |page|
      links = page.search('a')
      puts "Found #{links.size} links on page" if ENV['DEBUG']

      links.each do |a|
        # A nil href means either no attribute or every reader raised
        href = first_successful.call(a, href_readers, nil)

        # Skip empty or invalid links
        next if href.nil? || href.empty? || href == '#'

        # The text is only used for the debug line below; every reader returns
        # a String, so no nil check is needed here
        link_text = first_successful.call(a, text_readers, '(Could not extract text)')
        link_text = '(No text)' if link_text.empty?

        # Truncate long text
        link_text = "#{link_text[0...47]}..." if link_text.length > 50

        # Output the link with its text if in debug mode
        puts " - #{link_text}: #{href}" if ENV['DEBUG']

        # Yield the link
        yielder.call(href)
      end
    rescue StandardError => e
      puts "Error processing links: #{e.message}" if ENV['DEBUG']
      raise
    end
  rescue StandardError => e
    puts "Error in spider: #{e.message}" if ENV['DEBUG']
    raise
  end
end
|
data/example/master_detail.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
1
|
Spidy.define do
|
4
2
|
url_to_params = lambda { |url|
|
5
3
|
uri = URI.parse(url)
|
@@ -9,7 +7,7 @@ Spidy.define do
|
|
9
7
|
|
10
8
|
master_page = proc { |url, &yielder|
|
11
9
|
params = url_to_params.call(url)
|
12
|
-
page = params&.dig('page')
|
10
|
+
page = params&.dig('page').to_i
|
13
11
|
|
14
12
|
limit_page = 3
|
15
13
|
per_page = 25
|
data/example/proxy.rb
CHANGED
data/example/retry.rb
CHANGED
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby

# This is a wrapper script that ensures Lightpanda is running before executing Spidy commands
#
# Usage: run_with_lightpanda.rb [URL] [DEFINITION_FILE]
# Both arguments are optional; see the defaults below.

require 'shellwords'
require_relative 'check_lightpanda'

# Check if Lightpanda is running, start it if not.
# lightpanda_running? and start_lightpanda are not defined in this script;
# they are expected to be provided by check_lightpanda.
unless lightpanda_running?
  puts 'Lightpanda is not running. Starting it now...'
  unless start_lightpanda
    puts 'Failed to start Lightpanda. Exiting.'
    exit 1
  end
end

# Execute the Spidy command with debug mode enabled
ENV['DEBUG'] = 'true'
url = ARGV[0] || 'https://example.com'
definition = ARGV[1] || 'example/lightpanda_links.rb'

puts "Running Spidy with URL: #{url} and definition: #{definition}"

# Both values come from the command line and end up inside a shell pipeline,
# so escape them: a quote, space or metacharacter in either one must not be
# able to break or extend the command.
cmd = "printf '%s\\n' #{Shellwords.escape(url)} | spidy each #{Shellwords.escape(definition)}"

puts "Executing: #{cmd}"
exec(cmd)
|
@@ -0,0 +1,53 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Simple Ferrum usage test
#
# Opens a demo shopping-cart page in a headless browser, prints its title and
# reports how many "Add to cart" buttons were found. Errors are printed, not
# raised, so the script always reaches the final "Test completed" line.

begin
  require 'ferrum'
  puts 'Successfully loaded Ferrum'
rescue LoadError => e
  puts "Ferrum is not installed: #{e.message}"
  puts "Run 'gem install ferrum' to install it"
  exit 1
end

puts 'Simple Browser Test'
puts '==================='

begin
  # Initialize Ferrum browser (headless mode)
  browser = Ferrum::Browser.new(headless: true)

  begin
    # Access URL
    url = 'https://react-shopping-cart-67954.firebaseapp.com/'
    puts "Accessing: #{url}"
    browser.goto(url)

    # Wait for page to load
    sleep 2

    # Get page title
    title = browser.title
    puts "Page title: #{title}"

    # Search for "Add to cart" buttons
    cart_buttons = browser.css('button.sc-124al1g-0')

    if cart_buttons.any?
      puts "\nFound #{cart_buttons.size} 'Add to cart' buttons"
    else
      puts "\nNo 'Add to cart' buttons found."

      # Try to get all buttons for debugging
      all_buttons = browser.css('button')
      puts "Found #{all_buttons.size} buttons on the page."
    end
  ensure
    # Clean up browser even when navigation or a query above raised, so a
    # failed run still quits the browser it started
    browser.quit
  end
rescue StandardError => e
  puts "Error: #{e.message}"
  puts e.backtrace.join("\n")
end

puts "\nTest completed"
|
@@ -0,0 +1,86 @@
|
|
1
|
+
#!/usr/bin/env ruby

# This script tests the fixed Lightpanda connector
#
# It probes the /json/version endpoint to report whether Lightpanda is
# reachable, then runs a small scraper against example.com and prints the page
# title and up to five links. Failures are printed rather than raised.

require 'net/http'
require_relative '../lib/spidy'

# Enable debug output
ENV['DEBUG'] = 'true'

# Endpoint to probe. Honors the same LIGHTPANDA_HOST / LIGHTPANDA_PORT
# variables that Spidy::Connector::Lightpanda reads, with the same defaults.
lightpanda_host = ENV.fetch('LIGHTPANDA_HOST', '127.0.0.1')
lightpanda_port = ENV.fetch('LIGHTPANDA_PORT', '9222')
lightpanda_endpoint = "#{lightpanda_host}:#{lightpanda_port}"

# Shown whenever the probe fails. Uses the bare executable name instead of a
# machine-specific absolute path so the hint is valid on any machine.
start_hint = "lightpanda serve --host #{lightpanda_host} --port #{lightpanda_port}"

# Define a scraper using the lightpanda connector
scraper = Spidy.define do
  # Define user agent
  user_agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

  # Define a spider that uses the lightpanda connector
  spider(as: :lightpanda) do |yielder, connector, url|
    puts "Processing page with Chrome/Lightpanda: #{url}"
    connector.call(url) do |page|
      yielder.call(page)
    end
  end

  # Define the object structure we want to extract
  define(as: :html) do
    let(:title, 'title')

    # Example: extract links
    let(:links) do |doc|
      doc.css('a').take(5).map do |link|
        {
          text: link.text.to_s.strip,
          href: link['href'].to_s
        }
      end
    end
  end
end

# URL to scrape
url = 'https://example.com'

puts '=== Testing Chrome/Lightpanda Connection ==='
puts "URL: #{url}\n\n"

begin
  # First, check if Lightpanda is running
  version_url = URI("http://#{lightpanda_endpoint}/json/version")

  # Short timeouts: this is only a liveness probe and must not hang the script
  response = Net::HTTP.start(version_url.host, version_url.port, open_timeout: 3, read_timeout: 3) do |http|
    http.get(version_url.path)
  end

  if response.code == '200'
    puts "Lightpanda is running at #{lightpanda_endpoint}"
    puts "Version info: #{response.body[0..100]}...\n\n"
  else
    puts "Warning: Couldn't connect to Lightpanda at #{lightpanda_endpoint}"
    puts "Make sure it's running with: #{start_hint}\n\n"
  end
rescue StandardError => e
  puts "Error checking Lightpanda: #{e.message}"
  puts "Make sure Lightpanda is running with: #{start_hint}\n\n"
end

begin
  # Execute the scraper
  result = scraper.call(url)

  # Display the results
  puts "Page Title: #{result.title}"

  if result.links&.any?
    puts "\nFound #{result.links.size} links:"

    result.links.each_with_index do |link, index|
      puts " #{index + 1}. #{link[:text]} - URL: #{link[:href]}"
    end
  else
    puts "\nNo links found."
  end
rescue StandardError => e
  puts "Error: #{e.message}"
  puts e.backtrace.join("\n")
end

puts "\n=== Test completed ==="
|
data/example/wikip.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
1
|
Spidy.define do
|
4
|
-
def self.infobox_scrape(params, &
|
5
|
-
call(params.html.at('.infobox'), name: :infobox, &
|
2
|
+
def self.infobox_scrape(params, &)
|
3
|
+
call(params.html.at('.infobox'), name: :infobox, &)
|
6
4
|
end
|
7
5
|
|
8
6
|
define(as: :html) do
|
data/exe/spidy
CHANGED
data/lib/spidy/binder/error.rb
CHANGED
data/lib/spidy/binder/html.rb
CHANGED
data/lib/spidy/binder/json.rb
CHANGED
data/lib/spidy/binder/xml.rb
CHANGED
data/lib/spidy/binder.rb
CHANGED
data/lib/spidy/command_line.rb
CHANGED
@@ -1,14 +1,12 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
1
|
#
|
4
2
|
# spidy shell interface
|
5
3
|
#
|
6
4
|
class Spidy::CommandLine
|
7
5
|
delegate :spidy, to: :@definition_file
|
8
|
-
class_attribute :output, default:
|
9
|
-
class_attribute :error_handler, default:
|
10
|
-
|
11
|
-
|
6
|
+
class_attribute :output, default: proc { |result| $stdout.puts(result.to_s) }
|
7
|
+
class_attribute :error_handler, default: proc { |e, url|
|
8
|
+
warn({ url: url, message: e.message, backtrace: e.backtrace }.to_json)
|
9
|
+
}
|
12
10
|
|
13
11
|
def eval_call(script)
|
14
12
|
@definition_file.spidy.instance_eval(script)
|
data/lib/spidy/connector/html.rb
CHANGED
data/lib/spidy/connector/json.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
1
|
#
|
4
2
|
# OpenURI to JSON.parse
|
5
3
|
#
|
@@ -12,9 +10,9 @@ class Spidy::Connector::Json
|
|
12
10
|
@user_agent = user_agent
|
13
11
|
end
|
14
12
|
|
15
|
-
def call(url, &
|
13
|
+
def call(url, &)
|
16
14
|
fail 'url is not specified' if url.blank?
|
17
|
-
connect(url, &
|
15
|
+
connect(url, &)
|
18
16
|
end
|
19
17
|
|
20
18
|
def connect(url)
|
@@ -0,0 +1,161 @@
|
|
1
|
+
#
# Lightpanda connector for JavaScript-rendered pages via CDP
# Using Ferrum for direct CDP connection
#
# Each #call creates a Ferrum browser, navigates to the URL, waits for network
# activity to settle and yields a LightpandaPage built from the page HTML.
#
# NOTE(review): host and port are stored and exposed through readers, but
# nothing in this class passes them on to Ferrum — confirm whether connecting
# to a running Lightpanda instance is still intended.
#
class Spidy::Connector::Lightpanda
  include Spidy::Connector::StaticAccessor

  attr_reader :user_agent, :host, :port

  DEFAULT_HOST = '127.0.0.1'.freeze
  DEFAULT_PORT = 9222

  # @param user_agent [String] requested user agent (stored; see fetch_with_ferrum)
  # @param host [String, nil] falls back to ENV['LIGHTPANDA_HOST'], then DEFAULT_HOST
  # @param port [Integer, String, nil] falls back to ENV['LIGHTPANDA_PORT'], then DEFAULT_PORT
  # @raise [RuntimeError] when the ferrum gem cannot be loaded
  def initialize(user_agent:, host: nil, port: nil)
    # Loaded lazily so the gem is only needed when this connector is used
    begin
      require 'ferrum'
    rescue LoadError
      raise 'Ferrum gem is required. Please install with: gem install ferrum'
    end

    @user_agent = user_agent
    # An environment variable that is set but empty counts as unset
    @host = host || ENV['LIGHTPANDA_HOST'].presence || DEFAULT_HOST
    # Environment values are Strings; normalize so #port is always an Integer,
    # falling back to the default when the value is not a valid number
    @port = Integer(port || ENV['LIGHTPANDA_PORT'].presence || DEFAULT_PORT, exception: false) || DEFAULT_PORT
  end

  # Fetches +url+ and yields the resulting page object to the block.
  #
  # @param url [#to_s] address to load; surrounding whitespace is stripped
  # @yieldparam page [LightpandaPage]
  # @raise [RuntimeError] when url is blank
  def call(url)
    fail 'url is not specified' if url.blank?

    # Clean the URL by removing any whitespace or newlines
    clean_url = url.to_s.strip

    log_debug "Processing URL: #{clean_url}"

    # Create a page-like object similar to Mechanize and hand it to the block
    yield(fetch_with_ferrum(clean_url))
  end

  def refresh!
    # No special refresh actions needed for Lightpanda
  end

  private

  # Prints +message+ to stdout only when the DEBUG environment variable is set
  def log_debug(message)
    puts message if ENV['DEBUG']
  end

  # Try to wait for network to be idle
  def wait_for_network_idle(browser)
    log_debug 'Waiting for network idle...'

    # Try with timeout parameter
    browser.network.wait_for_idle(timeout: 15)
  rescue ArgumentError
    begin
      # Try without timeout parameter
      browser.network.wait_for_idle
    rescue StandardError => e
      # If wait_for_idle fails, fall back to a simple sleep
      log_debug "Warning: Could not wait for network idle: #{e.message}"
      sleep 5 # Simple fallback
    end
  end

  # Navigate to URL and get page content
  #
  # @return the value of browser.body after navigation and the idle wait
  def navigate_and_get_content(browser, url)
    # Navigate to the URL
    log_debug "Navigating to: #{url}"
    browser.goto(url)

    # Wait for network idle
    wait_for_network_idle(browser)

    # Get the content
    log_debug 'Getting page content...'
    browser.body
  end

  # Create browser options
  #
  # @return [Hash] options passed to Ferrum::Browser.new
  def create_browser_options
    options = {
      headless: true,          # Run in headless mode
      timeout: 20,             # Increase timeout
      process_timeout: 20,     # Process timeout
      window_size: [1280, 800] # Set window size
    }

    # Add Chrome path if available
    chrome_path = ENV['CHROME_PATH']
    options[:browser_path] = chrome_path if chrome_path && File.exist?(chrome_path)

    options
  end

  # Creates a Ferrum browser, fetches the page and wraps the HTML.
  #
  # @param url [String]
  # @return [LightpandaPage]
  # @raise [StandardError] re-raises whatever browser creation or navigation raised
  def fetch_with_ferrum(url)
    log_debug 'Using direct Chrome/Chromium instead of Lightpanda'
    browser = nil

    begin
      # Create Ferrum browser
      browser = Ferrum::Browser.new(create_browser_options)

      # Skip user agent setting - not supported in this Ferrum version
      if @user_agent.present?
        log_debug 'User agent setting will be skipped - not supported in your Ferrum version'
      end

      # Navigate and get content
      html_content = navigate_and_get_content(browser, url)
    rescue StandardError => e
      log_debug "Error during page navigation: #{e.class} - #{e.message}"
      raise
    ensure
      # Clean up - ensure browser is always closed, even if an error occurred.
      # browser is still nil when Ferrum::Browser.new itself raised.
      browser&.quit
    end

    # Create a Mechanize-like page object
    LightpandaPage.new(url, html_content)
  end

  # Page-like object that mimics the Mechanize::Page interface
  class LightpandaPage
    attr_reader :uri, :body, :title, :code, :response_code

    # @param url [String] address the HTML was loaded from
    # @param html_content [String] HTML to parse with Nokogiri
    def initialize(url, html_content)
      @uri = url
      @body = html_content
      @doc = Nokogiri::HTML(html_content)
      @title = @doc.title
      # The status is always reported as 200; no real response code is read here
      @code = '200'
      @response_code = '200'
    end

    # Common methods from Mechanize::Page that might be used in the application;
    # each one forwards its arguments to the parsed Nokogiri document
    def search(*)
      @doc.search(*)
    end

    def at(*)
      @doc.at(*)
    end

    def css(*)
      @doc.css(*)
    end

    def xpath(*)
      @doc.xpath(*)
    end

    def encoding
      @doc.encoding
    end

    # Calls the public method named by the first argument with the remaining
    # arguments, or returns nil when no name is given or the page does not
    # respond to it.
    def try(*args, &block)
      public_send(*args, &block) if args.any? && respond_to?(args.first)
    end
  end
end
|
data/lib/spidy/connector/xml.rb
CHANGED
@@ -1,20 +1,18 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
1
|
#
|
4
2
|
# xml
|
5
3
|
#
|
6
4
|
class Spidy::Connector::Xml
|
7
5
|
include Spidy::Connector::StaticAccessor
|
8
6
|
|
9
|
-
def call(url, &
|
7
|
+
def call(url, &)
|
10
8
|
fail 'URL is undefined' if url.blank?
|
11
9
|
|
12
|
-
connect(url, &
|
10
|
+
connect(url, &)
|
13
11
|
end
|
14
12
|
|
15
|
-
def connect(url
|
13
|
+
def connect(url)
|
16
14
|
OpenURI.open_uri(url, 'User-Agent' => @user_agent) do |body|
|
17
|
-
|
15
|
+
yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
|
18
16
|
end
|
19
17
|
rescue OpenURI::HTTPError => e
|
20
18
|
raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
|
data/lib/spidy/connector.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
1
|
#
|
4
2
|
# This class is responsible for actually making a network connection and downloading hypertext
|
5
3
|
#
|
@@ -9,6 +7,7 @@ module Spidy::Connector
|
|
9
7
|
autoload :Html
|
10
8
|
autoload :Json
|
11
9
|
autoload :Xml
|
10
|
+
autoload :Lightpanda
|
12
11
|
|
13
12
|
DEFAULT_WAIT_TIME = 5
|
14
13
|
|
@@ -35,9 +34,9 @@ module Spidy::Connector
|
|
35
34
|
module StaticAccessor
|
36
35
|
extend ActiveSupport::Concern
|
37
36
|
class_methods do
|
38
|
-
def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &
|
37
|
+
def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &)
|
39
38
|
::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(
|
40
|
-
url, &
|
39
|
+
url, &
|
41
40
|
)
|
42
41
|
end
|
43
42
|
end
|
@@ -75,9 +74,9 @@ module Spidy::Connector
|
|
75
74
|
connect(url, &block)
|
76
75
|
end
|
77
76
|
|
78
|
-
def connect(url, retry_attempt_count: @retry_attempt_count, &
|
77
|
+
def connect(url, retry_attempt_count: @retry_attempt_count, &)
|
79
78
|
logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
80
|
-
origin_connector.call(url, &
|
79
|
+
origin_connector.call(url, &)
|
81
80
|
rescue Spidy::Connector::Retry => e
|
82
81
|
logger.call('retry.accessed': Time.current,
|
83
82
|
'retry.uri': url,
|
@@ -105,9 +104,9 @@ module Spidy::Connector
|
|
105
104
|
@socks_proxy = socks_proxy
|
106
105
|
end
|
107
106
|
|
108
|
-
def call(url, &
|
107
|
+
def call(url, &)
|
109
108
|
Socksify.proxy(socks_proxy[:host], socks_proxy[:port]) do
|
110
|
-
connector.call(url, &
|
109
|
+
connector.call(url, &)
|
111
110
|
end
|
112
111
|
end
|
113
112
|
|
data/lib/spidy/console.rb
CHANGED
data/lib/spidy/definition.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
1
|
#
|
4
2
|
# Class representing a website defined by DSL
|
5
3
|
#
|
@@ -42,12 +40,12 @@ module Spidy::Definition
|
|
42
40
|
end
|
43
41
|
end
|
44
42
|
|
45
|
-
def spider(name = :default, connector: nil, as: nil
|
43
|
+
def spider(name = :default, connector: nil, as: nil)
|
46
44
|
@namespace ||= {}
|
47
45
|
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
|
48
46
|
socks_proxy: @socks_proxy)
|
49
47
|
@namespace[:"#{name}_spider"] = proc do |source, &yielder|
|
50
|
-
|
48
|
+
yield(yielder, connector, source)
|
51
49
|
end
|
52
50
|
end
|
53
51
|
|
data/lib/spidy/shell.rb
CHANGED
data/lib/spidy/spider.rb
CHANGED