scraypa 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,20 @@
1
module Scraypa
  # Plain settings holder for a Scraypa session.
  #
  # Every option is exposed through a reader and a writer. A freshly built
  # configuration has all options set to nil except
  # reset_driver_every_n_requests, which starts at 5.
  class Configuration
    attr_accessor :use_capybara, :driver, :driver_options
    attr_accessor :tor, :tor_options
    attr_accessor :user_agent, :eye_tor_config_template, :throttle_seconds
    attr_accessor :headless_chromium, :reset_driver_every_n_requests

    # Initialises every option to its default value.
    def initialize
      # Assigned one by one (rather than chained) so the instance variables
      # are created in a stable, readable order.
      @use_capybara = nil
      @tor = nil
      @tor_options = nil
      @user_agent = nil
      @driver = nil
      @driver_options = nil
      @eye_tor_config_template = nil
      @throttle_seconds = nil
      @headless_chromium = nil
      @reset_driver_every_n_requests = 5
    end
  end
end
@@ -0,0 +1,37 @@
1
module Scraypa
  include Capybara::DSL

  # Counts requests and resets the active Capybara driver once the count
  # reaches a configured threshold.
  class DriverResetter
    attr_accessor :requests_since_last_reset

    # every_n_requests - request count at which a driver reset is triggered.
    def initialize(every_n_requests)
      @every_n_requests = every_n_requests
      @requests_since_last_reset = 0
    end

    # Registers one more request and, when the threshold has been reached,
    # resets the current driver and restarts the count.
    #
    # Returns 0 when a reset happened, nil otherwise.
    def reset_if_nth_request
      @requests_since_last_reset += 1
      return unless @requests_since_last_reset >= @every_n_requests

      if poltergeist_driver?
        reset_poltergeist_driver
      else
        reset_headless_chromium_driver
      end
      @requests_since_last_reset = 0
    end

    private

    # True when the current Capybara driver is a poltergeist variant.
    #
    # Matches on a substring instead of comparing with :poltergeist exactly,
    # because poltergeist drivers can be registered under suffixed names
    # (for example with a "tor<port>" suffix, or :poltergeist_billy). The
    # session-name checks below use the same substring approach.
    def poltergeist_driver?
      Capybara.current_driver.to_s.include?('poltergeist')
    end

    # Resets all sessions, then restarts the driver of every pooled session
    # whose name mentions poltergeist.
    def reset_poltergeist_driver
      Capybara.reset_sessions!
      # session_pool is not public API on Capybara, hence the send.
      Capybara.send(:session_pool).each do |session_name, session|
        session.driver.restart if session_name.include?('poltergeist')
      end
    end

    # Resets all sessions, then quits the driver of every pooled session
    # whose name mentions headless_chromium.
    def reset_headless_chromium_driver
      Capybara.reset_sessions!
      Capybara.send(:session_pool).each do |session_name, session|
        session.driver.quit if session_name.include?('headless_chromium')
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'rest-client'
2
+
3
module Scraypa
  # Spaces requests out by sleeping until a minimum interval has passed
  # since last_request_time.
  #
  # The throttle never updates last_request_time itself; the caller is
  # expected to assign it.
  class Throttle
    attr_accessor :last_request_time
    attr_reader :seconds

    # params - options Hash:
    #   :seconds - either a numeric delay, or a Hash with :from and :to
    #              bounds from which a random delay is drawn on each call.
    #              Defaults to nil, which disables throttling.
    def initialize(params = {})
      @seconds = params.fetch(:seconds, nil)
    end

    # Sleeps for whatever part of the configured delay has not yet elapsed
    # since last_request_time.
    #
    # Returns nil when throttling is disabled, when no last request time is
    # known, or when the delay has already elapsed; otherwise returns the
    # value of Kernel#sleep.
    def throttle
      return nil unless @seconds && @last_request_time

      sleep_from_last_request_time_for(delay_in_seconds)
    end

    private

    # The delay to apply for this call: the fixed value, or a random value
    # within the configured :from..:to range.
    def delay_in_seconds
      return @seconds unless @seconds.is_a?(Hash)

      Random.new.rand(@seconds[:from]..@seconds[:to])
    end

    # Sleeps for `seconds` minus the time already spent since the last
    # request; does nothing when that remainder is not positive.
    def sleep_from_last_request_time_for(seconds)
      elapsed = @last_request_time ? Time.now - @last_request_time : 0
      remaining = seconds - elapsed
      sleep(remaining) if remaining > 0
    end
  end
end
@@ -0,0 +1,15 @@
1
module Scraypa
  # Base class for user agent providers.
  #
  # Both user_agent and list raise NotImplementedError here, so a subclass
  # has to override whichever of them it wants to support.
  class UserAgentAbstract
    # Accepts and ignores any arguments so subclasses can call super freely.
    def initialize(*args)
    end

    # Raises NotImplementedError unless overridden.
    def user_agent
      raise NotImplementedError, 'user_agent action not implemented.'
    end

    # Raises NotImplementedError unless overridden.
    def list
      raise NotImplementedError, 'list action not implemented'
    end
  end
end
@@ -0,0 +1,27 @@
1
module Scraypa
  # Desktop browser user agent strings keyed by a short alias.
  # Frozen so the shared table cannot be modified by accident.
  USER_AGENT_LIST = {
    'Linux Firefox' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:43.0) Gecko/20100101 Firefox/43.0',
    'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
    'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
    'Mac Firefox' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:43.0) Gecko/20100101 Firefox/43.0',
    'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
    'Mac Safari 4' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; de-at) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
    'Mac Safari' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9',
    'Windows Chrome' => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.125 Safari/537.36',
    'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Windows IE 8' => 'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Windows IE 9' => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Windows IE 10' => 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
    'Windows IE 11' => 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Windows Edge' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586',
    'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
    'Windows Firefox' => 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
  }.freeze

  # Mobile device user agent strings keyed by a short alias.
  # Frozen for the same reason as USER_AGENT_LIST.
  USER_AGENT_MOBILE_LIST = {
    'iPhone' => 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B5110e Safari/601.1',
    'iPad' => 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
    'Android' => 'Mozilla/5.0 (Linux; Android 5.1.1; Nexus 7 Build/LMY47V) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.76 Safari/537.36'
  }.freeze
end
@@ -0,0 +1,19 @@
1
module Scraypa
  # Chooses which user agent provider class to instantiate.
  class UserAgentFactory
    # Builds a user agent provider from an optional configuration Hash.
    #
    # The first argument, when present, is inspected for a :method key:
    #   method: :randomizer  -> UserAgentRandom
    #   anything else / none -> UserAgentIterator
    #
    # All arguments are forwarded untouched to the chosen class. An example
    # configuration:
    #
    #   {
    #     method: :common_aliases,   # or :randomizer
    #     list: :common_aliases,
    #     strategy: :randomize,
    #     change_after_n_requests: 2
    #   }
    #
    # Returns the newly built provider instance.
    def self.build(*args)
      config = args[0]
      provider_class =
        if config && :randomizer == config[:method]
          UserAgentRandom
        else
          UserAgentIterator
        end
      provider_class.new(*args)
    end
  end
end
@@ -0,0 +1,97 @@
1
module Scraypa
  # Hands out user agents from a fixed list, either in round-robin order or
  # at random, switching to a new one after a configurable number of
  # requests.
  class UserAgentIterator < UserAgentAbstract
    attr_reader :current_user_agent

    # args[0] - optional configuration Hash:
    #   :change_after_n_requests - how many user_agent calls share one agent
    #                              before a new one is picked (default 0,
    #                              i.e. a new agent on every call).
    #   :list_limit              - keep at most this many entries of the
    #                              list; 0 or less keeps everything.
    #   :strategy                - :randomize or anything else for
    #                              round-robin (default :roundrobin).
    #   :list                    - Array, Hash (its values are used) or a
    #                              single value; defaults to USER_AGENT_LIST.
    def initialize(*args)
      super(*args)
      @config = args[0] || {}
      @change_after_n_requests = @config.fetch(:change_after_n_requests, 0)
      @list_limit = @config.fetch(:list_limit, 0).to_i
      @strategy = @config.fetch(:strategy, :roundrobin)
      # Block form so the default constant is only looked up when needed.
      @list = limit_list(to_array(@config.fetch(:list) { USER_AGENT_LIST }))
      @reducing_list = @list.clone
      @current_user_agent = nil
      @current_user_agent_requests = 0
    end

    # Returns the user agent to use for the next request, picking a new one
    # when none is selected yet or the current one has served its quota.
    def user_agent
      if get_a_new_user_agent?
        @current_user_agent_requests = 0
        select_user_agent_using_strategy
      else
        @current_user_agent_requests += 1
        @current_user_agent
      end
    end

    # Returns the (possibly limited) Array of user agents in use.
    def list
      @list
    end

    private

    # Normalises the configured list: Arrays pass through, Hashes contribute
    # their values, anything else is wrapped in a one-element Array.
    def to_array(variable)
      case variable
      when Array then variable
      when Hash then variable.values
      else [variable]
      end
    end

    # Cuts the list down to @list_limit entries: a random selection for the
    # :randomize strategy, the leading entries otherwise. Lists that are
    # already short enough (or a non-positive limit) are returned unchanged.
    def limit_list(list)
      return list if @list_limit <= 0 || @list_limit >= list.length
      return limit_list_randomly(list) if @strategy == :randomize

      list.first(@list_limit)
    end

    # Picks up to @list_limit distinct entries at random.
    #
    # Works on a de-duplicated copy, so the caller's Array is left intact
    # and the selection always terminates, even when the list holds fewer
    # unique entries than the limit (the result is then simply shorter).
    def limit_list_randomly(list)
      list.uniq.sample(@list_limit)
    end

    # True when no agent is selected yet or the current one has been handed
    # out at least @change_after_n_requests times.
    def get_a_new_user_agent?
      !@current_user_agent ||
        @current_user_agent_requests >= @change_after_n_requests
    end

    def select_user_agent_using_strategy
      if @strategy == :randomize
        random_user_agent_from_list
      else
        next_user_agent_from_list
      end
    end

    def random_user_agent_from_list
      @current_user_agent_requests += 1
      @current_user_agent = ensure_a_new_random_user_agent
    end

    # Returns a random entry that differs from the current user agent.
    #
    # When no differing entry exists (empty list, single entry, or all
    # entries identical) it falls back to the first entry, which is nil for
    # an empty list, instead of retrying forever.
    def ensure_a_new_random_user_agent
      candidates = @list.reject { |agent| agent == @current_user_agent }
      candidates.empty? ? @list.first : candidates.sample
    end

    # Takes the next entry in order, refilling the working copy from @list
    # once it has been used up.
    def next_user_agent_from_list
      @reducing_list = @list.clone if @reducing_list.empty?
      @current_user_agent_requests += 1
      @current_user_agent = @reducing_list.shift
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require 'useragents'
2
+
3
module Scraypa
  # Hands out user agents obtained from UserAgents.rand, switching to a new
  # one after a configurable number of requests. With a positive
  # :list_limit a fixed pool is drawn up front and cycled through in order.
  class UserAgentRandom < UserAgentAbstract
    attr_reader :current_user_agent

    # args[0] - optional configuration Hash:
    #   :change_after_n_requests - how many user_agent calls share one agent
    #                              before a new one is picked (default 0,
    #                              i.e. a new agent on every call).
    #   :list_limit              - when greater than 0, pre-draw this many
    #                              distinct agents and only use those.
    def initialize(*args)
      super(*args)
      @config = args[0] || {}
      @change_after_n_requests = @config.fetch(:change_after_n_requests, 0)
      @list_limit = @config.fetch(:list_limit, 0).to_i
      random_limited_list if @list_limit > 0
      @current_user_agent = nil
      @current_user_agent_requests = 0
    end

    # Returns the user agent to use for the next request, picking a new one
    # when none is selected yet or the current one has served its quota.
    def user_agent
      unless get_a_new_user_agent?
        @current_user_agent_requests += 1
        return @current_user_agent
      end

      @current_user_agent_requests = 0
      # @list only exists when a limited pool was drawn in initialize.
      @list ? next_user_agent_from_list : select_user_agent_using_randomizer
    end

    private

    # Fills @list with @list_limit distinct random user agents and primes
    # the working copy used for cycling.
    #
    # NOTE(review): this keeps drawing until enough distinct agents were
    # seen, so a limit above the number of agents UserAgents.rand can
    # produce would never finish - confirm the pool size.
    def random_limited_list
      @list = []
      loop do
        candidate = UserAgents.rand
        @list << candidate unless @list.include?(candidate)
        break if @list.length >= @list_limit
      end
      @reducing_list = @list.clone
    end

    # True when no agent is selected yet or the current one has been handed
    # out at least @change_after_n_requests times.
    def get_a_new_user_agent?
      !@current_user_agent ||
        @current_user_agent_requests >= @change_after_n_requests
    end

    def select_user_agent_using_randomizer
      @current_user_agent_requests += 1
      @current_user_agent = ensure_a_new_random_user_agent
    end

    # Draws random user agents until one differs from the current one.
    def ensure_a_new_random_user_agent
      loop do
        candidate = UserAgents.rand
        return candidate unless candidate == @current_user_agent
      end
    end

    # Takes the next entry of the pre-drawn pool, refilling the working copy
    # from @list once it has been used up.
    def next_user_agent_from_list
      @reducing_list = @list.clone if @reducing_list.empty?
      @current_user_agent_requests += 1
      @current_user_agent = @reducing_list.shift
    end
  end
end
@@ -0,0 +1,3 @@
1
module Scraypa
  # Version string of this library; frozen so it cannot be edited in place.
  VERSION = "0.1.1".freeze
end
@@ -0,0 +1,120 @@
1
module Scraypa
  include Capybara::DSL

  # Visits pages through Capybara using a Selenium-driven Chrome/Chromium
  # (driver :headless_chromium) or the :selenium_chrome_billy driver.
  #
  # For :headless_chromium one Capybara driver is registered per distinct
  # user agent, so switching user agent means switching default driver.
  class VisitCapybaraHeadlessChromium < VisitInterface
    # params - Hash with:
    #   :config               - configuration object (reads driver, tor,
    #                           tor_options and headless_chromium).
    #   :driver_resetter      - optional object responding to
    #                           reset_if_nth_request.
    #   :user_agent_retriever - optional object responding to user_agent.
    #
    # Raises CapybaraDriverUnsupported for any other configured driver.
    def initialize(params = {})
      super(params)
      @config = params[:config]
      @driver_resetter = params[:driver_resetter]
      @user_agent_retriever = params[:user_agent_retriever]
      reset_and_setup_driver
    end

    # Visits params[:url] and returns Capybara.page.
    def execute(params = {})
      visit_get_response(params)
    end

    private

    def visit_get_response(params = {})
      # The first visit skips the user agent refresh: for :headless_chromium
      # the constructor already asked the retriever for one.
      # NOTE(review): with :selenium_chrome_billy @user_agents is never
      # initialised, so a user agent change on a later visit would hit nil
      # in update_user_agent_and_setup_driver - confirm intended behaviour.
      update_user_agent_if_changed if @has_visited
      @has_visited = true
      Capybara.visit params[:url]
      @driver_resetter.reset_if_nth_request if @driver_resetter
      Capybara.page
    end

    # Asks the retriever (if any) for a user agent and switches driver when
    # it differs from the one currently in use.
    def update_user_agent_if_changed
      return unless @user_agent_retriever

      new_user_agent = @user_agent_retriever.user_agent
      return if @current_user_agent == new_user_agent

      update_user_agent_and_setup_driver(new_user_agent)
    end

    # Records the new user agent (remembering each distinct one so it keeps
    # a stable index for the driver name) and activates its driver.
    def update_user_agent_and_setup_driver(new_user_agent)
      @current_user_agent = new_user_agent
      @user_agents << @current_user_agent unless
        @user_agents.include?(@current_user_agent)
      setup_headless_chromium_driver
    end

    def reset_and_setup_driver
      case @config.driver
      when :headless_chromium
        reset_headless_chromium_drivers
        update_user_agent_if_changed
        setup_headless_chromium_driver
      when :selenium_chrome_billy
        setup_billy_driver
      else
        raise CapybaraDriverUnsupported,
              "Currently no support for capybara driver: #{@config.driver}"
      end
    end

    # Empties the session pool, drops every registered driver except the
    # two billy drivers, and forgets all known user agents.
    def reset_headless_chromium_drivers
      clear_capybara_session_pool
      kept_drivers = [:poltergeist_billy, :selenium_chrome_billy]
      Capybara.drivers.delete_if { |name, _driver| !kept_drivers.include?(name) }
      @current_user_agent = nil
      @user_agents = []
    end

    # Resets all sessions, quits the drivers of headless_chromium sessions
    # and then removes every session from the pool.
    def clear_capybara_session_pool
      Capybara.reset_sessions!
      # session_pool is not public API on Capybara, hence the send.
      Capybara.send(:session_pool).each do |session_name, session|
        session.driver.quit if session_name.include?('headless_chromium')
      end
      Capybara.send(:session_pool).delete_if { true }
    end

    def setup_billy_driver
      Capybara.javascript_driver = @config.driver
    end

    # Registers (once per name) and selects the driver matching the current
    # configuration and user agent.
    def setup_headless_chromium_driver
      driver_name = driver_name_from_config
      unless Capybara.drivers.keys.include?(driver_name)
        Capybara.register_driver(driver_name) do |app|
          Capybara::Selenium::Driver.new(app,
                                         build_driver_options_from_config)
        end
      end
      Capybara.default_driver = driver_name
    end

    # Driver name: the configured driver, plus "tor<port>" when tor is
    # enabled, plus "ua<index>" when a user agent is in use.
    def driver_name_from_config
      name = @config.driver.to_s
      name += "tor#{@config.tor_options[:tor_port]}" if @config.tor
      name += "ua#{@user_agents.index(@current_user_agent)}" if @current_user_agent
      name.to_sym
    end

    # The headless_chromium section of the configuration, or an empty Hash
    # when it was left unset, so lookups below never hit nil.
    def headless_chromium_settings
      @config.headless_chromium || {}
    end

    # Builds the options Hash handed to Capybara::Selenium::Driver.new.
    def build_driver_options_from_config
      settings = headless_chromium_settings
      driver_options = { browser: settings[:browser] || :chrome }
      if settings[:chromeOptions] || @current_user_agent
        driver_options[:desired_capabilities] =
          Selenium::WebDriver::Remote::Capabilities.chrome(
            :chromeOptions => merge_user_agent_with_chrome_options
          )
      end
      driver_options[:args] = settings[:args] if settings[:args]
      driver_options
    end

    # Returns the configured chromeOptions with any existing user-agent
    # argument replaced by the current user agent.
    #
    # Works on copies, so the Hash and args Array stored in the
    # configuration are never modified. The user agent is only injected
    # when the options carry an args list (under :args or 'args').
    def merge_user_agent_with_chrome_options
      configured = headless_chromium_settings[:chromeOptions]
      chrome_options = configured ? configured.dup : { args: [] }
      args_key = chrome_options[:args] ? :args : 'args'
      if @current_user_agent && chrome_options[args_key]
        other_args =
          chrome_options[args_key].reject { |arg| arg.include?("user-agent=") }
        chrome_options[args_key] =
          other_args + ["--user-agent=#{@current_user_agent}"]
      end
      chrome_options
    end
  end
end
@@ -0,0 +1,76 @@
1
+ require 'capybara'
2
+ require 'capybara/poltergeist'
3
+ require 'phantomjs'
4
+
5
module Scraypa
  include Capybara::DSL

  # Visits pages through Capybara using the :poltergeist or
  # :poltergeist_billy driver, optionally wrapped in a tor proxy.
  class VisitCapybaraPoltergeist < VisitInterface
    # params - Hash with:
    #   :config               - configuration object (reads driver,
    #                           driver_options, tor and tor_options).
    #   :tor_proxy            - optional object whose proxy method takes a
    #                           block.
    #   :driver_resetter      - optional object responding to
    #                           reset_if_nth_request.
    #   :user_agent_retriever - optional object responding to user_agent.
    #
    # Raises CapybaraDriverUnsupported for any other configured driver.
    def initialize(params = {})
      super(params)
      @config = params[:config]
      @tor_proxy = params[:tor_proxy]
      @driver_resetter = params[:driver_resetter]
      @user_agent_retriever = params[:user_agent_retriever]
      setup_driver
      @current_user_agent = nil
    end

    # Visits params[:url], through tor when it is both enabled in the
    # configuration and a proxy was supplied. Returns Capybara.page.
    def execute(params = {})
      if @config.tor && @tor_proxy
        visit_get_response_through_tor(params)
      else
        visit_get_response(params)
      end
    end

    private

    # Performs the visit inside the tor proxy block and returns the page.
    #
    # The result is captured in a local instead of using `return` inside
    # the block: a non-local return would unwind through proxy and skip
    # whatever it does after yielding (outside of ensure clauses).
    def visit_get_response_through_tor(params = {})
      response = nil
      @tor_proxy.proxy do
        response = visit_get_response(params)
      end
      response
    end

    def visit_get_response(params = {})
      update_user_agent_if_changed
      Capybara.visit params[:url]
      @driver_resetter.reset_if_nth_request if @driver_resetter
      Capybara.page
    end

    # Asks the retriever (if any) for a user agent and, when it differs
    # from the one in use, sends it as the User-Agent header from now on.
    def update_user_agent_if_changed
      return unless @user_agent_retriever

      new_user_agent = @user_agent_retriever.user_agent
      return if @current_user_agent == new_user_agent

      @current_user_agent = new_user_agent
      Capybara.page.driver.add_headers(
        "User-Agent" => @current_user_agent
      )
    end

    def setup_driver
      case @config.driver
      when :poltergeist
        setup_poltergeist_driver
      when :poltergeist_billy
        setup_billy_driver
      else
        raise CapybaraDriverUnsupported,
              "Currently no support for capybara driver: #{@config.driver}"
      end
    end

    # Registers a poltergeist driver and makes it the default. The driver
    # name gets a "tor<port>" suffix when tor is enabled.
    def setup_poltergeist_driver
      tor_suffix = @config.tor ? "tor#{@config.tor_options[:tor_port]}" : ''
      driver_name = "#{@config.driver}#{tor_suffix}".to_sym
      Capybara.default_driver = driver_name
      Capybara.register_driver(driver_name) do |app|
        Capybara::Poltergeist::Driver.new(app, @config.driver_options || {})
      end
    end

    def setup_billy_driver
      Capybara.javascript_driver = @config.driver
    end
  end
end