scraypa 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
module Scraypa
  # Plain, mutable settings holder.
  #
  # Every option is exposed through a reader and a writer. All of them start
  # out as nil, except reset_driver_every_n_requests which starts at 5.
  class Configuration
    # Options that are nil until explicitly assigned.
    attr_accessor :use_capybara, :tor, :tor_options, :user_agent
    attr_accessor :driver, :driver_options, :eye_tor_config_template
    attr_accessor :throttle_seconds, :headless_chromium

    # The only option with a non-nil starting value (5).
    attr_accessor :reset_driver_every_n_requests

    # Creates a configuration holding the default values described above.
    def initialize
      @use_capybara = @tor = @tor_options = @user_agent = nil
      @driver = @driver_options = @eye_tor_config_template = nil
      @throttle_seconds = @headless_chromium = nil
      @reset_driver_every_n_requests = 5
    end
  end
end
@@ -0,0 +1,37 @@
module Scraypa
  include Capybara::DSL

  # Counts requests and resets the active Capybara driver every N-th request.
  class DriverResetter
    # Number of requests counted since the driver was last reset.
    attr_accessor :requests_since_last_reset

    # @param every_n_requests [Integer, nil] reset interval. A value of nil
    #   disables resetting (previously it raised on the first comparison).
    def initialize(every_n_requests)
      @every_n_requests = every_n_requests
      @requests_since_last_reset = 0
    end

    # Registers one more request and resets the driver once the configured
    # interval has been reached.
    #
    # @return [Integer, nil] 0 when a reset was performed, nil otherwise
    def reset_if_nth_request
      @requests_since_last_reset += 1
      return unless reset_due?

      if poltergeist_driver?
        reset_poltergeist_driver
      else
        reset_headless_chromium_driver
      end
      @requests_since_last_reset = 0
    end

    private

    # True when an interval is configured and has been reached.
    def reset_due?
      !@every_n_requests.nil? &&
        @requests_since_last_reset >= @every_n_requests
    end

    # True when the current Capybara driver is the plain poltergeist driver
    # or its tor-suffixed variant ("poltergeisttor<port>"). Comparing against
    # :poltergeist alone missed the tor variant, which then took the
    # headless-chromium path and was never restarted. Other names that merely
    # contain "poltergeist" (e.g. :poltergeist_billy) keep their previous
    # handling.
    def poltergeist_driver?
      driver_name = Capybara.current_driver.to_s
      driver_name == 'poltergeist' || driver_name.start_with?('poltergeisttor')
    end

    # Resets all Capybara sessions, then restarts the driver of every pooled
    # session whose name contains 'poltergeist'.
    # session_pool is reached through `send`, as in the original code.
    def reset_poltergeist_driver
      Capybara.reset_sessions!
      Capybara.send(:session_pool).each do |session_name, session|
        session.driver.restart if session_name.include?('poltergeist')
      end
    end

    # Resets all Capybara sessions, then quits the driver of every pooled
    # session whose name contains 'headless_chromium'.
    def reset_headless_chromium_driver
      Capybara.reset_sessions!
      Capybara.send(:session_pool).each do |session_name, session|
        session.driver.quit if session_name.include?('headless_chromium')
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'rest-client'
2
+
module Scraypa
  # Spaces requests out by sleeping until a minimum delay has elapsed since
  # the last request.
  class Throttle
    # Time of the previous request; set by the caller. Throttling is skipped
    # while this is nil.
    attr_accessor :last_request_time

    # The configured delay: a number of seconds, a Hash with :from and :to
    # bounds for a random delay, or nil for no throttling.
    attr_reader :seconds

    # @param params [Hash] reads :seconds (defaults to nil)
    def initialize(params = {})
      @seconds = params.fetch(:seconds, nil)
    end

    # Sleeps for whatever part of the configured delay has not yet elapsed
    # since last_request_time. Does nothing when no delay is configured or no
    # previous request time is known.
    #
    # @return [Integer, nil] the value returned by sleep, or nil when no sleep
    #   was needed
    def throttle
      return unless @seconds && @last_request_time

      sleep_from_last_request_time_for(delay_for_this_request)
    end

    private

    # Resolves the configured delay to a concrete number of seconds.
    def delay_for_this_request
      return @seconds unless @seconds.is_a?(Hash)

      random_delay_between(@seconds[:from], @seconds[:to])
    end

    # Picks a random delay within the two bounds. The bounds are ordered
    # first so that a reversed pair (from > to) no longer produces an empty
    # range and a crash.
    def random_delay_between(from, to)
      low, high = [from, to].minmax
      Random.rand(low..high)
    end

    # Sleeps for +seconds+ minus the time already elapsed since the last
    # request; sleeps the full amount when no last request time is known.
    def sleep_from_last_request_time_for(seconds)
      elapsed = @last_request_time ? Time.now - @last_request_time : 0
      remaining = seconds - elapsed
      sleep(remaining) if remaining > 0
    end
  end
end
@@ -0,0 +1,15 @@
module Scraypa
  # Base class for user agent providers. Both query methods raise
  # NotImplementedError here, so subclasses are expected to override them.
  class UserAgentAbstract
    # Accepts any arguments and ignores them.
    def initialize(*_args); end

    # @raise [NotImplementedError] always, in this base class
    def user_agent
      raise NotImplementedError.new('user_agent action not implemented.')
    end

    # @raise [NotImplementedError] always, in this base class
    def list
      raise NotImplementedError.new('list action not implemented')
    end
  end
end
@@ -0,0 +1,27 @@
module Scraypa
  # Desktop browser user agent strings, keyed by a short descriptive alias.
  # Frozen so the shared constant cannot be modified by accident.
  USER_AGENT_LIST = {
    'Linux Firefox' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:43.0) Gecko/20100101 Firefox/43.0',
    'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
    'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
    'Mac Firefox' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:43.0) Gecko/20100101 Firefox/43.0',
    'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
    'Mac Safari 4' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; de-at) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
    'Mac Safari' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9',
    'Windows Chrome' => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.125 Safari/537.36',
    'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Windows IE 8' => 'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Windows IE 9' => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Windows IE 10' => 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
    'Windows IE 11' => 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Windows Edge' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586',
    'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
    'Windows Firefox' => 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
  }.freeze

  # Mobile device user agent strings, keyed by device alias. Frozen for the
  # same reason as USER_AGENT_LIST.
  USER_AGENT_MOBILE_LIST = {
    'iPhone' => 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B5110e Safari/601.1',
    'iPad' => 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
    'Android' => 'Mozilla/5.0 (Linux; Android 5.1.1; Nexus 7 Build/LMY47V) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.76 Safari/537.36'
  }.freeze
end
@@ -0,0 +1,19 @@
module Scraypa
  # Chooses and instantiates a user agent provider.
  class UserAgentFactory
    # Builds the provider selected by the :method entry of the first argument.
    #
    # The first argument, when present, is indexed with :method. The value
    # :randomizer selects UserAgentRandom; anything else, including no
    # argument at all, selects UserAgentIterator. All arguments are forwarded
    # unchanged to the chosen class.
    #
    # Example first argument:
    #   { method: :randomizer, strategy: :randomize, change_after_n_requests: 2 }
    def self.build(*args)
      options = args.first
      requested_method = options && options[:method]
      provider = :randomizer == requested_method ? UserAgentRandom : UserAgentIterator
      provider.new(*args)
    end
  end
end
@@ -0,0 +1,97 @@
module Scraypa
  # Hands out user agents from a fixed list, either in list order
  # (round robin, the default) or at random, switching to a new one after a
  # configurable number of requests.
  class UserAgentIterator < UserAgentAbstract
    # The user agent handed out most recently (nil before the first call).
    attr_reader :current_user_agent

    # @param args [Array] the first element, when given, is a Hash reading:
    #   :change_after_n_requests (default 0), :list_limit (default 0, meaning
    #   no limit), :strategy (default :roundrobin; :randomize selects random
    #   picking) and :list (Array, Hash whose values are used, or a single
    #   value; defaults to USER_AGENT_LIST).
    def initialize(*args)
      super(*args)
      @config = args[0] || {}
      @change_after_n_requests = @config.fetch(:change_after_n_requests, 0)
      @list_limit = @config.fetch(:list_limit, 0).to_i
      @strategy = @config.fetch(:strategy, :roundrobin)
      @list = limit_list(to_array(@config.fetch(:list, USER_AGENT_LIST)))
      @reducing_list = @list.clone
      @current_user_agent = nil
      @current_user_agent_requests = 0
    end

    # Returns the user agent for the next request, selecting a new one when
    # none has been chosen yet or the current one has served
    # change_after_n_requests requests.
    def user_agent
      if get_a_new_user_agent?
        @current_user_agent_requests = 0
        select_user_agent_using_strategy
      else
        @current_user_agent_requests += 1
        @current_user_agent
      end
    end

    # @return [Array] the (possibly limited) list user agents are drawn from
    def list
      @list
    end

    private

    # Normalises the configured list: Arrays pass through, a Hash contributes
    # its values, anything else becomes a one-element Array.
    def to_array(variable)
      case variable
      when Array then variable
      when Hash then variable.values
      else [variable]
      end
    end

    # Applies list_limit. A limit of zero or less, or one that is not smaller
    # than the list, leaves the list untouched.
    def limit_list(list)
      return list if @list_limit <= 0 || @list_limit >= list.length
      return limit_list_randomly(list) if @strategy == :randomize

      list.first(@list_limit)
    end

    # Picks up to list_limit distinct entries at random.
    #
    # Works on a de-duplicated copy: the previous implementation deleted the
    # picked entries from the caller's own array and never terminated when
    # the list held fewer distinct values than the limit.
    def limit_list_randomly(list)
      list.uniq.sample(@list_limit)
    end

    def get_a_new_user_agent?
      !@current_user_agent ||
        @current_user_agent_requests >= @change_after_n_requests
    end

    def select_user_agent_using_strategy
      if @strategy == :randomize
        random_user_agent_from_list
      else
        next_user_agent_from_list
      end
    end

    def random_user_agent_from_list
      @current_user_agent_requests += 1
      @current_user_agent = ensure_a_new_random_user_agent
    end

    # Random entry that differs from the current user agent whenever the list
    # offers one; otherwise any entry (nil for an empty list). Filtering
    # first replaces a retry loop that never ended for an empty list or a
    # list made up of identical entries.
    def ensure_a_new_random_user_agent
      candidates = @list.reject { |entry| entry == @current_user_agent }
      (candidates.empty? ? @list : candidates).sample
    end

    # Takes the next entry in list order, starting over from a fresh copy of
    # the list once every entry has been handed out.
    def next_user_agent_from_list
      @reducing_list = @list.clone if @reducing_list.empty?
      @current_user_agent_requests += 1
      @current_user_agent = @reducing_list.shift
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require 'useragents'
2
+
module Scraypa
  # Hands out user agents produced by UserAgents.rand, optionally restricted
  # to a fixed-size list drawn once at construction, switching to a new one
  # after a configurable number of requests.
  class UserAgentRandom < UserAgentAbstract
    # The user agent handed out most recently (nil before the first call).
    attr_reader :current_user_agent

    # @param args [Array] the first element, when given, is a Hash reading
    #   :change_after_n_requests (default 0) and :list_limit (default 0; a
    #   positive value pre-draws that many distinct user agents).
    def initialize(*args)
      super(*args)
      @config = args[0] || {}
      @change_after_n_requests = @config.fetch(:change_after_n_requests, 0)
      @list_limit = @config.fetch(:list_limit, 0).to_i
      random_limited_list if @list_limit > 0
      @current_user_agent = nil
      @current_user_agent_requests = 0
    end

    # Returns the user agent for the next request, selecting a new one when
    # none has been chosen yet or the current one has served
    # change_after_n_requests requests. New agents come from the pre-drawn
    # list when one exists, otherwise straight from UserAgents.rand.
    def user_agent
      unless get_a_new_user_agent?
        @current_user_agent_requests += 1
        return @current_user_agent
      end

      @current_user_agent_requests = 0
      @list ? next_user_agent_from_list : select_user_agent_using_randomizer
    end

    # The pre-drawn list of user agents, matching UserAgentIterator#list.
    # Without this override the inherited implementation raised
    # NotImplementedError.
    #
    # @return [Array] the limited list, or an empty Array when no list_limit
    #   was configured (agents are then drawn directly from UserAgents.rand)
    def list
      @list || []
    end

    private

    # Draws list_limit distinct user agents from UserAgents.rand.
    # NOTE(review): the loop only ends once list_limit distinct values have
    # been seen, so it cannot terminate if list_limit exceeds the number of
    # distinct values UserAgents.rand can return.
    def random_limited_list
      @list = []
      loop do
        random_ua = UserAgents.rand
        @list << random_ua unless @list.include?(random_ua)
        break if @list.length >= @list_limit
      end
      @reducing_list = @list.clone
    end

    def get_a_new_user_agent?
      !@current_user_agent ||
        @current_user_agent_requests >= @change_after_n_requests
    end

    def select_user_agent_using_randomizer
      @current_user_agent_requests += 1
      @current_user_agent = ensure_a_new_random_user_agent
    end

    # Keeps drawing from UserAgents.rand until the result differs from the
    # current user agent.
    def ensure_a_new_random_user_agent
      random_user_agent = nil
      loop do
        random_user_agent = UserAgents.rand
        break unless random_user_agent == @current_user_agent
      end
      random_user_agent
    end

    # Takes the next entry of the pre-drawn list in order, starting over from
    # a fresh copy once every entry has been handed out.
    def next_user_agent_from_list
      @reducing_list = @list.clone if @reducing_list.empty?
      @current_user_agent_requests += 1
      @current_user_agent = @reducing_list.shift
    end
  end
end
@@ -0,0 +1,3 @@
module Scraypa
  # Released gem version. Frozen so the shared constant string cannot be
  # modified in place.
  VERSION = '0.1.1'.freeze
end
@@ -0,0 +1,120 @@
module Scraypa
  include Capybara::DSL

  # Visits pages through Capybara using a Selenium-driven Chrome/Chromium
  # browser, registering one Capybara driver per distinct user agent.
  class VisitCapybaraHeadlessChromium < VisitInterface
    # @param params [Hash] reads :config (object responding to driver, tor,
    #   tor_options and headless_chromium), :driver_resetter and
    #   :user_agent_retriever (both optional)
    # @raise [CapybaraDriverUnsupported] when config.driver is neither
    #   :headless_chromium nor :selenium_chrome_billy
    def initialize(params = {})
      super(params)
      @config = params[:config]
      @driver_resetter = params[:driver_resetter]
      @user_agent_retriever = params[:user_agent_retriever]
      reset_and_setup_driver
    end

    # Visits params[:url] and returns the resulting Capybara page.
    def execute(params = {})
      visit_get_response(params)
    end

    private

    # The user agent is refreshed from the second visit onwards; the first
    # visit relies on the setup done at construction.
    def visit_get_response(params = {})
      update_user_agent_if_changed if @has_visited
      @has_visited = true
      Capybara.visit(params[:url])
      @driver_resetter.reset_if_nth_request if @driver_resetter
      Capybara.page
    end

    # Asks the retriever (when present) for a user agent and switches driver
    # when it differs from the one in use.
    def update_user_agent_if_changed
      return unless @user_agent_retriever

      new_user_agent = @user_agent_retriever.user_agent
      return if @current_user_agent == new_user_agent

      update_user_agent_and_setup_driver(new_user_agent)
    end

    # NOTE(review): @user_agents is only initialised by
    # reset_headless_chromium_drivers, i.e. for the :headless_chromium
    # driver — confirm this path is never reached with :selenium_chrome_billy.
    def update_user_agent_and_setup_driver(new_user_agent)
      @current_user_agent = new_user_agent
      @user_agents << @current_user_agent unless
        @user_agents.include?(@current_user_agent)
      setup_headless_chromium_driver
    end

    def reset_and_setup_driver
      case @config.driver
      when :headless_chromium
        reset_headless_chromium_drivers
        update_user_agent_if_changed
        setup_headless_chromium_driver
      when :selenium_chrome_billy
        setup_billy_driver
      else
        raise CapybaraDriverUnsupported,
              "Currently no support for capybara driver: #{@config.driver}"
      end
    end

    # Clears the session pool, unregisters every driver except the two billy
    # drivers, and forgets all known user agents.
    def reset_headless_chromium_drivers
      clear_capybara_session_pool
      Capybara.drivers.delete_if do |name, _driver|
        ![:poltergeist_billy, :selenium_chrome_billy].include?(name)
      end
      @current_user_agent = nil
      @user_agents = []
    end

    # Resets all sessions, quits the driver of every pooled session whose
    # name contains 'headless_chromium', then empties the pool.
    def clear_capybara_session_pool
      Capybara.reset_sessions!
      Capybara.send(:session_pool).each do |session_name, session|
        session.driver.quit if session_name.include?('headless_chromium')
      end
      Capybara.send(:session_pool).delete_if { true }
    end

    def setup_billy_driver
      Capybara.javascript_driver = @config.driver
    end

    # Registers the driver for the current configuration/user agent unless a
    # driver of that name already exists, then makes it the default driver.
    def setup_headless_chromium_driver
      driver_name = driver_name_from_config
      unless Capybara.drivers.keys.include?(driver_name)
        Capybara.register_driver(driver_name) do |app|
          Capybara::Selenium::Driver.new(app, build_driver_options_from_config)
        end
      end
      Capybara.default_driver = driver_name
    end

    # Driver name: the configured driver, plus "tor<port>" when tor is on,
    # plus "ua<index>" (index into the known user agents) when a user agent
    # is active.
    def driver_name_from_config
      name = @config.driver.to_s
      name += "tor#{@config.tor_options[:tor_port]}" if @config.tor
      name += "ua#{@user_agents.index(@current_user_agent)}" if @current_user_agent
      name.to_sym
    end

    # Builds the options Hash passed to Capybara::Selenium::Driver.new.
    # A missing headless_chromium configuration is treated as empty instead
    # of raising on nil.
    def build_driver_options_from_config
      chromium = @config.headless_chromium || {}
      driver_options = { browser: chromium[:browser] || :chrome }
      if chromium[:chromeOptions] || @current_user_agent
        driver_options[:desired_capabilities] =
          Selenium::WebDriver::Remote::Capabilities.chrome(
            chromeOptions: merge_user_agent_with_chrome_options
          )
      end
      driver_options[:args] = chromium[:args] if chromium[:args]
      driver_options
    end

    # Returns the configured chromeOptions with a --user-agent argument for
    # the current user agent, replacing any configured user-agent argument.
    #
    # Works on a copy so the caller's configuration is left untouched, and
    # creates the args entry when the configured options have none (before,
    # the user agent was silently dropped in that case).
    def merge_user_agent_with_chrome_options
      configured = (@config.headless_chromium || {})[:chromeOptions]
      chrome_options = configured ? configured.dup : { args: [] }
      return chrome_options unless @current_user_agent

      # Keep using a string 'args' key when that is what the config provides.
      args_key = !chrome_options[:args] && chrome_options['args'] ? 'args' : :args
      kept_args = Array(chrome_options[args_key]).reject do |arg|
        arg.include?("user-agent=")
      end
      chrome_options[args_key] = kept_args << "--user-agent=#{@current_user_agent}"
      chrome_options
    end
  end
end
@@ -0,0 +1,76 @@
1
+ require 'capybara'
2
+ require 'capybara/poltergeist'
3
+ require 'phantomjs'
4
+
module Scraypa
  include Capybara::DSL

  # Visits pages through Capybara using the Poltergeist driver, optionally
  # routing each visit through a tor proxy.
  class VisitCapybaraPoltergeist < VisitInterface
    # @param params [Hash] reads :config (object responding to driver, tor,
    #   tor_options and driver_options), :tor_proxy, :driver_resetter and
    #   :user_agent_retriever (the last three optional)
    # @raise [CapybaraDriverUnsupported] when config.driver is neither
    #   :poltergeist nor :poltergeist_billy
    def initialize(params = {})
      super(params)
      @config = params[:config]
      @tor_proxy = params[:tor_proxy]
      @driver_resetter = params[:driver_resetter]
      @user_agent_retriever = params[:user_agent_retriever]
      setup_driver
      @current_user_agent = nil
    end

    # Visits params[:url] and returns the resulting Capybara page. The visit
    # goes through the tor proxy when tor is enabled in the configuration
    # and a proxy object was supplied.
    def execute(params = {})
      if @config.tor && @tor_proxy
        visit_get_response_through_tor(params)
      else
        visit_get_response(params)
      end
    end

    private

    # Runs the visit inside the proxy block and returns its result.
    #
    # The result is captured in a local instead of using `return` inside the
    # block: a non-local return unwinds straight out of `proxy`, skipping any
    # code it runs after yielding that is not inside an ensure clause.
    def visit_get_response_through_tor(params = {})
      response = nil
      @tor_proxy.proxy do
        response = visit_get_response(params)
      end
      response
    end

    def visit_get_response(params = {})
      update_user_agent_if_changed
      Capybara.visit(params[:url])
      @driver_resetter.reset_if_nth_request if @driver_resetter
      Capybara.page
    end

    # Asks the retriever (when present) for a user agent and, when it differs
    # from the one in use, sends it as the User-Agent header from now on.
    def update_user_agent_if_changed
      return unless @user_agent_retriever

      new_user_agent = @user_agent_retriever.user_agent
      return if @current_user_agent == new_user_agent

      @current_user_agent = new_user_agent
      Capybara.page.driver.add_headers("User-Agent" => @current_user_agent)
    end

    def setup_driver
      case @config.driver
      when :poltergeist
        setup_poltergeist_driver
      when :poltergeist_billy
        setup_billy_driver
      else
        raise CapybaraDriverUnsupported,
              "Currently no support for capybara driver: #{@config.driver}"
      end
    end

    # Registers a Poltergeist driver named after the configured driver, with
    # a "tor<port>" suffix when tor is on, and makes it the default driver.
    def setup_poltergeist_driver
      tor_suffix = @config.tor ? "tor#{@config.tor_options[:tor_port]}" : ''
      driver_name = "#{@config.driver}#{tor_suffix}".to_sym
      Capybara.default_driver = driver_name
      Capybara.register_driver(driver_name) do |app|
        Capybara::Poltergeist::Driver.new(app, @config.driver_options || {})
      end
    end

    def setup_billy_driver
      Capybara.javascript_driver = @config.driver
    end
  end
end