scruber 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +5 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +39 -0
  8. data/Rakefile +6 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/scruber +4 -0
  12. data/lib/scruber/app_searcher.rb +31 -0
  13. data/lib/scruber/cli/project_generator.rb +47 -0
  14. data/lib/scruber/cli/templates/Gemfile.tt +6 -0
  15. data/lib/scruber/cli/templates/application.tt +18 -0
  16. data/lib/scruber/cli/templates/bin/scruber.tt +6 -0
  17. data/lib/scruber/cli/templates/boot.tt +3 -0
  18. data/lib/scruber/cli/templates/gitignore.tt +12 -0
  19. data/lib/scruber/cli/templates/initializers/proxies.tt +10 -0
  20. data/lib/scruber/cli/templates/initializers/user_agents.tt +14 -0
  21. data/lib/scruber/cli/templates/scrapers/sample.tt +7 -0
  22. data/lib/scruber/cli.rb +40 -0
  23. data/lib/scruber/core/configuration.rb +30 -0
  24. data/lib/scruber/core/crawler.rb +92 -0
  25. data/lib/scruber/core/extensions/base.rb +26 -0
  26. data/lib/scruber/core/extensions/csv_output.rb +62 -0
  27. data/lib/scruber/core/extensions/loop.rb +39 -0
  28. data/lib/scruber/core/page_format/base.rb +11 -0
  29. data/lib/scruber/core/page_format/html.rb +13 -0
  30. data/lib/scruber/core/page_format/xml.rb +13 -0
  31. data/lib/scruber/core/page_format.rb +33 -0
  32. data/lib/scruber/fetcher.rb +34 -0
  33. data/lib/scruber/fetcher_adapters/abstract_adapter.rb +119 -0
  34. data/lib/scruber/fetcher_adapters/typhoeus_fetcher.rb +78 -0
  35. data/lib/scruber/helpers/dictionary_reader/csv.rb +27 -0
  36. data/lib/scruber/helpers/dictionary_reader/xml.rb +23 -0
  37. data/lib/scruber/helpers/dictionary_reader.rb +33 -0
  38. data/lib/scruber/helpers/fetcher_agent.rb +40 -0
  39. data/lib/scruber/helpers/fetcher_agent_adapters/abstract_adapter.rb +69 -0
  40. data/lib/scruber/helpers/fetcher_agent_adapters/memory.rb +41 -0
  41. data/lib/scruber/helpers/proxy_rotator.rb +125 -0
  42. data/lib/scruber/helpers/user_agent_rotator.rb +91 -0
  43. data/lib/scruber/queue.rb +34 -0
  44. data/lib/scruber/queue_adapters/abstract_adapter.rb +112 -0
  45. data/lib/scruber/queue_adapters/memory.rb +70 -0
  46. data/lib/scruber/version.rb +3 -0
  47. data/lib/scruber.rb +69 -0
  48. data/scruber.gemspec +43 -0
  49. metadata +233 -0
data/lib/scruber/core/page_format.rb
@@ -0,0 +1,33 @@
+ module Scruber
+   module Core
+     module PageFormat
+       class << self
+         def process(page, page_format)
+           if page_format.nil?
+             nil
+           elsif _registered_formats.keys.include?(page_format.to_sym)
+             _registered_formats[page_format.to_sym].process(page)
+           else
+             raise "Unsupported format"
+           end
+         end
+
+         def add(label, claz)
+           unless claz.respond_to?(:process)
+             raise NoMethodError, "process is not declared in the #{claz.inspect}"
+           end
+
+           _registered_formats[label] = claz
+         end
+
+         def [](label)
+           _registered_formats[label]
+         end
+
+         def _registered_formats
+           @registered_formats ||= {}
+         end
+       end
+     end
+   end
+ end
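
The registry above dispatches on the format label and only requires that the registered class respond to a class-level process method. A minimal sketch of plugging in a custom format; the :json label and JsonFormat class are hypothetical, mirroring the bundled html/xml formats:

require 'json'

class JsonFormat
  # PageFormat.add only checks respond_to?(:process), so a class method suffices.
  def self.process(page)
    JSON.parse(page.response_body)
  end
end

Scruber::Core::PageFormat.add(:json, JsonFormat)
# Scruber::Core::PageFormat.process(page, :json) # => parsed Hash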
data/lib/scruber/fetcher.rb
@@ -0,0 +1,34 @@
+ module Scruber
+   module Fetcher
+     class << self
+       attr_writer :adapter
+
+       def new(options={})
+         adapter.new(::Scruber.configuration.fetcher_options.merge(options))
+       end
+
+       def adapter
+         unless @adapter
+           @adapter = ::Scruber.configuration.fetcher_adapter || _adapters.keys.first
+         end
+         raise Scruber::ArgumentError.new("Adapter not found") unless @adapter
+         _adapters[@adapter]
+       end
+
+       def add_adapter(label, claz)
+         unless claz.method_defined?(:run)
+           raise NoMethodError, "run is not declared in the #{label.inspect}"
+         end
+         _adapters[label] = claz
+       end
+
+       def [](label)
+         _adapters[label]
+       end
+
+       def _adapters
+         @_adapters ||= {}
+       end
+     end
+   end
+ end
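
Adapter resolution prefers Scruber.configuration.fetcher_adapter and falls back to the first registered adapter. A sketch of registering a custom adapter, assuming the default Scruber configuration is loaded; NullFetcher is hypothetical, and add_adapter rejects any class that does not define #run:

class NullFetcher < Scruber::FetcherAdapters::AbstractAdapter
  def run(queue)
    # no-op, but satisfies the method_defined?(:run) check
  end
end

Scruber::Fetcher.add_adapter(:null_fetcher, NullFetcher)
Scruber::Fetcher.adapter = :null_fetcher   # via attr_writer :adapter
fetcher = Scruber::Fetcher.new             # NullFetcher instance with merged options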
data/lib/scruber/fetcher_adapters/abstract_adapter.rb
@@ -0,0 +1,119 @@
+ module Scruber
+   module FetcherAdapters
+     class AbstractAdapter
+       attr_accessor :options, # all passed options
+                     :max_concurrency,
+                     :max_retry_times,
+                     :retry_delays,
+                     :followlocation,
+                     :request_timeout
+
+       def initialize(options={})
+         @options = options
+         @max_concurrency = options.fetch(:max_concurrency) { 1 }
+         @max_retry_times = options.fetch(:max_retry_times) { 5 }
+         @retry_delays = options.fetch(:retry_delays) { [1,2,2,4,4] }
+         @followlocation = options.fetch(:followlocation) { false }
+         @request_timeout = options.fetch(:request_timeout) { 15 }
+       end
+
+       def run(queue)
+         raise NotImplementedError
+       end
+
+       def before_request_callback(page)
+         page
+       end
+
+       def after_request_callback(page)
+         if bad_response?(page)
+           page.retry_at = determine_retry_at(page)
+           page.retry_count += 1
+           if page.max_retry_times.nil?
+             page.max_retry_times = @max_retry_times
+           end
+         else
+           page.fetched_at = Time.now.to_i
+         end
+         page
+       end
+
+       def headers_for(page)
+         if page.fetcher_agent
+           headers = page.fetcher_agent.headers
+         else
+           headers = page.headers
+         end
+         headers = {} unless headers.is_a?(Hash)
+         headers["User-Agent"] = user_agent_for(page)
+         cookie = cookie_for(page)
+         if cookie
+           headers["Cookie"] = cookie
+         end
+         headers
+       end
+
+       def cookie_for(page)
+         if page.fetcher_agent
+           cookie = page.fetcher_agent.cookie_for(page.url)
+           cookie.blank? ? nil : cookie
+         else
+           nil
+         end
+       end
+
+       def user_agent_for(page)
+         if page.user_agent
+           page.user_agent
+         elsif page.fetcher_agent && page.fetcher_agent.user_agent
+           page.fetcher_agent.user_agent
+         else
+           Scruber::Helpers::UserAgentRotator.next
+         end
+       end
+
+       def proxy_for(page)
+         if page.proxy
+           page.proxy
+         elsif page.fetcher_agent && page.fetcher_agent.proxy
+           page.fetcher_agent.proxy
+         else
+           Scruber::Helpers::ProxyRotator.next
+         end
+       end
+
+       def determine_retry_at(page)
+         delay = @retry_delays[page.retry_count] || @retry_delays.last
+         Time.now.to_i + delay
+       end
+
+       def bad_response?(page)
+         case page.response_code
+         when 0
+           true
+         when 1
+           true
+         when 100..199
+           true
+         when 200
+           false
+         when 201..299
+           false
+         when 300..399
+           @options.fetch(:followlocation) { false }
+         when 404
+           false
+         when 407
+           raise "RejectedByProxyError"
+         when 400..499
+           true
+         when 500..599
+           true
+         else
+           true
+         end
+       end
+
+     end
+   end
+ end
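
A worked example of the retry schedule under the defaults (retry_delays = [1,2,2,4,4]): the delay for a page is retry_delays[retry_count], falling back to the last entry once the list is exhausted. FakePage below is a stand-in for the real page objects provided by the queue adapters:

adapter = Scruber::FetcherAdapters::AbstractAdapter.new
FakePage = Struct.new(:retry_count)

adapter.determine_retry_at(FakePage.new(0)) # => Time.now.to_i + 1
adapter.determine_retry_at(FakePage.new(3)) # => Time.now.to_i + 4
adapter.determine_retry_at(FakePage.new(9)) # => Time.now.to_i + 4 (past the list, last delay)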
data/lib/scruber/fetcher_adapters/typhoeus_fetcher.rb
@@ -0,0 +1,78 @@
+ require 'typhoeus'
+ module Scruber
+   module FetcherAdapters
+     class TyphoeusFetcher < AbstractAdapter
+       attr_accessor :ssl_verifypeer,
+                     :ssl_verifyhost
+
+       def initialize(options={})
+         super(options)
+         @ssl_verifypeer = options.fetch(:ssl_verifypeer) { false }
+         @ssl_verifyhost = options.fetch(:ssl_verifyhost) { 0 }
+         @max_requests = options.fetch(:max_requests) { @max_concurrency * 10 }
+       end
+
+       def run(queue)
+         queue.fetch_pending(@max_requests).each do |page|
+           request = build_request(page)
+
+           hydra.queue(request)
+         end
+         if hydra.queued_requests.count > 0
+           hydra.run
+         else
+           sleep 1
+         end
+       end
+
+       def build_request(page)
+         page = before_request_callback(page)
+         request_options = {
+           method: page[:method],
+           body: page[:body],
+           # params: page[:params],
+           headers: headers_for(page),
+           accept_encoding: 'gzip',
+           forbid_reuse: true,
+           followlocation: page.options.fetch(:followlocation){ @followlocation },
+           ssl_verifypeer: page.options.fetch(:ssl_verifypeer){ @ssl_verifypeer },
+           ssl_verifyhost: page.options.fetch(:ssl_verifyhost){ @ssl_verifyhost },
+           timeout: @request_timeout
+         }
+
+         proxy = proxy_for(page)
+         request_options.merge!({proxy: proxy.http? ? proxy.address : "socks://#{proxy.address}"}) if proxy
+         request_options.merge!({proxyuserpwd: proxy.proxyuserpwd}) if proxy && proxy.proxyuserpwd.present?
+
+         request = Typhoeus::Request.new(page[:url], request_options)
+
+         request.on_complete do |response|
+           on_complete_callback(page, response)
+         end
+
+         request
+       end
+
+       def hydra
+         @hydra ||= Typhoeus::Hydra.new(max_concurrency: @max_concurrency)
+       end
+
+       def on_complete_callback(page, response)
+         page.response_code = response.code
+         page.response_body = response.body
+         page.response_headers = response.response_headers
+         page.response_total_time = response.total_time
+
+         if response.timed_out?
+           page[:response_code] = 1
+         end
+
+         page = after_request_callback(page)
+         page.save
+       end
+
+     end
+   end
+ end
+
+ Scruber::Fetcher.add_adapter(:typhoeus_fetcher, Scruber::FetcherAdapters::TyphoeusFetcher)
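
A sketch of driving the adapter by hand, assuming a queue object that implements fetch_pending(n) and page persistence as the memory queue adapter does (the Scruber::Queue facade appears to mirror the fetcher facade); in a real run the crawler loop calls run repeatedly until the queue drains:

fetcher = Scruber::FetcherAdapters::TyphoeusFetcher.new(max_concurrency: 5)
# max_requests defaults to max_concurrency * 10, so each run pulls up to 50 pages
queue = Scruber::Queue.new
fetcher.run(queue)   # queues Typhoeus requests into hydra and blocks until complete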
data/lib/scruber/helpers/dictionary_reader/csv.rb
@@ -0,0 +1,27 @@
+ module Scruber
+   module Helpers
+     module DictionaryReader
+       class Csv
+         def initialize(file_path)
+           @file_path = file_path
+         end
+
+         def read(options={})
+           col_sep = options.delete(:col_sep) || ';'
+
+           CSV.foreach(@file_path, col_sep: col_sep, headers: true, encoding: 'utf-8') do |csv_row|
+             if options.blank?
+               yield csv_row
+             else
+               if options.all?{|(k,v)| csv_row[k.to_s] == v }
+                 yield csv_row
+               end
+             end
+           end
+         end
+       end
+     end
+   end
+ end
+
+ Scruber::Helpers::DictionaryReader.add(:csv, Scruber::Helpers::DictionaryReader::Csv)
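
Usage sketch with a hypothetical cities.csv (headers on the first line, semicolon-separated). Any option other than :col_sep acts as an equality filter against the named column:

# cities.csv:
#   name;country
#   Berlin;DE
#   Paris;FR
reader = Scruber::Helpers::DictionaryReader::Csv.new('cities.csv')
reader.read(country: 'DE') do |row|
  puts row['name']   # => "Berlin"; only rows with country == "DE" are yielded
end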
data/lib/scruber/helpers/dictionary_reader/xml.rb
@@ -0,0 +1,23 @@
+ module Scruber
+   module Helpers
+     module DictionaryReader
+       class Xml
+         def initialize(file_path)
+           @xml = Nokogiri.parse(File.open(file_path).read)
+         end
+
+         def read(options={})
+           selector = options.delete(:selector) || 'item'
+           options.each do |k,v|
+             selector = "#{selector}[#{k}=\"#{v}\"]"
+           end
+           @xml.search(selector).each do |item|
+             yield item.to_h
+           end
+         end
+       end
+     end
+   end
+ end
+
+ Scruber::Helpers::DictionaryReader.add(:xml, Scruber::Helpers::DictionaryReader::Xml)
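
The XML reader builds a CSS selector from the options, so read(region: 'EU') matches item[region="EU"] nodes. A sketch against a hypothetical regions.xml:

# regions.xml:
#   <items>
#     <item region="EU" name="Berlin"/>
#     <item region="NA" name="Boston"/>
#   </items>
reader = Scruber::Helpers::DictionaryReader::Xml.new('regions.xml')
reader.read(region: 'EU') do |item|
  # yields each matching Nokogiri node converted with #to_h
end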
data/lib/scruber/helpers/dictionary_reader.rb
@@ -0,0 +1,33 @@
+ module Scruber
+   module Helpers
+     module DictionaryReader
+       class << self
+         def read(file_path, file_type, options)
+           if _registered_types.keys.include?(file_type.to_sym)
+             _registered_types[file_type.to_sym].new(file_path).read(options) do |obj|
+               yield obj
+             end
+           else
+             raise "Unsupported type, supported types #{_registered_types.keys}"
+           end
+         end
+
+         def add(label, claz)
+           unless claz.instance_methods.include?(:read)
+             raise NoMethodError, "read is not declared in the #{claz.inspect}"
+           end
+
+           _registered_types[label] = claz
+         end
+
+         def [](label)
+           _registered_types[label]
+         end
+
+         def _registered_types
+           @registered_types ||= {}
+         end
+       end
+     end
+   end
+ end
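
The dispatcher looks up a registered reader by type and forwards the block; note that options has no default here, so an empty hash must be passed explicitly. A minimal sketch using the built-in :csv reader and the hypothetical cities.csv from above:

Scruber::Helpers::DictionaryReader.read('cities.csv', :csv, {}) do |row|
  # row is a CSV::Row; an unregistered type raises "Unsupported type, ..."
end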
data/lib/scruber/helpers/fetcher_agent.rb
@@ -0,0 +1,40 @@
+ module Scruber
+   module Helpers
+     module FetcherAgent
+       class << self
+         attr_writer :adapter
+
+         def new(options={})
+           adapter.new(::Scruber.configuration.fetcher_agent_options.merge(options))
+         end
+
+         def find(id)
+           adapter.find(id)
+         end
+
+         def adapter
+           unless @adapter
+             @adapter = ::Scruber.configuration.fetcher_agent_adapter || _adapters.keys.first
+           end
+           raise Scruber::ArgumentError.new("Adapter not found") unless @adapter
+           _adapters[@adapter]
+         end
+
+         def add_adapter(label, claz)
+           # unless claz.method_defined?(:run)
+           #   raise NoMethodError, "run is not declared in the #{label.inspect}"
+           # end
+           _adapters[label] = claz
+         end
+
+         def [](label)
+           _adapters[label]
+         end
+
+         def _adapters
+           @_adapters ||= {}
+         end
+       end
+     end
+   end
+ end
data/lib/scruber/helpers/fetcher_agent_adapters/abstract_adapter.rb
@@ -0,0 +1,69 @@
+ module Scruber
+   module Helpers
+     module FetcherAgentAdapters
+       class AbstractAdapter
+         attr_accessor :id, :user_agent, :proxy_id, :headers, :cookie_jar, :updated_at, :created_at, :disable_proxy
+         attr_reader :jar
+
+         def initialize(options={})
+           @id = options.fetch(:id) { nil }
+           @user_agent = options.fetch(:user_agent) { nil }
+           @proxy_id = options.fetch(:proxy_id) { nil }
+           @headers = options.fetch(:headers) { {} }
+           @cookie_jar = options.fetch(:cookie_jar) { {} }
+           @disable_proxy = options.fetch(:disable_proxy) { false }
+           @updated_at = options.fetch(:updated_at) { Time.now }
+           @created_at = options.fetch(:created_at) { Time.now }
+           @jar = HTTP::CookieJar.new
+           if @cookie_jar.is_a?(String)
+             @jar.load(StringIO.new(@cookie_jar))
+           end
+           @_proxy = false
+         end
+
+         def proxy
+           if @_proxy == false
+             @_proxy = (@proxy_id ? Scruber::Helpers::ProxyRotator.find(@proxy_id) : nil)
+           else
+             @_proxy
+           end
+         end
+
+         def parse_cookies_from_page!(page)
+           cookies = page.response_cookies
+           cookies.each do |cookie|
+             @jar.parse(cookie, URI(page.url))
+           end
+         end
+
+         def serialize_cookies
+           io = StringIO.new
+           @jar.save(io)
+           @cookie_jar = io.string
+         end
+
+         def cookie_for(uri_or_url)
+           if uri_or_url.is_a?(String)
+             uri_or_url = URI(uri_or_url)
+           end
+           HTTP::Cookie.cookie_value(@jar.cookies(uri_or_url))
+         end
+
+         def save
+           raise NotImplementedError
+         end
+
+         def delete
+           raise NotImplementedError
+         end
+
+         class << self
+           def find(id)
+             raise NotImplementedError
+           end
+         end
+
+       end
+     end
+   end
+ end
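
The cookie round trip relies on the http-cookie gem: response cookies are parsed into an HTTP::CookieJar, and serialize_cookies dumps the jar back to a string for persistence. A sketch using the jar directly (the URL and cookie are illustrative):

agent = Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter.new
agent.jar.parse('session=abc123; Path=/', URI('https://example.com/'))
agent.cookie_for('https://example.com/')   # => "session=abc123"
agent.serialize_cookies                    # stores the jar dump as a String in @cookie_jar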
data/lib/scruber/helpers/fetcher_agent_adapters/memory.rb
@@ -0,0 +1,41 @@
+ module Scruber
+   module Helpers
+     module FetcherAgentAdapters
+       class Memory < AbstractAdapter
+         def initialize(options={})
+           super(options)
+           @id = Time.now.to_i.to_s+'_'+rand(1_000..999_999).to_s if @id.nil?
+         end
+
+         def save
+           Scruber::Helpers::FetcherAgentAdapters::Memory.store(self)
+         end
+
+         def delete
+           Scruber::Helpers::FetcherAgentAdapters::Memory.delete(self)
+         end
+
+         class << self
+           def find(id)
+             _collection[id]
+           end
+
+           def _collection
+             @_collection ||= {}
+           end
+
+           def store(fetcher_agent)
+             _collection[fetcher_agent.id] = fetcher_agent
+           end
+
+           def delete(fetcher_agent)
+             _collection.delete fetcher_agent.id
+           end
+         end
+
+       end
+     end
+   end
+ end
+
+ Scruber::Helpers::FetcherAgent.add_adapter(:memory, Scruber::Helpers::FetcherAgentAdapters::Memory)
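
A persistence round trip through the FetcherAgent facade, assuming the default configuration resolves to this memory adapter (the only one registered here). Auto-generated ids combine a unix timestamp with a random suffix, so collisions are unlikely but not impossible:

agent = Scruber::Helpers::FetcherAgent.new(user_agent: 'TestBot/1.0')
agent.save                                     # stores the agent in the class-level Hash
Scruber::Helpers::FetcherAgent.find(agent.id)  # => the same instance
agent.delete                                   # removes it again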
data/lib/scruber/helpers/proxy_rotator.rb
@@ -0,0 +1,125 @@
+ module Scruber
+   module Helpers
+     class ProxyRotator
+
+       class Proxy
+         attr_accessor :host, :port, :user, :password, :probability, :type
+
+         def initialize(proxy, options={})
+           @host = proxy.split(':', 2).first
+           raise Scruber::ArgumentError.new("You need to specify proxy address") if @host.blank?
+           @port = options.fetch(:port) { proxy.split(':', 2)[1] }.to_i rescue nil
+           raise Scruber::ArgumentError.new("You need to specify :port for this proxy or pass full proxy address like 127.0.0.1:100") if @port.nil? || @port.zero?
+           @type = options.fetch(:type) { 'http' }
+           @user = options.fetch(:user) { nil }
+           @password = options.fetch(:password) { nil }
+           @probability = options.fetch(:probability) { 1 }
+         end
+
+         def id
+           (@host + ':' + @port.to_s)
+         end
+
+         def address
+           @host + ':' + @port.to_s
+         end
+
+         def http?
+           @type == 'http'
+         end
+
+         def proxyuserpwd
+           if @user.blank?
+             nil
+           else
+             "#{@user}:#{@password}"
+           end
+         end
+       end
+
+       class Configuration
+         include Scruber::Core::Extensions::Loop::CoreMethods
+
+         AVAILABLE_MODES=[:random, :round_robin]
+
+         attr_reader :mode, :proxies, :proxy_keys, :pickup
+
+         def initialize
+           @mode = :round_robin
+           @proxies = {}
+           @proxy_keys = []
+           @pickup = nil
+         end
+
+         def configure(&block)
+           instance_eval &block
+           rebuild_caches
+         end
+
+         def clean
+           @proxies = {}
+         end
+
+         def add(proxy_address, options={})
+           proxy = Proxy.new(proxy_address, options)
+           @proxies[proxy.id] = proxy
+         end
+
+         def set_mode(mode)
+           if AVAILABLE_MODES.include?(mode)
+             @mode = mode
+           else
+             raise Scruber::ArgumentError.new("Wrong mode. Available modes: #{AVAILABLE_MODES}")
+           end
+         end
+
+         private
+
+         def rebuild_caches
+           if @mode == :random
+             @pickup = Pickup.new(@proxies.inject({}){ |acc,(k,p)| acc[p] = p.probability; acc })
+           else
+             @proxy_keys = @proxies.keys
+           end
+         end
+       end
+
+       class << self
+         attr_writer :configuration
+         attr_accessor :cursor
+
+         def configuration
+           @configuration ||= Configuration.new
+         end
+
+         def configure(&block)
+           configuration.configure(&block)
+         end
+
+         def next(options={})
+           # raise Scruber::ArgumentError.new("Proxy rotator not configured") if @configuration.nil?
+           return nil if @configuration.nil?
+           if @configuration.mode == :random
+             @configuration.pickup.pick
+           else
+             if @cursor.nil? || @cursor >= @configuration.proxy_keys.count-1
+               @cursor = 0
+             else
+               @cursor += 1
+             end
+             @configuration.proxies[@configuration.proxy_keys[@cursor]]
+           end
+         end
+         alias_method :random, :next
+
+         def find(id)
+           @configuration.proxies[id] rescue nil
+         end
+
+         def configured?
+           !@configuration.nil? && !@configuration.proxies.blank?
+         end
+       end
+     end
+   end
+ end
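
Configuration blocks are instance_eval'd against the Configuration object, so add and set_mode are called bare; :random mode additionally relies on the pickup gem for probability-weighted selection. A sketch with illustrative addresses:

Scruber::Helpers::ProxyRotator.configure do
  set_mode :round_robin
  add '127.0.0.1:3128'
  add '127.0.0.1:3129', type: 'socks', user: 'bot', password: 'secret'
end

Scruber::Helpers::ProxyRotator.next                   # => first proxy; repeated calls cycle in order
Scruber::Helpers::ProxyRotator.find('127.0.0.1:3128') # lookup by the host:port id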