web_crawler 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. data/.gitignore +5 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +11 -0
  4. data/README +1 -0
  5. data/Rakefile +2 -0
  6. data/bin/wcrawler +13 -0
  7. data/lib/ext/array.rb +100 -0
  8. data/lib/ext/hash.rb +45 -0
  9. data/lib/ext/http_response.rb +19 -0
  10. data/lib/web_crawler/application.rb +49 -0
  11. data/lib/web_crawler/batch_request.rb +63 -0
  12. data/lib/web_crawler/cache_adapter/base.rb +33 -0
  13. data/lib/web_crawler/cache_adapter/file.rb +52 -0
  14. data/lib/web_crawler/cache_adapter/memory.rb +23 -0
  15. data/lib/web_crawler/cache_adapter.rb +11 -0
  16. data/lib/web_crawler/cached_request.rb +30 -0
  17. data/lib/web_crawler/cli/thor_hooks.rb +94 -0
  18. data/lib/web_crawler/cli/thor_inherited_options.rb +26 -0
  19. data/lib/web_crawler/cli.rb +122 -0
  20. data/lib/web_crawler/configuration.rb +87 -0
  21. data/lib/web_crawler/factory_url.rb +58 -0
  22. data/lib/web_crawler/follower.rb +26 -0
  23. data/lib/web_crawler/handler.rb +45 -0
  24. data/lib/web_crawler/parsers/url.rb +52 -0
  25. data/lib/web_crawler/parsers.rb +5 -0
  26. data/lib/web_crawler/request.rb +59 -0
  27. data/lib/web_crawler/response.rb +45 -0
  28. data/lib/web_crawler/utility.rb +65 -0
  29. data/lib/web_crawler/version.rb +9 -0
  30. data/lib/web_crawler/view/csv.rb +20 -0
  31. data/lib/web_crawler/view/json.rb +9 -0
  32. data/lib/web_crawler/view/plain.rb +9 -0
  33. data/lib/web_crawler/view/runner.rb +20 -0
  34. data/lib/web_crawler/view/table.rb +69 -0
  35. data/lib/web_crawler/view/xml.rb +38 -0
  36. data/lib/web_crawler/view.rb +44 -0
  37. data/lib/web_crawler.rb +38 -0
  38. data/spec/fake_web_generator.rb +44 -0
  39. data/spec/spec_helper.rb +17 -0
  40. data/spec/web_crawler/batch_request_spec.rb +45 -0
  41. data/spec/web_crawler/cached_request_spec.rb +31 -0
  42. data/spec/web_crawler/factory_url_spec.rb +34 -0
  43. data/spec/web_crawler/follow_spec.rb +32 -0
  44. data/spec/web_crawler/request_spec.rb +29 -0
  45. data/spec/web_crawler/response_spec.rb +27 -0
  46. data/spec/web_crawler/url_parser_spec.rb +41 -0
  47. data/spec/web_crawler/view_spec.rb +95 -0
  48. data/web_crawler.gemspec +30 -0
  49. metadata +151 -0
require 'thor'
require 'thor/actions'
require 'pathname'
require 'web_crawler/cli/thor_hooks'
require 'web_crawler/cli/thor_inherited_options'

module WebCrawler
  # Thor-based command line interface. Adds before/render hooks
  # (Thor::Hooks) and inherited class options (Thor::InheritedOptions)
  # on top of plain Thor.
  class CLI < Thor
    include Thor::Actions
    include Thor::Hooks
    include Thor::InheritedOptions

    default_task :help

    # Global options shared by every task.
    class_option :format, type: :string, desc: "output format [json, xml, csv]", default: 'plain'
    class_option :json, type: :boolean, desc: "json output format. shortcut for --format json"
    class_option :xml, type: :boolean, desc: "xml output format. shortcut for --format xml"
    class_option :csv, type: :boolean, desc: "csv output format. shortcut for --format csv"
    class_option :table, type: :boolean, desc: "table output format. shortcut for --format table"
    class_option :cached, type: :boolean, desc: "use cached requests. if ./tmp/cache exists use it for cache files"
    class_option :follow, type: :boolean, desc: "follow to urls on the pages"
    class_option :run, type: :string, desc: "run custom script with api access"
    class_option :log, type: :string, desc: "log file path"

    # Normalizes the --json/--xml/--csv/--table shortcuts into
    # @options[:format] and configures the logger before every task
    # except :help.
    before_action except: :help do
      @options = options.dup
      @options[:format] = 'json' if options[:json]
      @options[:format] = 'xml' if options[:xml]
      @options[:format] = 'csv' if options[:csv]
      @options[:format] = 'table' if options[:table]
      # NOTE(review): no :plain class_option is declared above, so this
      # branch looks unreachable — confirm against Thor option parsing.
      @options[:format] = 'plain' if options[:plain]

      # With --run, remember the requested format and switch to the
      # runner view, which evaluates the user-supplied script.
      @options[:original_format] = @options[:format] if options[:run]
      @options[:format] = 'runner' if options[:run]


      WebCrawler.config.logger = Logger.new(@options['log']) if @options['log']
      # NOTE(review): level is forced to DEBUG regardless of any
      # configured log level — confirm this is intended.
      WebCrawler.config.logger.level = Logger::DEBUG
      WebCrawler.config.logger.datetime_format = "%d-%m-%Y %H:%M:%S"
      WebCrawler.config.severity_colors = { 'DEBUG' => :magenta,
                                            'INFO' => :green,
                                            'WARN' => :blue,
                                            'ERROR' => :red,
                                            'FATAL' => :red,
                                            'ANY' => :yellow }

      # Colorized one-line log format: "[LEVEL] [timestamp] pid NNN -- msg".
      # NOTE(review): `send(color, ...)` relies on the color helpers below
      # being visible from the formatter proc's self — confirm the hook's
      # evaluation context provides them.
      WebCrawler.config.logger.formatter = proc { |severity, datetime, _, msg|
        color = WebCrawler.config.severity_colors[severity]

        send(color, ("[#{severity}] ").ljust(8)) << "[#{datetime}] " << "pid #{$$} " << "-- #{msg}\n"
      }
    end

    # Draws every task's return value through the view layer (except :help).
    render except: :help do |response, options|
      WebCrawler::View.factory(options[:format], response, options).draw
    end


    # Prints help for a single task, or the global help when no task given.
    def help(task = nil)
      if task
        self.class.task_help(shell, task)
      else
        self.class.help shell
      end
    end

    protected

    # ANSI escape helpers used by the log formatter above.
    def color(text, color_code)
      "#{color_code}#{text}\e[0m"
    end

    def bold(text)
      color(text, "\e[1m")
    end

    def white(text)
      color(text, "\e[37m")
    end

    def green(text)
      color(text, "\e[32m")
    end

    def red(text)
      color(text, "\e[31m")
    end

    def magenta(text)
      color(text, "\e[35m")
    end

    def yellow(text)
      color(text, "\e[33m")
    end

    def blue(text)
      color(text, "\e[34m")
    end

    def grey(text)
      color(text, "\e[90m")
    end

    # NOTE(review): padding widths may have been collapsed by extraction —
    # verify against the released gem source.
    def short_padding
      ' '
    end

    def long_padding
      ' '
    end

    def logger
      WebCrawler.logger
    end

    # @options with symbol keys, memoized.
    def symbolized_options
      @symbolized_options ||= Hash[@options.keys.zip(@options.values)].symbolize_keys
    end

  end
end
require "logger"

module WebCrawler
  # Open-ended option container: unknown getters/setters are routed to a
  # shared options hash via method_missing.
  class BaseConfiguration

    def initialize(options = {})
      # NOTE(review): @@options is a class variable, so ALL BaseConfiguration
      # (and subclass) instances share a single option store — confirm that
      # this sharing is intended.
      @@options ||= {}
      @@options.merge! options
    end

    # Accept the standard two-argument form so respond_to?(:x, true)
    # doesn't raise ArgumentError (the original override took one arg).
    def respond_to?(name, include_private = false)
      super || @@options.key?(name.to_sym)
    end

    def config
      self
    end

    private

    # "name=" writes to the option store, a known name reads from it,
    # anything else falls through to NoMethodError.
    def method_missing(name, *args, &blk)
      if name.to_s =~ /=$/
        @@options[$`.to_sym] = args.first
      elsif @@options.key?(name)
        @@options[name]
      else
        super
      end
    end
  end

  # Concrete configuration: cache adapter selection, logging, cache options.
  class Configuration < BaseConfiguration

    attr_accessor :cache_adapter, :log_level, :logger, :root, :cache

    # Working directory used as the root for cache paths.
    def root
      @root ||= FileUtils.pwd
    end

    # Picks a file-backed cache when ./tmp/cache exists, memory otherwise.
    def cache_adapter
      @cache_adapter ||= begin
        if File.exist?("#{root}/tmp/cache/")
          WebCrawler::CacheAdapter::File.new "#{root}/tmp/cache/"
        else
          WebCrawler::CacheAdapter::Memory.new
        end
      end
    end

    # Without a block, returns the cache sub-configuration (default
    # expire_within: 60). With a block, evaluates it in the sub-config's
    # context so callers can write `cache { self.expire_within = 120 }`.
    def cache(&block)
      @cache ||= BaseConfiguration.new expire_within: 60
      if block_given?
        # BUG FIX: instance_eval(block) passed the Proc as a positional
        # argument (instance_eval expects a String there) and raised
        # TypeError; the block must be passed with &.
        @cache.instance_eval(&block)
      else
        @cache
      end
    end

    # STDOUT logger, level taken from #log_level.
    def logger
      @logger ||= Logger.new(STDOUT).tap do |log|
        log.level = Logger.const_get log_level.to_s.upcase
      end
    end

    def log_level
      @log_level ||= :debug
    end

  end

  # Mix into a module/class to get a `config` accessor and `configure` DSL.
  module Configurable
    def self.included(base)
      base.extend ClassMethods
    end

    module ClassMethods
      def configure(&block)
        module_eval(&block)
      end

      def config
        @config ||= Configuration.new
      end

    end
  end
end
module WebCrawler
  #
  # Generates URL lists from a pattern plus value ranges, either via
  # $1/$2/... placeholders:
  #
  #   p = FactoryUrl.new "http://www.somehost.com/$1/$2?param=$3", 0..10, "a".."z", [3,7,34,876,92]
  #   p.urls #=> ["http://www.somehost.com/1/a?param=3", ...]
  #
  # or via a block receiving one value per argument list:
  #
  #   p = FactoryUrl.new 0..10, "a".."z", [3,7,34,876,92] do |first, second, third|
  #     "http://www.somehost.com/#{first}/#{second}?param=#{third}"
  #   end
  #
  class FactoryUrl
    include Enumerable

    attr_reader :urls, :params, :pattern

    def initialize(*args, &block)
      if block_given?
        @block = block
      else
        @pattern = args.shift
        unless pattern.is_a?(String)
          raise ArgumentError, "first argument must be an url pattern(String)"
        end
      end
      @params = normalize_arguments(args)
    end

    # Builds (and memoizes) the URL list from the cartesian product of the
    # argument lists.
    def factory
      @urls ||= if pattern
                  params.map { |values| pattern.gsub(/\$(\d+)/) { values[$1.to_i - 1] } }
                else
                  params.map { |values| @block.call(*values) }
                end
    end

    # Yields each generated URL; regenerates the list on every call.
    def each
      @urls = nil
      factory.each { |url| yield url }
    end

    protected

    # Flattens a single Enumerable argument, drops a leading String (the
    # pattern), converts each remaining argument to an Array, and takes the
    # cartesian product of all of them.
    def normalize_arguments(args)
      args = args.first if args.size == 1 && args.first.is_a?(Enumerable)
      args.shift if args.first.is_a?(String)
      lists = args.map { |arg| convert_to_a(arg) }
      @params = lists.shift.product(*lists)
    end

    # Coerces ranges and scalars alike into Arrays (scalars become [x]).
    def convert_to_a(arg)
      arg = arg.to_a rescue arg
      [*arg]
    end
  end
end
# Walks a set of responses, extracting every link on each page so the
# crawler can fetch the next level as a new batch.
class WebCrawler::Follower

  def initialize(*responses)
    @options = if responses.last.is_a?(Hash)
                 responses.pop
               else
                 { }
               end
    @responses = responses.flatten
  end

  # Fetches every collected link as a new batch of requests.
  def process(options = {})
    WebCrawler::BatchRequest.new(collect, options).process
  end

  # Adds one more response to the set; returns self for chaining.
  def follow(response)
    @responses.push(response)
    self
  end

  # Returns one array of normalized link URLs per response.
  def collect
    @responses.map do |resp|
      link_parser = WebCrawler::Parsers::Url.new(resp.url.host,
                                                 url: resp.url.request_uri,
                                                 same_host: @options[:same_host])
      link_parser.parse(resp.body) { |link| link }
    end
  end

end
module WebCrawler

  # Applies a handler block to every response in either a BatchRequest or a
  # plain list of responses.
  class Handler

    def initialize(*responses_or_request, &handler)
      @handler = handler
      if responses_or_request.first.is_a?(BatchRequest)
        @target = responses_or_request.first
      else
        @target = responses_or_request.flatten
      end
    end

    # Maps the handler over the target; memoized after the first call.
    def process
      @result ||= @target.map do |response|
        @handler.call(response, @target)
      end
    end

  end

  # Handler whose callback delegates each response to a parser object.
  class HandlerParser < Handler
    def initialize(parser, *responses_or_request)
      @parser = load_parser(parser)
      super(*responses_or_request, &lambda { |response, *| @parser.parse(response) })
    end

    protected

    # Resolves +parser+ to an object responding to :parse. A String is
    # const_get'd; on NameError the underscored file is required from the
    # current directory and the lookup retried ONCE.
    #
    # BUG FIX: the original retried unconditionally — if the require
    # succeeded (or was already loaded) but the constant was still missing,
    # const_get raised NameError again and the rescue/retry looped forever.
    def load_parser(parser)
      required = false
      begin
        case parser
        when String
          Object.const_get parser
        else
          parser.respond_to?(:parse) ? parser : raise(LoadParserError, 'Parser must respond to :parse')
        end
      rescue NameError
        raise if required
        required = true
        $:.unshift File.expand_path('./')
        require WebCrawler.underscore(parser)
        retry
      end
    end

  end
end
@@ -0,0 +1,52 @@
1
+ class WebCrawler::Parsers::Url
2
+
3
+ attr_reader :host, :scheme
4
+
5
+ def initialize(host, options = { })
6
+ @scheme = options[:secure] ? 'https' : 'http'
7
+ @host = URI.parse(normalize_host(host.to_s))
8
+ @scheme = @host.scheme
9
+ @options = options
10
+ set_current_page
11
+ end
12
+
13
+ def parse(response, &filter)
14
+ (Hpricot(response.to_s) / "a").map do |a|
15
+ normalize(a["href"]).tap do |url|
16
+ url = filter.call(url) if url && filter
17
+ end
18
+ end.compact.uniq
19
+ end
20
+
21
+ def normalize(url)
22
+ if url[/^(:?#{@host.scheme}|https|)\:\/\/#{@host.host}/]
23
+ normalize_host(url)
24
+ elsif url == '#'
25
+ nil
26
+ else
27
+ (url[0] == '/' || url[0] == '?' || url[0] == '#') ? join(url).to_s : (@options[:same_host] ? nil : url)
28
+ end
29
+ end
30
+
31
+ protected
32
+
33
+ def set_current_page
34
+ @current_url = join(@options[:page] || @options[:url] || @options[:path] || '/', @host)
35
+ end
36
+
37
+ def normalize_host(host, scheme = @scheme)
38
+ host[0..3] == 'http' ? host : "#{scheme}://" + host
39
+ end
40
+
41
+ def join(request_uri, host = @current_url)
42
+ return host.dup unless request_uri
43
+ host.dup.tap do |u|
44
+ path_with_query, anchor = request_uri.split('#')
45
+ path, query = path_with_query.split('?')
46
+ u.send(:set_fragment, anchor)
47
+ u.send(:set_query, query) if query && !query.empty?
48
+ u.send(:set_path, path) if path && !path.empty?
49
+ end
50
+ end
51
+
52
+ end
require "hpricot"

# Namespace for HTML/URL parsers; Url is autoloaded on first reference.
module WebCrawler::Parsers
  autoload :Url, 'web_crawler/parsers/url'
end
module WebCrawler

  # Performs an HTTP GET for a single URL, following up to 3 redirects, and
  # wraps the final result in a WebCrawler::Response.
  class Request

    # Browser-like default headers sent with every request.
    HEADERS = {
      'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
      'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language' => 'en-us,en;q=0.5',
      'Accept-Charset' => 'utf-8;windows-1251;q=0.7,*;q=0.7',
      'Cache-Control' => 'max-age=0'
    }

    attr_reader :url, :response

    def initialize(url)
      @url, @request = normalize_url(url), { }
      @headers = HEADERS.dup
    end

    # Executes the request and stores the wrapped response.
    def process
      @response = Response.new(*fetch(url))
    end

    def inspect
      "#<#{self.class}:#{self.object_id} @url=\"#{@url.to_s}\">"
    end

    protected

    # Returns a Net::HTTP client for host/port, reusing a cached one.
    # BUG FIX: the original assigned with `=`, so the @request cache was
    # written on every call but never read; `||=` actually reuses clients.
    def request_for(host, port = nil)
      @request[[host, port]] ||= Net::HTTP.new(host, port) #.tap { |http| http.set_debug_output(STDERR) }
    end

    # Parses +url+, defaulting the scheme to http://.
    # NOTE(review): on URI::Error this logs and returns nil, which makes the
    # subsequent fetch fail with NoMethodError — confirm desired behavior.
    def normalize_url(url)
      URI.parse(url.index("http") == 0 ? url : "http://" + url)
    rescue URI::Error
      WebCrawler.logger.debug "#{url} bad URI(is not URI?)"
    end

    # Follows redirects (carrying Set-Cookie along) and returns
    # [final_uri, Net::HTTPResponse].
    def fetch(uri, limit = 3, redirect_path = nil)
      # BUG FIX: the original interpolated the undefined `redirected_from`,
      # so exceeding the limit raised NameError instead of ArgumentError.
      raise ArgumentError, "HTTP redirect too deep. #{redirect_path} => #{uri}" if limit <= 0
      response = request_for(uri.host, uri.port).get(uri.request_uri, headers)
      case response
      when Net::HTTPRedirection then
        @headers['Cookie'] = response['Set-Cookie'] if response['Set-Cookie']
        fetch(normalize_url(response['location']), limit - 1, [redirect_path, uri])
      else
        # redirect_path accessor comes from ext/http_response.rb.
        response.redirect_path = redirect_path if redirect_path
        [uri, response]
      end
    end

    def headers
      @headers
    end

  end

end
module WebCrawler
  # Thin wrapper around a Net::HTTPResponse that remembers the request URL
  # and pre-computes Date/Expires timestamps for cache-freshness checks.
  class Response
    extend ::Forwardable

    def_delegators :@response, :body, :http_version, :code, :message, :msg,
                   :code_type, :[], :redirect_path, :redirect?

    attr_reader :url, :expire, :date, :cached

    def initialize(url, response)
      unless response.is_a?(Net::HTTPResponse)
        raise ArgumentError, "response must be a Net::HTTPResponse, but #{response.class} given"
      end
      @url = url
      @response = response
      # Missing/unparsable headers fall back to "now".
      @date = Time.parse(self['Date']) rescue Time.now
      @expire ||= Time.parse(self['Expires']) rescue Time.now
    end

    # Marks this response as served from cache (shows up in #inspect).
    def set_cached_flag
      @cached = ' CACHED'
    end

    # Stale when the response date has reached its expiry.
    def foul?
      date >= expire
    end

    def success?
      @response.is_a?(Net::HTTPSuccess)
    end

    def failure?
      !success?
    end

    def inspect
      out = "#<#{self.class}::0x#{object_id.to_s(16).rjust(14, '0')}#{@cached} "
      out << "#{type} #{code} #{message} #{@url}"
      out << " redirect path: \"#{redirect_path.join(', ')}\"" if redirect?
      out << ">"
    end

    alias :to_s :body

    # The concrete Net::HTTPResponse subclass of the wrapped response.
    def type
      @response.class
    end
  end
end
module WebCrawler
  # String inflection helpers (camelize/underscore/dasherize/demodulize),
  # modeled on ActiveSupport's inflector.
  #
  # FIX: declared with nested modules instead of `module WebCrawler::Utility`,
  # which raised NameError when WebCrawler was not yet defined (load-order
  # fragility).
  module Utility
    extend self

    # Converts strings to UpperCamelCase ('/' becomes '::'); with
    # +first_letter_in_uppercase+ = false, produces lowerCamelCase.
    #
    #   "active_record".camelize            # => "ActiveRecord"
    #   "active_record".camelize(:lower)    # => "activeRecord"
    #   "active_record/errors".camelize     # => "ActiveRecord::Errors"
    #
    # Roughly the inverse of +underscore+, though not always:
    #   "SSLError".underscore.camelize      # => "SslError"
    def camelize(lower_case_and_underscored_word, first_letter_in_uppercase = true)
      if first_letter_in_uppercase
        lower_case_and_underscored_word.to_s.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
      else
        lower_case_and_underscored_word.to_s[0].chr.downcase + camelize(lower_case_and_underscored_word)[1..-1]
      end
    end

    # Makes an underscored, lowercase form ('::' becomes '/').
    #
    #   "ActiveRecord".underscore           # => "active_record"
    #   "ActiveRecord::Errors".underscore   # => "active_record/errors"
    def underscore(camel_cased_word)
      word = camel_cased_word.to_s.dup
      word.gsub!(/::/, '/')
      word.gsub!(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
      word.gsub!(/([a-z\d])([A-Z])/, '\1_\2')
      word.tr!("-", "_")
      word.downcase!
      word
    end

    # Replaces underscores with dashes: "puni_puni" # => "puni-puni".
    # (tr is the idiomatic/faster choice for char-for-char replacement.)
    def dasherize(underscored_word)
      underscored_word.tr('_', '-')
    end

    # Removes the module part: "A::B::Inflections" # => "Inflections".
    def demodulize(class_name_in_module)
      class_name_in_module.to_s.gsub(/^.*::/, '')
    end
  end
end
module WebCrawler
  # Gem version, assembled from its numeric components.
  module VERSION
    MAJOR = 0
    MINOR = 2
    TINY  = 0

    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
  end
end
require 'csv'

module WebCrawler::View
  # Renders the input as CSV rows, optionally prefixed with a header row.
  class Csv < Base
    # input   - array of rows (Hashes or Arrays).
    # options - :in_group_of (Integer) regroups a single flat row;
    #           :headers (Array) explicit header row. Remaining options are
    #           forwarded to Array#to_csv via the superclass.
    # NOTE(review): in_groups_of comes from lib/ext/array.rb — confirm it is
    # loaded wherever this view is used.
    def initialize(input, options = { })
      in_group_of_num = options.delete(:in_group_of)
      input = input.first.in_groups_of(in_group_of_num) if in_group_of_num && input.size == 1
      # Derive headers from the widest Hash row when not given explicitly.
      headers = options.delete(:headers) || input.select { |i| i.is_a? Hash }.max_by(&:size).keys
    rescue NoMethodError
      # No Hash rows: max_by returns nil and .keys raises — render headerless.
    ensure
      input = input.dup.unshift(headers) unless headers.nil?
      super(input, options)
    end

    # Formats one row (Hash values or Array elements) as a CSV line.
    def format(item)
      values = item.respond_to?(:values) ? item.values : item.to_a
      values.to_csv(@options)
    end
  end
end
require 'json'

module WebCrawler::View
  # Renders the input wrapped in a {responses: ...} JSON envelope.
  class Json < Base
    def render
      payload = { responses: input }
      payload.to_json
    end
  end
end
module WebCrawler::View
  # Renders the input items one per line as plain text.
  class Plain < Base

    def render
      lines = [*input]
      lines.join("\n")
    end

  end
end
require "fileutils"

module WebCrawler::View
  # "View" that executes a user-supplied script with access to the fetched
  # responses (used by the --run CLI option).
  class Runner < Base

    # Sandbox namespace the script is evaluated in; exposes #responses.
    module Space
      extend self
      attr_accessor :responses
    end

    def render
      # Resolve a relative script path against the current working directory.
      # BUG FIX: File.exists? was deprecated and removed in Ruby 3.2 —
      # File.exist? is the supported predicate.
      unless File.exist? @options['run']
        @options['run'] = File.expand_path @options['run'], FileUtils.pwd
      end

      Space.responses = input.freeze
      # File.read avoids leaking the handle that File.open(...).read left
      # open; passing the filename gives the script usable backtraces.
      Space.module_eval(File.read(@options['run']), @options['run'])
    end
  end
end