web_crawler 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/.rspec +1 -0
- data/Gemfile +11 -0
- data/README +1 -0
- data/Rakefile +2 -0
- data/bin/wcrawler +13 -0
- data/lib/ext/array.rb +100 -0
- data/lib/ext/hash.rb +45 -0
- data/lib/ext/http_response.rb +19 -0
- data/lib/web_crawler/application.rb +49 -0
- data/lib/web_crawler/batch_request.rb +63 -0
- data/lib/web_crawler/cache_adapter/base.rb +33 -0
- data/lib/web_crawler/cache_adapter/file.rb +52 -0
- data/lib/web_crawler/cache_adapter/memory.rb +23 -0
- data/lib/web_crawler/cache_adapter.rb +11 -0
- data/lib/web_crawler/cached_request.rb +30 -0
- data/lib/web_crawler/cli/thor_hooks.rb +94 -0
- data/lib/web_crawler/cli/thor_inherited_options.rb +26 -0
- data/lib/web_crawler/cli.rb +122 -0
- data/lib/web_crawler/configuration.rb +87 -0
- data/lib/web_crawler/factory_url.rb +58 -0
- data/lib/web_crawler/follower.rb +26 -0
- data/lib/web_crawler/handler.rb +45 -0
- data/lib/web_crawler/parsers/url.rb +52 -0
- data/lib/web_crawler/parsers.rb +5 -0
- data/lib/web_crawler/request.rb +59 -0
- data/lib/web_crawler/response.rb +45 -0
- data/lib/web_crawler/utility.rb +65 -0
- data/lib/web_crawler/version.rb +9 -0
- data/lib/web_crawler/view/csv.rb +20 -0
- data/lib/web_crawler/view/json.rb +9 -0
- data/lib/web_crawler/view/plain.rb +9 -0
- data/lib/web_crawler/view/runner.rb +20 -0
- data/lib/web_crawler/view/table.rb +69 -0
- data/lib/web_crawler/view/xml.rb +38 -0
- data/lib/web_crawler/view.rb +44 -0
- data/lib/web_crawler.rb +38 -0
- data/spec/fake_web_generator.rb +44 -0
- data/spec/spec_helper.rb +17 -0
- data/spec/web_crawler/batch_request_spec.rb +45 -0
- data/spec/web_crawler/cached_request_spec.rb +31 -0
- data/spec/web_crawler/factory_url_spec.rb +34 -0
- data/spec/web_crawler/follow_spec.rb +32 -0
- data/spec/web_crawler/request_spec.rb +29 -0
- data/spec/web_crawler/response_spec.rb +27 -0
- data/spec/web_crawler/url_parser_spec.rb +41 -0
- data/spec/web_crawler/view_spec.rb +95 -0
- data/web_crawler.gemspec +30 -0
- metadata +151 -0
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'thor/actions'
|
3
|
+
require 'pathname'
|
4
|
+
require 'web_crawler/cli/thor_hooks'
|
5
|
+
require 'web_crawler/cli/thor_inherited_options'
|
6
|
+
|
7
|
+
module WebCrawler
|
8
|
+
# Thor-based command line front end.
#
# Declares the global switches shared by every task, normalises them in a
# +before_action+ hook (format shortcuts, runner mode, logger set-up) and
# passes each task result to WebCrawler::View for rendering.
class CLI < Thor
  include Thor::Actions
  include Thor::Hooks
  include Thor::InheritedOptions

  # Boolean switches that act as shortcuts for <tt>--format NAME</tt>.
  # When several are given the last one in this list wins.
  FORMAT_SHORTCUTS = %w[json xml csv table plain].freeze

  default_task :help

  class_option :format, type: :string, desc: "output format [plain, json, xml, csv, table]", default: 'plain'
  class_option :json, type: :boolean, desc: "json output format. shortcut for --format json"
  class_option :xml, type: :boolean, desc: "xml output format. shortcut for --format xml"
  class_option :csv, type: :boolean, desc: "csv output format. shortcut for --format csv"
  class_option :table, type: :boolean, desc: "table output format. shortcut for --format table"
  # The hook below already honoured options[:plain]; the switch itself was never declared.
  class_option :plain, type: :boolean, desc: "plain output format. shortcut for --format plain"
  class_option :cached, type: :boolean, desc: "use cached requests. if ./tmp/cache exists use it for cache files"
  class_option :follow, type: :boolean, desc: "follow to urls on the pages"
  class_option :run, type: :string, desc: "run custom script with api access"
  class_option :log, type: :string, desc: "log file path"

  before_action except: :help do
    # Work on a copy of the parsed options so they can be amended below.
    @options = options.dup
    FORMAT_SHORTCUTS.each do |name|
      @options[:format] = name if options[name.to_sym]
    end

    # --run switches rendering to the runner view but remembers the requested format.
    if options[:run]
      @options[:original_format] = @options[:format]
      @options[:format] = 'runner'
    end

    WebCrawler.config.logger = Logger.new(@options['log']) if @options['log']
    WebCrawler.config.logger.level = Logger::DEBUG
    WebCrawler.config.logger.datetime_format = "%d-%m-%Y %H:%M:%S"
    WebCrawler.config.severity_colors = { 'DEBUG' => :magenta,
                                          'INFO'  => :green,
                                          'WARN'  => :blue,
                                          'ERROR' => :red,
                                          'FATAL' => :red,
                                          'ANY'   => :yellow }

    WebCrawler.config.logger.formatter = proc { |severity, datetime, _, msg|
      tint  = WebCrawler.config.severity_colors[severity]
      label = "[#{severity}] ".ljust(8)
      # An unmapped severity is printed uncoloured instead of failing on send(nil, ...).
      label = send(tint, label) if tint

      "#{label}[#{datetime}] pid #{$$} -- #{msg}\n"
    }
  end

  render except: :help do |response, options|
    WebCrawler::View.factory(options[:format], response, options).draw
  end

  # Prints the help of one task, or the overall help when +task+ is nil.
  def help(task = nil)
    if task
      self.class.task_help(shell, task)
    else
      self.class.help shell
    end
  end

  protected

  # Wraps +text+ in the given ANSI escape sequence and resets attributes afterwards.
  def color(text, color_code)
    "#{color_code}#{text}\e[0m"
  end

  def bold(text)
    color(text, "\e[1m")
  end

  def white(text)
    color(text, "\e[37m")
  end

  def green(text)
    color(text, "\e[32m")
  end

  def red(text)
    color(text, "\e[31m")
  end

  def magenta(text)
    color(text, "\e[35m")
  end

  def yellow(text)
    color(text, "\e[33m")
  end

  def blue(text)
    color(text, "\e[34m")
  end

  def grey(text)
    color(text, "\e[90m")
  end

  # NOTE(review): both padding helpers return a single space in the text this
  # was reviewed from; the original widths may have been collapsed - confirm.
  def short_padding
    ' '
  end

  def long_padding
    ' '
  end

  def logger
    WebCrawler.logger
  end

  # Memoised copy of the options with symbol keys
  # (relies on Hash#symbolize_keys being provided elsewhere in the project).
  def symbolized_options
    @symbolized_options ||= Hash[@options.keys.zip(@options.values)].symbolize_keys
  end

end
|
122
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require "logger"
|
2
|
+
|
3
|
+
module WebCrawler
  # Open-ended settings bag: any +name=+ call stores a value and +name+ reads
  # it back (both handled through +method_missing+).
  #
  # NOTE(review): values live in a class variable, so every instance of this
  # class and of its subclasses shares one set of settings. That is kept as
  # is because callers may rely on it - confirm before changing.
  class BaseConfiguration

    # @param options [Hash] initial settings, merged into the shared store
    def initialize(options = {})
      @@options ||= {}
      @@options.merge! options
    end

    # @return [BaseConfiguration] self
    def config
      self
    end

    private

    # Reports stored settings as readable methods. Implemented through
    # +respond_to_missing?+ so that +respond_to?(name, true)+ and +method(name)+
    # keep working (overriding +respond_to?+ with one parameter broke both).
    def respond_to_missing?(name, include_private = false)
      @@options.key?(name.to_sym) || super
    end

    # +foo = value+ stores a setting, +foo+ reads a stored one; anything else
    # falls through to the default NoMethodError.
    def method_missing(name, *args, &blk)
      method_name = name.to_s
      if method_name.end_with?('=')
        @@options[method_name.chomp('=').to_sym] = args.first
      elsif @@options.key?(name)
        @@options[name]
      else
        super
      end
    end
  end

  # Crawler-wide configuration with lazily computed defaults.
  class Configuration < BaseConfiguration

    # Readers are defined explicitly below because each supplies a default.
    attr_writer :cache_adapter, :log_level, :logger, :root, :cache

    # @return [String] configured root directory, defaulting to the current working directory
    def root
      @root ||= Dir.pwd
    end

    # @return [Object] file cache adapter when <root>/tmp/cache/ exists, memory adapter otherwise
    def cache_adapter
      @cache_adapter ||= begin
        if ::File.exist?("#{root}/tmp/cache/")
          WebCrawler::CacheAdapter::File.new "#{root}/tmp/cache/"
        else
          WebCrawler::CacheAdapter::Memory.new
        end
      end
    end

    # Cache settings (created with +expire_within: 60+).
    #
    # With a block, the block is evaluated in the context of the cache
    # settings object and its value is returned; without one the settings
    # object itself is returned.
    def cache(&block)
      @cache ||= BaseConfiguration.new expire_within: 60
      if block_given?
        # Must be passed as a block: instance_eval(block) raises TypeError.
        @cache.instance_eval(&block)
      else
        @cache
      end
    end

    # @return [Logger] configured logger, defaulting to a STDOUT logger at +log_level+
    def logger
      @logger ||= Logger.new(STDOUT).tap do |log|
        log.level = Logger.const_get log_level.to_s.upcase
      end
    end

    # @return [Symbol] configured level, defaulting to +:debug+
    def log_level
      @log_level ||= :debug
    end

  end

  # Mix-in giving the including class a +config+ object and a +configure+ DSL.
  module Configurable
    def self.included(base)
      base.extend ClassMethods
    end

    module ClassMethods
      # Evaluates the block in the context of the including class/module.
      def configure(&block)
        module_eval(&block)
      end

      # @return [Configuration] memoised per including class
      def config
        @config ||= Configuration.new
      end

    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module WebCrawler
|
2
|
+
#
# Builds a list of URLs from every combination (Array#product) of the given
# parameter sets, either by filling <tt>$1</tt>, <tt>$2</tt>, ... placeholders
# of a pattern or by calling a block with one combination at a time.
#
#   p = FactoryUrl.new "http://www.somehost.com/$1/$2?param=$3", 0..10, "a".."z", [3,7,34,876,92]
#   p.urls #=> ["http://www.somehost.com/0/a?param=3",
#          #    "http://www.somehost.com/0/a?param=7",
#          #    "http://www.somehost.com/0/a?param=34",
#          #    ...
#          #    "http://www.somehost.com/10/z?param=876",
#          #    "http://www.somehost.com/10/z?param=92"]
#
#   p = FactoryUrl.new 0..10, "a".."z", [3,7,34,876,92] do |first, second, third|
#     "http://www.somehost.com/#{first}/#{second}?param=#{third}"
#   end
#
class FactoryUrl
  include Enumerable

  attr_reader :params, :pattern

  # @param args without a block: the pattern String followed by the parameter
  #   sets; with a block: only the parameter sets
  # @raise [ArgumentError] when no block is given and the first argument is not a String
  def initialize(*args, &block)
    if block_given?
      @block = block
    else
      @pattern = args.shift
      raise ArgumentError, "first argument must be an url pattern(String)" unless pattern.is_a? String
    end
    @params = normalize_arguments(args)
  end

  # @return [Array] the generated URLs (built on first access, then memoised)
  def urls
    factory
  end

  # Builds (and memoises) the URL list.
  def factory
    @urls ||= params.map { |opts| build_url(opts) }
  end

  # Regenerates the URLs and yields each of them; returns an Enumerator when
  # called without a block.
  def each(&block)
    return enum_for(:each) unless block_given?

    @urls = nil
    factory.each(&block)
  end

  protected

  # Turns the raw arguments into the list of parameter combinations.
  #
  # A single Enumerable argument is treated as the list of parameter sets.
  # NOTE(review): a leading String among the remaining arguments is dropped,
  # as in the original code (presumably a pattern passed alongside a block);
  # confirm that a String first parameter set is never intended.
  def normalize_arguments(args)
    args = args.first if args.size == 1 && args.first.is_a?(Enumerable)
    # drop(1) instead of shift: never mutate an array owned by the caller.
    args = args.drop(1) if args.is_a?(Array) && args.first.is_a?(String)
    sets = args.map { |arg| convert_to_a(arg) }
    # No parameter sets at all: one empty combination, so the pattern is used as is.
    return [[]] if sets.empty?

    sets.first.product(*sets.drop(1))
  end

  # Ranges and other collections become arrays, scalars one-element arrays.
  def convert_to_a(arg)
    Array(arg)
  end

  # Builds one URL from one combination of parameters.
  def build_url(opts)
    return @block.call(*opts) unless pattern

    # $N is 1-based; a placeholder without a matching parameter becomes "".
    pattern.gsub(/\$(\d+)/) { opts[Regexp.last_match(1).to_i - 1] }
  end
end
|
58
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# Gathers the links found in a set of responses and requests them as one batch.
class WebCrawler::Follower

  # @param responses responses to follow (nested arrays are flattened);
  #   a trailing Hash is taken as the options
  def initialize(*responses)
    @options = {}
    @options = responses.pop if responses.last.is_a?(Hash)
    @responses = responses.flatten
  end

  # Requests every collected link through a BatchRequest built with +options+.
  def process(options = {})
    batch = WebCrawler::BatchRequest.new(collect, options)
    batch.process
  end

  # Adds one more response to follow; returns self so calls can be chained.
  def follow(response)
    @responses << response
    self
  end

  # @return [Array] one entry per response: whatever the URL parser returns for its body
  def collect
    @responses.map do |response|
      location = response.url
      link_parser = WebCrawler::Parsers::Url.new(location.host,
                                                 url: location.request_uri,
                                                 same_host: @options[:same_host])
      link_parser.parse(response.body) { |url| url }
    end
  end

end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module WebCrawler

  # Applies a block to every response of a BatchRequest, or of a plain
  # (possibly nested) list of responses.
  class Handler

    # @param responses_or_request a BatchRequest as first argument, or any number of responses
    # @param handler block called as <tt>handler.call(response, target)</tt>
    def initialize(*responses_or_request, &handler)
      @handler = handler
      first = responses_or_request.first
      @target = first.is_a?(BatchRequest) ? first : responses_or_request.flatten
    end

    # @return [Array] handler results, computed once and memoised
    def process
      @result ||= @target.map { |response| @handler.call(response, @target) }
    end

  end

  # Handler whose block is <tt>parser.parse(response)</tt>.
  class HandlerParser < Handler

    # @param parser [String, #parse] constant name of a parser, or an object responding to +parse+
    def initialize(parser, *responses_or_request)
      @parser = load_parser(parser)
      super(*responses_or_request) { |response, *| @parser.parse(response) }
    end

    protected

    # Resolves +parser+ into an object that is later sent +parse+.
    def load_parser(parser)
      return resolve_parser_constant(parser) if parser.is_a?(String)
      return parser if parser.respond_to?(:parse)

      raise LoadParserError, 'Parser must respond to :parse'
    end

    # Looks the constant up; when it is missing, requires the matching file
    # from the current directory and looks it up once more. The retry is
    # bounded: a +require+ that succeeds (or is a no-op) without defining the
    # constant re-raises the NameError instead of looping forever.
    def resolve_parser_constant(name)
      required = false
      begin
        Object.const_get name
      rescue NameError
        raise if required

        required = true
        load_dir = File.expand_path('./')
        $:.unshift load_dir unless $:.include?(load_dir)
        require WebCrawler.underscore(name)
        retry
      end
    end

  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# Extracts the +href+ of every <a> element of an HTML document and
# normalises it against a host and a current page.
class WebCrawler::Parsers::Url

  attr_reader :host, :scheme

  # @param host [#to_s] host name, with or without scheme
  # @param options [Hash] +:secure+ (default scheme https instead of http),
  #   +:same_host+ (drop links that are not local to the host),
  #   +:page+ / +:url+ / +:path+ (current page, first one present wins, default '/')
  def initialize(host, options = { })
    @scheme = options[:secure] ? 'https' : 'http'
    @host = URI.parse(normalize_host(host.to_s))
    # The scheme actually present on the parsed host wins over :secure.
    @scheme = @host.scheme
    @options = options
    set_current_page
  end

  # @param response [#to_s] HTML document
  # @return [Array] unique, non-nil normalised links
  #
  # NOTE(review): +filter+ is called with every non-nil link but, as in the
  # original code, its return value is not used - confirm whether it was
  # meant to transform or reject links.
  def parse(response, &filter)
    (Hpricot(response.to_s) / "a").map do |a|
      normalize(a["href"]).tap do |url|
        filter.call(url) if url && filter
      end
    end.compact.uniq
  end

  # @param url [String, nil] raw +href+ value
  # @return [String, nil] normalised link, or nil when the link is dropped
  def normalize(url)
    # <a> elements without an href (named anchors) carry no link.
    return nil if url.nil?

    # Protocol-relative link: give it our scheme so that it is compared with
    # the host below instead of being mistaken for a local path.
    url = "#{@scheme}:#{url}" if url.start_with?('//')

    if url[/\A(?:#{Regexp.escape(@host.scheme.to_s)}|https|)\:\/\/#{Regexp.escape(@host.host.to_s)}/]
      normalize_host(url)
    elsif url == '#'
      nil
    elsif url.start_with?('/', '?', '#')
      # Link local to the host: resolve it against the current page.
      join(url).to_s
    else
      @options[:same_host] ? nil : url
    end
  end

  protected

  def set_current_page
    @current_url = join(@options[:page] || @options[:url] || @options[:path] || '/', @host)
  end

  # Prefixes +host+ with "<scheme>://" unless it already starts with "http".
  def normalize_host(host, scheme = @scheme)
    host[0..3] == 'http' ? host : "#{scheme}://" + host
  end

  # Returns a copy of +host+ (a URI) whose fragment is replaced and whose
  # path/query are overridden by the non-empty parts of +request_uri+.
  def join(request_uri, host = @current_url)
    return host.dup unless request_uri
    host.dup.tap do |u|
      path_with_query, anchor = request_uri.split('#')
      # to_s: the split yields nothing for inputs made only of '#'.
      path, query = path_with_query.to_s.split('?')
      u.send(:set_fragment, anchor)
      u.send(:set_query, query) if query && !query.empty?
      u.send(:set_path, path) if path && !path.empty?
    end
  end

end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module WebCrawler
|
2
|
+
|
3
|
+
# A single HTTP GET that follows redirects and wraps the final answer in a Response.
class Request

  HEADERS = {
    'User-Agent'      => 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Accept'          => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language' => 'en-us,en;q=0.5',
    'Accept-Charset'  => 'utf-8;windows-1251;q=0.7,*;q=0.7',
    'Cache-Control'   => 'max-age=0'
  }

  attr_reader :url, :response

  # @param url [String] address to fetch; "http://" is prepended when it does not start with "http"
  def initialize(url)
    @url, @request = normalize_url(url), { }
    @headers = HEADERS.dup
  end

  # Performs the request (following redirects) and stores/returns the Response.
  def process
    @response = Response.new(*fetch(url))
  end

  def inspect
    "#<#{self.class}:#{self.object_id} @url=\"#{@url.to_s}\">"
  end

  protected

  # Returns the Net::HTTP client for host/port, creating it on first use.
  #
  # @param use_ssl [Boolean] enable TLS on a newly created client
  def request_for(host, port = nil, use_ssl = false)
    @request[[host, port]] ||= Net::HTTP.new(host, port).tap do |http|
      http.use_ssl = true if use_ssl
    end
  end

  # @return [URI, nil] parsed URI, or nil (after logging) when +url+ is not a valid URI
  def normalize_url(url)
    URI.parse(url.index("http") == 0 ? url : "http://" + url)
  rescue URI::Error
    WebCrawler.logger.debug "#{url} bad URI(is not URI?)"
    nil
  end

  # GETs +uri+, following at most +limit+ - 1 redirects.
  #
  # @param redirect_path nested list of the URIs visited so far (nil at the start)
  # @return [Array] <tt>[final_uri, http_response]</tt>
  # @raise [ArgumentError] when the redirect limit is exhausted
  def fetch(uri, limit = 3, redirect_path = nil)
    if limit <= 0
      trail = Array(redirect_path).flatten.compact.join(' => ')
      raise ArgumentError, "HTTP redirect too deep. #{trail} => #{uri}"
    end

    response = request_for(uri.host, uri.port, uri.scheme == 'https').get(uri.request_uri, headers)
    location = response['location']

    # A redirection without a Location header (e.g. 304) cannot be followed.
    if response.is_a?(Net::HTTPRedirection) && location
      @headers['Cookie'] = response['Set-Cookie'] if response['Set-Cookie']
      fetch(redirect_target(uri, location), limit - 1, [redirect_path, uri])
    else
      response.redirect_path = redirect_path if redirect_path
      [uri, response]
    end
  end

  # Resolves a Location header: values starting with '/' are resolved against
  # the URI being redirected, anything else goes through +normalize_url+.
  def redirect_target(uri, location)
    location.start_with?('/') ? uri.merge(location) : normalize_url(location)
  end

  def headers
    @headers
  end

end
|
58
|
+
|
59
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module WebCrawler
|
2
|
+
# Wraps a Net::HTTPResponse together with the URL it was fetched from.
class Response
  extend ::Forwardable

  def_delegators :@response,
                 :body, :http_version, :code, :message, :msg, :code_type, :[], :redirect_path, :redirect?

  attr_reader :url, :expire, :date, :cached

  # @param url URL the response belongs to
  # @param response [Net::HTTPResponse]
  # @raise [ArgumentError] when +response+ is not a Net::HTTPResponse
  def initialize(url, response)
    unless response.is_a? Net::HTTPResponse
      raise ArgumentError, "response must be a Net::HTTPResponse, but #{response.class} given"
    end

    @url = url
    @response = response
    # Both timestamps fall back to the current time when the header is absent or unparsable.
    @date = begin
      Time.parse(self['Date'])
    rescue StandardError
      Time.now
    end
    @expire ||= begin
      Time.parse(self['Expires'])
    rescue StandardError
      Time.now
    end
  end

  # Marks the response as served from cache (shown by #inspect).
  def set_cached_flag
    @cached = ' CACHED'
  end

  # @return [Boolean] true when the response date is not earlier than its expiry
  def foul?
    date >= expire
  end

  def success?
    @response.is_a? Net::HTTPSuccess
  end

  def failure?
    !success?
  end

  def inspect
    redirected = redirect? ? %( redirect path: "#{redirect_path.join(', ')}") : ""
    hex_id = object_id.to_s(16).rjust(14, '0')
    "#<#{self.class}::0x#{hex_id}#{@cached} #{type} #{code} #{message} #{@url}#{redirected}>"
  end

  alias_method :to_s, :body

  # @return [Class] class of the wrapped Net::HTTPResponse
  def type
    @response.class
  end
end
|
45
|
+
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# String inflection helpers, callable as module functions.
module WebCrawler::Utility
  extend self

  # Converts an underscored word to UpperCamelCase. When the second argument
  # is +false+ (or <tt>:lower</tt>) the first letter is left in lower case.
  #
  # '/' is converted to '::', which is useful for converting paths to namespaces.
  #
  # Examples:
  #   camelize("active_record")                # => "ActiveRecord"
  #   camelize("active_record", false)         # => "activeRecord"
  #   camelize("active_record/errors")         # => "ActiveRecord::Errors"
  #   camelize("active_record/errors", :lower) # => "activeRecord::Errors"
  #
  # +camelize+ is roughly the inverse of +underscore+, though not always:
  #
  #   camelize(underscore("SSLError")) # => "SslError"
  def camelize(lower_case_and_underscored_word, first_letter_in_uppercase = true)
    word = lower_case_and_underscored_word.to_s
    camelized = word.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }

    # :lower is truthy, so it has to be recognised explicitly.
    return camelized if first_letter_in_uppercase && first_letter_in_uppercase != :lower
    # Nothing to downcase in an empty word (word[0] would be nil).
    return camelized if word.empty?

    word[0].chr.downcase + camelized[1..-1]
  end

  # Makes an underscored, lowercase form from the expression in the string.
  #
  # Changes '::' to '/' to convert namespaces to paths.
  #
  # Examples:
  #   underscore("ActiveRecord")         # => "active_record"
  #   underscore("ActiveRecord::Errors") # => "active_record/errors"
  #
  # +underscore+ is roughly the inverse of +camelize+, though not always:
  #
  #   camelize(underscore("SSLError")) # => "SslError"
  def underscore(camel_cased_word)
    word = camel_cased_word.to_s.dup
    word.gsub!(/::/, '/')
    word.gsub!(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
    word.gsub!(/([a-z\d])([A-Z])/, '\1_\2')
    word.tr!("-", "_")
    word.downcase!
    word
  end

  # Replaces underscores with dashes in the string.
  #
  # Example:
  #   dasherize("puni_puni") # => "puni-puni"
  def dasherize(underscored_word)
    # to_s for parity with the other helpers, which all accept symbols.
    underscored_word.to_s.tr('_', '-')
  end

  # Removes the module part from the expression in the string.
  #
  # Examples:
  #   demodulize("ActiveRecord::CoreExtensions::String::Inflections") # => "Inflections"
  #   demodulize("Inflections")                                       # => "Inflections"
  def demodulize(class_name_in_module)
    class_name_in_module.to_s.gsub(/^.*::/, '')
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module WebCrawler::View
|
4
|
+
# CSV view: renders every input item as one CSV line, preceded by a header
# row when one is given or can be derived from the input.
class Csv < Base

  # Options consumed here (and removed from +options+):
  #   :in_group_of - when the input has exactly one element, split that
  #                  element into groups of this size
  #   :headers     - explicit header row; otherwise the keys of the largest
  #                  Hash found in the input are used
  #
  # A NoMethodError raised while preparing the input is swallowed; the parent
  # initializer is always called from +ensure+ with whatever was prepared.
  def initialize(input, options = { })
    group_size = options.delete(:in_group_of)
    if group_size && input.size == 1
      input = input.first.in_groups_of(group_size)
    end
    header_row = options.delete(:headers)
    header_row ||= input.select { |row| row.is_a? Hash }.max_by(&:size).keys
  rescue NoMethodError
  ensure
    input = input.dup.unshift(header_row) unless header_row.nil?
    super(input, options)
  end

  # @return [String] one CSV line built from the item's values (Hash-like) or elements
  def format(item)
    cells = if item.respond_to?(:values)
              item.values
            else
              item.to_a
            end
    cells.to_csv(@options)
  end
end
|
20
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require "fileutils"
|
2
|
+
|
3
|
+
module WebCrawler::View
|
4
|
+
# "View" that hands the responses to a user supplied Ruby script instead of
# printing them.
class Runner < Base

  # Namespace the script is evaluated in; the script reads the responses
  # through +responses+.
  module Space
    extend self
    attr_accessor :responses
  end

  # Evaluates the script named by <tt>@options['run']</tt> inside Space.
  #
  # SECURITY: the file is executed as arbitrary Ruby code with the privileges
  # of the current process; only point --run at trusted scripts.
  def render
    # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
    unless File.exist? @options['run']
      @options['run'] = File.expand_path @options['run'], FileUtils.pwd
    end

    Space.responses = input.freeze
    # File.read closes the handle; the path is passed so that errors raised
    # by the script are reported against the script file.
    Space.module_eval(File.read(@options['run']), @options['run'])
  end
end
|
20
|
+
end
|