web_crawler 0.3.1 → 0.5.0
- data/Gemfile +2 -0
- data/README +22 -1
- data/lib/web_crawler.rb +2 -0
- data/lib/web_crawler/application.rb +33 -2
- data/lib/web_crawler/base.rb +113 -0
- data/lib/web_crawler/batch_request.rb +10 -4
- data/lib/web_crawler/cached_request.rb +16 -7
- data/lib/web_crawler/configuration.rb +5 -5
- data/lib/web_crawler/factory_url.rb +27 -7
- data/lib/web_crawler/follower.rb +11 -9
- data/lib/web_crawler/parsers.rb +1 -0
- data/lib/web_crawler/parsers/mapper.rb +114 -0
- data/lib/web_crawler/parsers/url.rb +3 -5
- data/lib/web_crawler/request.rb +14 -2
- data/lib/web_crawler/response.rb +2 -2
- data/lib/web_crawler/version.rb +2 -2
- data/lib/web_crawler/view.rb +1 -1
- data/lib/web_crawler/view/csv.rb +1 -1
- data/lib/web_crawler/view/json.rb +1 -1
- data/lib/web_crawler/view/yaml.rb +1 -1
- data/spec/fixtures/example.xml +171 -0
- data/spec/fixtures/my_crawler.rb +82 -0
- data/spec/fixtures/test_crawler.rb +108 -0
- data/spec/fixtures/test_crawler2.rb +77 -0
- data/spec/spec_helper.rb +8 -3
- data/spec/web_crawler/batch_request_spec.rb +0 -11
- data/spec/web_crawler/cached_request_spec.rb +17 -11
- data/spec/web_crawler/factory_url_spec.rb +19 -6
- data/spec/web_crawler/follow_spec.rb +11 -4
- data/spec/web_crawler/view_spec.rb +10 -10
- data/spec/web_crawler/web_crawler_api_base_class_spec.rb +143 -0
- data/web_crawler.gemspec +2 -0
- metadata +43 -8
data/Gemfile
CHANGED
data/README
CHANGED
@@ -1 +1,22 @@
-Web crawler help you with parse and collect data from the web
+Web crawler help you with parse and collect data from the web
+
+#TODO
+
+Base web crawler class for API present
+Its showld work like this:
+
+  class MyCrawler < WebCrawler::Base
+
+    target "www.example.com"
+    target "www.example.com/page2"
+    target %[www.example.com/contacts www.example.com/about]
+    target "www.example.com/category_:category/page:page/", :categories => [1,2,3,4], :page => 1..100
+
+    target { call_advanced_logic_for_url_generating }
+
+    logger "path/to/log/file" # or Logger.new(...)
+
+
+
+
+  end
data/lib/web_crawler.rb
CHANGED
@@ -6,6 +6,7 @@ require 'forwardable'
 require "ext/hash"
 require "ext/array"
 require "ext/http_response"
+require "active_support/core_ext"
 
 module WebCrawler
   autoload :Request, 'web_crawler/request'
@@ -26,6 +27,7 @@ module WebCrawler
   autoload :View, 'web_crawler/view'
   autoload :CLI, 'web_crawler/cli'
   autoload :Application, 'web_crawler/application'
+  autoload :Base, 'web_crawler/base'
 
   include Configurable
   extend Utility
data/lib/web_crawler/application.rb
CHANGED
@@ -2,7 +2,35 @@ module WebCrawler
   class Application < CLI
 
     desc "test", "Test task"
+
     def test
+      urls = FactoryUrl.new('http://www.superjob.ru/rabota/554/veb-programmist/?from=$1', [[140]]).factory
+
+      logger.info "start requests with #{urls.join(' ')} in 4 processes"
+
+      targets = BatchRequest.new(urls).process
+
+      logger.info "#{targets.size} targets collected"
+
+      urls = Follower.new(targets, same_host: false).collect { |url| url =~ /vacancy\/\?id=\d+/ }
+
+      logger.info "#{urls.size} urls collected"
+      logger.info "start requests with in 4 processes"
+
+      puts BatchRequest.new(urls).process.inspect
+
+      ""
+    end
+
+    desc "runner CLASS", "Run crawler class"
+    method_option :lib, type: :array, desc: "lib directories"
+    def runner(name)
+      $:.unshift './'
+      Array.wrap(@options[:lib]).each { |l| $:.unshift l }
+      require name.underscore
+
+      klass = name.classify.constantize
+      klass.run allow_format(:json, :yaml)
     end
 
     desc "get <URL...>", "Get pages from passed urls"
@@ -36,8 +64,7 @@ module WebCrawler
     def factory(pattern, *params)
       params.map! { |param| eval(param) }
      urls = FactoryUrl.new(pattern, params)
-
-      sep = options[:list] ? "\n" : ' '
+      sep = options[:list] ? "\n" : ' '
       if options[:output] || options[:list]
         puts urls.factory.map { |u| u.inspect }.join(sep).gsub('"', "'")
       else
@@ -45,5 +72,9 @@ module WebCrawler
       end
     end
 
+    protected
+    def allow_format(*allow)
+      allow.flatten.select { |f| f == @options[:format] }.first
+    end
   end
 end
data/lib/web_crawler/base.rb
ADDED
@@ -0,0 +1,113 @@
+require "set"
+
+module WebCrawler
+  class Base
+    class_attribute :targets, :logger, :mappers, :cache, :follower, :responses
+
+    self.targets, self.logger, self.mappers = Set.new, WebCrawler.config.logger, []
+
+    delegate :run, :to => :'self.class'
+
+    class << self
+
+      include ActiveSupport::Callbacks
+
+      def run(format=nil, format_options={ })
+        compile_targets
+        self.responses = WebCrawler::BatchRequest.new(targets.to_a).process
+        if format
+          formated(process(responses), format, format_options)
+        else
+          process(responses)
+        end
+      end
+
+      protected
+
+      def after(&block)
+        @after_callback = block
+      end
+
+      def compile_targets
+        following = targets.select { |target| target.is_a?(Array) && target.first.is_a?(Proc) }
+        self.targets = targets - following
+
+        following.each do |target|
+          target.first.call(target.last)
+        end
+      end
+
+      def log_to(logger_or_path)
+        case logger_or_path
+        when Logger
+          WebCrawler.config.logger = self.logger = logger_or_path
+        when nil
+          WebCrawler.config.logger = self.logger = Logger.new('/dev/null')
+        else
+          WebCrawler.config.logger = self.logger = Logger.new(logger_or_path)
+        end
+      end
+
+      def cache_to(path_or_cache_adapter)
+        adapter = nil
+        adapter = path_or_cache_adapter if path_or_cache_adapter.is_a? WebCrawler::CacheAdapter::Base
+        adapter = WebCrawler::CacheAdapter::File.new(path_or_cache_adapter) if File.directory? path_or_cache_adapter
+
+        WebCrawler.configure do
+          config.cache.adapter = adapter
+        end if adapter
+      end
+
+      def follow(*targets)
+        options = targets.extract_options!
+        responses = WebCrawler::BatchRequest.new(targets).process
+        self.target WebCrawler::Follower.new(responses, options).collect
+      end
+
+      def context(selector, name=selector, &block)
+        mapper = WebCrawler::Parsers::Mapper.new(name, self, selector)
+        if block.arity.zero?
+          mapper.instance_exec(&block)
+        else
+          mapper.callback(&block)
+        end
+        self.mappers += [mapper]
+      end
+
+      def target(*targets, &block)
+        options = targets.extract_options!
+        unless options.empty?
+          raise ArgumentError, 'target accept only one pattern if options given' if targets.size > 1
+          targets = generate_urls(targets.first, options)
+        end
+        if block_given?
+          self.targets << [block, targets]
+        else
+          self.targets += targets.flatten
+        end
+      end
+
+      def generate_urls(pattern, options)
+        WebCrawler::FactoryUrl.new(pattern, options).factory
+      end
+
+      def formated(data, format, options)
+        require "active_support/core_ext/string"
+        WebCrawler::View.factory(format, data, options).render
+      end
+
+      def process(responses)
+        return responses.map(&:to_s) if mappers.empty?
+
+        { }.tap do |results|
+          mappers.each do |mapper|
+            results[mapper.name] = responses.map do |response|
+              mapper.collect(response)
+            end.flatten
+          end
+        end
+      end
+    end
+
+  end
+end
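
Putting the new Base API together, a minimal subclass might look like this sketch. The class name, urls and selector are illustrative, and the exact node/block semantics of context come from Parsers::Mapper, which is not shown in this diff:

    class MyCrawler < WebCrawler::Base
      # plain targets, plus a pattern expanded through FactoryUrl
      target "http://www.example.com/news"
      target "http://www.example.com/page:page/", :page => 1..3

      # each context registers a Parsers::Mapper; its results are grouped
      # under :titles in the hash returned by run
      context "h1", :titles do |node|
        node.text
      end
    end

    MyCrawler.run           #=> { :titles => [...] }
    MyCrawler.run(:json)    # same data rendered through View.factory(:json, ...)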
data/lib/web_crawler/batch_request.rb
CHANGED
@@ -1,10 +1,15 @@
+require "parallel"
+
 module WebCrawler
 
+  # Usage:
+  #   BatchRequest.new(urls).process #=> array of Responses
+  #
   class BatchRequest
 
     attr_reader :urls, :responses, :requests
     attr_writer :requests
-
+
     include Enumerable
 
     def initialize(*urls)
@@ -19,10 +24,11 @@ module WebCrawler
       if @handler
         block_given? ? yield(@handler.process) : @handler.process
       else
-        @responses ||= requests.map do |req|
+        ready = requests.select{|r| r.ready? }
+        @responses ||= Parallel.map(requests - ready) do |req|
           WebCrawler.logger.info "start request to #{req.url.to_s}"
           block_given? ? yield(req.process) : req.process
-        end
+        end.compact + ready.map(&:process)
       end
     end
 
@@ -57,7 +63,7 @@ module WebCrawler
     end
 
     def request_class
-
+      !@options[:no_cached] && WebCrawler.config.cache.adapter.is_a?(WebCrawler::CacheAdapter::Base) ? CachedRequest : Request
     end
   end
 
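
process now splits the batch: requests that are already ready? (CachedRequest flags itself ready when its url is in the cache) are processed directly, and only the rest go through the Parallel pool. A usage sketch with illustrative urls:

    batch = WebCrawler::BatchRequest.new('http://example.com/a', 'http://example.com/b')
    responses = batch.process       # Parallel.map over non-ready requests, plus cached ones
    responses.map { |r| r.url.to_s }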
data/lib/web_crawler/cached_request.rb
CHANGED
@@ -5,23 +5,32 @@ module WebCrawler
 
     def initialize(url, options = { })
       super(url)
-      @cache = options[:cache] || WebCrawler.config.
+      @cache = options[:cache] || WebCrawler.config.cache.adapter
+      @ready = true if @cache.exist? url
     end
 
     def process
-      cached do
+      @response || cached do
         Response.new *fetch(url)
       end
     end
 
     protected
 
+    def load_response
+      @response = @cache.get url
+    end
+
+    def put_response(response)
+      @response = @cache.put(response)
+    end
+
     def cached
-
-
-
-
-
+      if @cache.exist? url
+        load_response
+      else
+        put_response(yield)
+      end
       @response
     end
 
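
The rewritten cached helper reads through the adapter: exist?/get on a hit, put on a miss, with @response short-circuiting repeat calls. A sketch assuming a file adapter pointed at an existing directory (the path is illustrative):

    WebCrawler.configure do
      config.cache.adapter = WebCrawler::CacheAdapter::File.new('tmp/cache')
    end

    req = WebCrawler::CachedRequest.new('http://example.com/')
    req.process    # miss: fetches, then put_response stores the response
    req.process    # hit: returns @response, or load_response via @cache.get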
data/lib/web_crawler/configuration.rb
CHANGED
@@ -3,8 +3,8 @@ require "logger"
 module WebCrawler
   class BaseConfiguration
 
-    def initialize(options = {})
-      @@options ||= {}
+    def initialize(options = { })
+      @@options ||= { }
       @@options.merge! options
     end
 
@@ -15,7 +15,7 @@ module WebCrawler
     def config
       self
     end
-
+
     private
 
     def method_missing(name, *args, &blk)
@@ -48,7 +48,7 @@ module WebCrawler
     end
 
     def cache(&block)
-      @cache ||= BaseConfiguration.new
+      @cache ||= BaseConfiguration.new(expire_within: 60, adapter: self.cache_adapter)
       if block_given?
         @cache.instance_eval(block)
       else
@@ -58,7 +58,7 @@ module WebCrawler
 
     def logger
       @logger ||= Logger.new(STDOUT).tap do |log|
-
+        log.level = Logger.const_get log_level.to_s.upcase
       end
     end
 
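
With this change the cache section comes pre-populated. Assuming BaseConfiguration's method_missing exposes readers for stored options (it is declared above but its body is not in this diff), reading the defaults back would look like:

    WebCrawler.config.cache.expire_within    #=> 60
    WebCrawler.config.cache.adapter          #=> whatever cache_adapter returns
    WebCrawler.config.logger                 # STDOUT logger, level taken from log_level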
data/lib/web_crawler/factory_url.rb
CHANGED
@@ -1,4 +1,13 @@
 module WebCrawler
+  #
+  # p = FactoryUrl.new "http://www.somehost.com/:second/:first/", :first => 0..10, :second => "a".."z"
+  # p.urls #=> ["http://www.somehost.com/a/1",
+  #             "http://www.somehost.com/b/1",
+  #             "http://www.somehost.com/c/1",
+  #             ...
+  #             "http://www.somehost.com/x/10",
+  #             "http://www.somehost.com/y/10",
+  #             "http://www.somehost.com/z/10/"]
   #
   # p = FactoryUrl.new "http://www.somehost.com/$1/$2?param=$3", 0..10, "a".."z", [3,7,34,876,92]
   # p.urls #=> ["http://www.somehost.com/1/a?param=3",
@@ -8,6 +17,7 @@ module WebCrawler
   #             "http://www.somehost.com/10/x?param=34",
   #             "http://www.somehost.com/10/y?param=876",
   #             "http://www.somehost.com/10/z?param=92"]
+  #
   # p = FactoryUrl.new 0..10, "a".."z", [3,7,34,876,92] do |first, second, third|
   #   "http://www.somehost.com/#{first}/#{second}?param=#{third}"
   # end
@@ -18,21 +28,31 @@ module WebCrawler
     attr_reader :urls, :params, :pattern
 
     def initialize(*args, &block)
+      @options = args.extract_options!
       if block_given?
         @block = block
       else
         @pattern = args.shift
         raise ArgumentError, "first argument must be an url pattern(String)" unless pattern.is_a? String
       end
-      @params = normalize_arguments(args)
+
+      if @options.empty?
+        @params = normalize_arguments(args)
+      else
+        values, keys = @options.values.map(&:to_a), @options.keys
+        values = values.shift.product(*values)
+        @params = values.map{|a| Hash[keys.zip(a)]}
+      end
     end
 
     def factory
-      if pattern
-
-
-
-
+      @urls ||= if pattern && params.first.is_a?(Hash)
+        params.map { |opts| pattern.gsub(/:([a-z_]+)/) { opts[$1.to_sym] } }
+      elsif pattern
+        params.map { |opts| pattern.gsub(/\$(\d+)/) { opts[$1.to_i - 1] } }
+      else
+        params.map { |opts| @block.call *opts }
+      end
    end
 
     def each
@@ -46,7 +66,7 @@ module WebCrawler
     def normalize_arguments(args)
       args = args.first if args.size == 1 && args.first.is_a?(Enumerable)
       args.shift if args.first.is_a? String
-      params
+      params = args.map { |arg| convert_to_a(arg) }
       @params = params.shift.product(*params)
     end
 
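
The new hash form takes the Cartesian product of the named option values and substitutes them into :name placeholders, so two categories times two pages yields four urls. A worked sketch (host and pattern illustrative):

    f = WebCrawler::FactoryUrl.new("http://example.com/cat_:category/page:page/",
                                   :category => [1, 2], :page => 1..2)
    f.factory
    #=> ["http://example.com/cat_1/page1/",
    #    "http://example.com/cat_1/page2/",
    #    "http://example.com/cat_2/page1/",
    #    "http://example.com/cat_2/page2/"]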
data/lib/web_crawler/follower.rb
CHANGED
@@ -1,26 +1,28 @@
 class WebCrawler::Follower
 
+  attr_reader :options
+
   def initialize(*responses)
-    @options = responses.
+    @options = responses.extract_options!
     @responses = responses.flatten
   end
 
-  def process(options = {})
+  def process(options = { })
     WebCrawler::BatchRequest.new(collect, options).process
   end
 
   def follow(response)
-    @responses
+    @responses += Array.wrap(response)
     self
   end
 
-  def collect
-    @responses.map do |response|
+  def collect(&block)
+    urls = @responses.map do |response|
       parser = WebCrawler::Parsers::Url.new(response.url.host, url: response.url.request_uri, same_host: @options[:same_host])
-      parser.parse(response.body)
-
-
-
+      parser.parse(response.body, &block)
+    end.flatten
+    urls = urls.select { |url| url =~ @options[:only] } if @options[:only]
+    urls
   end
 
 end