web_crawler 0.3.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +2 -0
- data/README +22 -1
- data/lib/web_crawler.rb +2 -0
- data/lib/web_crawler/application.rb +33 -2
- data/lib/web_crawler/base.rb +113 -0
- data/lib/web_crawler/batch_request.rb +10 -4
- data/lib/web_crawler/cached_request.rb +16 -7
- data/lib/web_crawler/configuration.rb +5 -5
- data/lib/web_crawler/factory_url.rb +27 -7
- data/lib/web_crawler/follower.rb +11 -9
- data/lib/web_crawler/parsers.rb +1 -0
- data/lib/web_crawler/parsers/mapper.rb +114 -0
- data/lib/web_crawler/parsers/url.rb +3 -5
- data/lib/web_crawler/request.rb +14 -2
- data/lib/web_crawler/response.rb +2 -2
- data/lib/web_crawler/version.rb +2 -2
- data/lib/web_crawler/view.rb +1 -1
- data/lib/web_crawler/view/csv.rb +1 -1
- data/lib/web_crawler/view/json.rb +1 -1
- data/lib/web_crawler/view/yaml.rb +1 -1
- data/spec/fixtures/example.xml +171 -0
- data/spec/fixtures/my_crawler.rb +82 -0
- data/spec/fixtures/test_crawler.rb +108 -0
- data/spec/fixtures/test_crawler2.rb +77 -0
- data/spec/spec_helper.rb +8 -3
- data/spec/web_crawler/batch_request_spec.rb +0 -11
- data/spec/web_crawler/cached_request_spec.rb +17 -11
- data/spec/web_crawler/factory_url_spec.rb +19 -6
- data/spec/web_crawler/follow_spec.rb +11 -4
- data/spec/web_crawler/view_spec.rb +10 -10
- data/spec/web_crawler/web_crawler_api_base_class_spec.rb +143 -0
- data/web_crawler.gemspec +2 -0
- metadata +43 -8
data/Gemfile
CHANGED
data/README
CHANGED
@@ -1 +1,22 @@
-Web crawler help you with parse and collect data from the web
+Web crawler help you with parse and collect data from the web
+
+#TODO
+
+Base web crawler class for API present
+Its showld work like this:
+
+  class MyCrawler < WebCrawler::Base
+
+    target "www.example.com"
+    target "www.example.com/page2"
+    target %[www.example.com/contacts www.example.com/about]
+    target "www.example.com/category_:category/page:page/", :categories => [1,2,3,4], :page => 1..100
+
+    target { call_advanced_logic_for_url_generating }
+
+    logger "path/to/log/file" # or Logger.new(...)
+
+
+
+
+  end
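The README above is the author's TODO sketch; the 0.5.0 release already ships most of it in data/lib/web_crawler/base.rb (shown later in this diff), with log_to taking the place of the sketched logger macro. A minimal sketch against the released API, where the host and log path are placeholders:

    require 'web_crawler'

    class MyCrawler < WebCrawler::Base
      # plain targets plus a named-parameter pattern expanded through FactoryUrl
      target "http://www.example.com/"
      target "http://www.example.com/page:page/", :page => 1..3

      log_to "crawler.log"
    end

    MyCrawler.run          #=> responses.map(&:to_s) when no context mappers are defined
    MyCrawler.run(:json)   #=> the same data rendered through WebCrawler::View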
data/lib/web_crawler.rb
CHANGED
@@ -6,6 +6,7 @@ require 'forwardable'
 require "ext/hash"
 require "ext/array"
 require "ext/http_response"
+require "active_support/core_ext"
 
 module WebCrawler
   autoload :Request, 'web_crawler/request'
@@ -26,6 +27,7 @@ module WebCrawler
   autoload :View, 'web_crawler/view'
   autoload :CLI, 'web_crawler/cli'
   autoload :Application, 'web_crawler/application'
+  autoload :Base, 'web_crawler/base'
 
   include Configurable
   extend Utility
data/lib/web_crawler/application.rb
CHANGED
@@ -2,7 +2,35 @@ module WebCrawler
   class Application < CLI
 
     desc "test", "Test task"
+
     def test
+      urls = FactoryUrl.new('http://www.superjob.ru/rabota/554/veb-programmist/?from=$1', [[140]]).factory
+
+      logger.info "start requests with #{urls.join(' ')} in 4 processes"
+
+      targets = BatchRequest.new(urls).process
+
+      logger.info "#{targets.size} targets collected"
+
+      urls = Follower.new(targets, same_host: false).collect { |url| url =~ /vacancy\/\?id=\d+/ }
+
+      logger.info "#{urls.size} urls collected"
+      logger.info "start requests with in 4 processes"
+
+      puts BatchRequest.new(urls).process.inspect
+
+      ""
+    end
+
+    desc "runner CLASS", "Run crawler class"
+    method_option :lib, type: :array, desc: "lib directories"
+    def runner(name)
+      $:.unshift './'
+      Array.wrap(@options[:lib]).each { |l| $:.unshift l }
+      require name.underscore
+
+      klass = name.classify.constantize
+      klass.run allow_format(:json, :yaml)
     end
 
     desc "get <URL...>", "Get pages from passed urls"
@@ -36,8 +64,7 @@ module WebCrawler
     def factory(pattern, *params)
       params.map! { |param| eval(param) }
       urls = FactoryUrl.new(pattern, params)
-
-      sep = options[:list] ? "\n" : ' '
+      sep = options[:list] ? "\n" : ' '
       if options[:output] || options[:list]
         puts urls.factory.map { |u| u.inspect }.join(sep).gsub('"', "'")
       else
@@ -45,5 +72,9 @@ module WebCrawler
       end
     end
 
+    protected
+    def allow_format(*allow)
+      allow.flatten.select { |f| f == @options[:format] }.first
+    end
   end
 end
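The new runner task leans on the ActiveSupport inflections pulled in by the active_support/core_ext require above. A sketch of the resolution it performs, assuming a my_crawler.rb sitting on the load path:

    require 'active_support/core_ext'

    name = "MyCrawler"
    name.underscore              #=> "my_crawler" (required from './' or a --lib directory)
    name.classify.constantize    #=> MyCrawler, the class whose .run is then invoked
    # run receives whichever of :json/:yaml matches --format, via allow_format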
data/lib/web_crawler/base.rb
ADDED
@@ -0,0 +1,113 @@
+require "set"
+
+module WebCrawler
+  class Base
+    class_attribute :targets, :logger, :mappers, :cache, :follower, :responses
+
+    self.targets, self.logger, self.mappers = Set.new, WebCrawler.config.logger, []
+
+    delegate :run, :to => :'self.class'
+
+    class << self
+
+      include ActiveSupport::Callbacks
+
+      def run(format=nil, format_options={ })
+        compile_targets
+        self.responses = WebCrawler::BatchRequest.new(targets.to_a).process
+        if format
+          formated(process(responses), format, format_options)
+        else
+          process(responses)
+        end
+      end
+
+      protected
+
+      def after(&block)
+        @after_callback = block
+      end
+
+      def compile_targets
+        following = targets.select { |target| target.is_a?(Array) && target.first.is_a?(Proc) }
+        self.targets = targets - following
+
+        following.each do |target|
+          target.first.call(target.last)
+        end
+      end
+
+      def log_to(logger_or_path)
+        case logger_or_path
+        when Logger
+          WebCrawler.config.logger = self.logger = logger_or_path
+        when nil
+          WebCrawler.config.logger = self.logger = Logger.new('/dev/null')
+        else
+          WebCrawler.config.logger = self.logger = Logger.new(logger_or_path)
+        end
+      end
+
+      def cache_to(path_or_cache_adapter)
+        adapter = nil
+        adapter = path_or_cache_adapter if path_or_cache_adapter.is_a? WebCrawler::CacheAdapter::Base
+        adapter = WebCrawler::CacheAdapter::File.new(path_or_cache_adapter) if File.directory? path_or_cache_adapter
+
+        WebCrawler.configure do
+          config.cache.adapter = adapter
+        end if adapter
+      end
+
+      def follow(*targets)
+        options = targets.extract_options!
+        responses = WebCrawler::BatchRequest.new(targets).process
+        self.target WebCrawler::Follower.new(responses, options).collect
+      end
+
+      def context(selector, name=selector, &block)
+        mapper = WebCrawler::Parsers::Mapper.new(name, self, selector)
+        if block.arity.zero?
+          mapper.instance_exec(&block)
+        else
+          mapper.callback(&block)
+        end
+        self.mappers += [mapper]
+      end
+
+      def target(*targets, &block)
+        options = targets.extract_options!
+        unless options.empty?
+          raise ArgumentError, 'target accept only one pattern if options given' if targets.size > 1
+          targets = generate_urls(targets.first, options)
+        end
+        if block_given?
+          self.targets << [block, targets]
+        else
+          self.targets += targets.flatten
+        end
+      end
+
+      def generate_urls(pattern, options)
+        WebCrawler::FactoryUrl.new(pattern, options).factory
+      end
+
+      def formated(data, format, options)
+        require "active_support/core_ext/string"
+        WebCrawler::View.factory(format, data, options).render
+      end
+
+      def process(responses)
+        return responses.map(&:to_s) if mappers.empty?
+
+        { }.tap do |results|
+          mappers.each do |mapper|
+            results[mapper.name] = responses.map do |response|
+              mapper.collect(response)
+            end.flatten
+          end
+        end
+      end
+    end
+
+  end
+end
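One subtlety in the class above: target with a block does not produce URLs at class-definition time. The [block, targets] pair is parked in self.targets and only executed by compile_targets at the start of run. A sketch of what that defers, with a hypothetical late-bound URL:

    class DeferredCrawler < WebCrawler::Base
      target do
        # invoked from compile_targets when DeferredCrawler.run is called,
        # so the URL list can be computed at crawl time rather than load time
        target "http://www.example.com/report-#{Time.now.to_i}"
      end
    end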
data/lib/web_crawler/batch_request.rb
CHANGED
@@ -1,10 +1,15 @@
+require "parallel"
+
 module WebCrawler
 
+  # Usage:
+  #   BatchRequest.new(urls).process #=> array of Responses
+  #
   class BatchRequest
 
     attr_reader :urls, :responses, :requests
     attr_writer :requests
-
+
     include Enumerable
 
     def initialize(*urls)
@@ -19,10 +24,11 @@ module WebCrawler
       if @handler
         block_given? ? yield(@handler.process) : @handler.process
       else
-
+        ready = requests.select{|r| r.ready? }
+        @responses ||= Parallel.map(requests - ready) do |req|
           WebCrawler.logger.info "start request to #{req.url.to_s}"
           block_given? ? yield(req.process) : req.process
-        end
+        end.compact + ready.map(&:process)
       end
     end
 
@@ -57,7 +63,7 @@ module WebCrawler
     end
 
     def request_class
-
+      !@options[:no_cached] && WebCrawler.config.cache.adapter.is_a?(WebCrawler::CacheAdapter::Base) ? CachedRequest : Request
     end
   end
 
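With the parallel dependency in place, process now fans the not-yet-ready requests out via Parallel.map, while requests that already report ready? (e.g. satisfied from cache) are processed inline and appended. A sketch of standalone use, with placeholder URLs:

    require 'web_crawler'

    responses = WebCrawler::BatchRequest.new(%w[
      http://www.example.com/a
      http://www.example.com/b
    ]).process                         #=> array of WebCrawler::Response

    responses.map { |r| r.url.to_s }   # Response exposes #url (see follower.rb below)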
data/lib/web_crawler/cached_request.rb
CHANGED
@@ -5,23 +5,32 @@ module WebCrawler
 
   def initialize(url, options = { })
     super(url)
-    @cache = options[:cache] || WebCrawler.config.
+    @cache = options[:cache] || WebCrawler.config.cache.adapter
+    @ready = true if @cache.exist? url
   end
 
   def process
-    cached do
+    @response || cached do
       Response.new *fetch(url)
     end
   end
 
   protected
 
+  def load_response
+    @response = @cache.get url
+  end
+
+  def put_response(response)
+    @response = @cache.put(response)
+  end
+
   def cached
-
-
-
-
-
+    if @cache.exist? url
+      load_response
+    else
+      put_response(yield)
+    end
     @response
   end
 
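The reworked cached turns the adapter into a read-through cache: a hit goes through load_response (@cache.get), a miss fetches and stores via put_response (@cache.put). Roughly, reusing the file adapter that cache_to in base.rb wires up, with a placeholder directory:

    WebCrawler.configure do
      config.cache.adapter = WebCrawler::CacheAdapter::File.new('tmp/cache')
    end

    req = WebCrawler::CachedRequest.new('http://www.example.com/')
    req.process   # miss: fetches, caches, memoizes @response
    req.process   # hit: short-circuits on @response without touching the network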
@@ -3,8 +3,8 @@ require "logger"
|
|
3
3
|
module WebCrawler
|
4
4
|
class BaseConfiguration
|
5
5
|
|
6
|
-
def initialize(options = {})
|
7
|
-
@@options ||= {}
|
6
|
+
def initialize(options = { })
|
7
|
+
@@options ||= { }
|
8
8
|
@@options.merge! options
|
9
9
|
end
|
10
10
|
|
@@ -15,7 +15,7 @@ module WebCrawler
|
|
15
15
|
def config
|
16
16
|
self
|
17
17
|
end
|
18
|
-
|
18
|
+
|
19
19
|
private
|
20
20
|
|
21
21
|
def method_missing(name, *args, &blk)
|
@@ -48,7 +48,7 @@ module WebCrawler
|
|
48
48
|
end
|
49
49
|
|
50
50
|
def cache(&block)
|
51
|
-
@cache ||= BaseConfiguration.new
|
51
|
+
@cache ||= BaseConfiguration.new(expire_within: 60, adapter: self.cache_adapter)
|
52
52
|
if block_given?
|
53
53
|
@cache.instance_eval(block)
|
54
54
|
else
|
@@ -58,7 +58,7 @@ module WebCrawler
|
|
58
58
|
|
59
59
|
def logger
|
60
60
|
@logger ||= Logger.new(STDOUT).tap do |log|
|
61
|
-
|
61
|
+
log.level = Logger.const_get log_level.to_s.upcase
|
62
62
|
end
|
63
63
|
end
|
64
64
|
|
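cache is now seeded as a nested BaseConfiguration with expire_within: 60 and the default cache_adapter, and the logger level follows log_level. A configuration sketch — the writers are assumed to go through BaseConfiguration's method_missing (as base.rb's cache_to does with config.cache.adapter=), and :info is just an example level:

    WebCrawler.configure do
      config.log_level = :info          # becomes Logger::INFO via Logger.const_get
      config.cache.expire_within = 120  # override the 60-second default from this diff
    end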
data/lib/web_crawler/factory_url.rb
CHANGED
@@ -1,4 +1,13 @@
 module WebCrawler
+  #
+  # p = FactoryUrl.new "http://www.somehost.com/:second/:first/", :first => 0..10, :second => "a".."z"
+  # p.urls #=> ["http://www.somehost.com/a/1",
+  #        #    "http://www.somehost.com/b/1",
+  #        #    "http://www.somehost.com/c/1",
+  #        #    ...
+  #        #    "http://www.somehost.com/x/10",
+  #        #    "http://www.somehost.com/y/10",
+  #        #    "http://www.somehost.com/z/10/"]
   #
   # p = FactoryUrl.new "http://www.somehost.com/$1/$2?param=$3", 0..10, "a".."z", [3,7,34,876,92]
   # p.urls #=> ["http://www.somehost.com/1/a?param=3",
@@ -8,6 +17,7 @@ module WebCrawler
   # #   "http://www.somehost.com/10/x?param=34",
   # #   "http://www.somehost.com/10/y?param=876",
   # #   "http://www.somehost.com/10/z?param=92"]
+  #
   # p = FactoryUrl.new 0..10, "a".."z", [3,7,34,876,92] do |first, second, third|
   #   "http://www.somehost.com/#{first}/#{second}?param=#{third}"
   # end
@@ -18,21 +28,31 @@ module WebCrawler
     attr_reader :urls, :params, :pattern
 
     def initialize(*args, &block)
+      @options = args.extract_options!
       if block_given?
         @block = block
       else
         @pattern = args.shift
         raise ArgumentError, "first argument must be an url pattern(String)" unless pattern.is_a? String
       end
-
+
+      if @options.empty?
+        @params = normalize_arguments(args)
+      else
+        values, keys = @options.values.map(&:to_a), @options.keys
+        values = values.shift.product(*values)
+        @params = values.map{|a| Hash[keys.zip(a)]}
+      end
     end
 
     def factory
-      if pattern
-
-
-
+      @urls ||= if pattern && params.first.is_a?(Hash)
+        params.map { |opts| pattern.gsub(/:([a-z_]+)/) { opts[$1.to_sym] } }
+      elsif pattern
+        params.map { |opts| pattern.gsub(/\$(\d+)/) { opts[$1.to_i - 1] } }
+      else
+        params.map { |opts| @block.call *opts }
+      end
    end
 
    def each
@@ -46,7 +66,7 @@ module WebCrawler
    def normalize_arguments(args)
      args = args.first if args.size == 1 && args.first.is_a?(Enumerable)
      args.shift if args.first.is_a? String
-      params
+      params = args.map { |arg| convert_to_a(arg) }
      @params = params.shift.product(*params)
    end
 
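The named-parameter branch added above takes the Cartesian product of the option values and substitutes each :key into the pattern. A concrete run of the new form, against a placeholder host:

    f = WebCrawler::FactoryUrl.new("http://www.example.com/:section/page:page/",
                                   :section => %w[news blog], :page => 1..2)
    f.factory
    #=> ["http://www.example.com/news/page1/",
    #    "http://www.example.com/news/page2/",
    #    "http://www.example.com/blog/page1/",
    #    "http://www.example.com/blog/page2/"]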
data/lib/web_crawler/follower.rb
CHANGED
@@ -1,26 +1,28 @@
 class WebCrawler::Follower
 
+  attr_reader :options
+
   def initialize(*responses)
-    @options = responses.
+    @options = responses.extract_options!
     @responses = responses.flatten
   end
 
-  def process(options = {})
+  def process(options = { })
     WebCrawler::BatchRequest.new(collect, options).process
   end
 
   def follow(response)
-    @responses
+    @responses += Array.wrap(response)
     self
   end
 
-  def collect
-    @responses.map do |response|
+  def collect(&block)
+    urls = @responses.map do |response|
       parser = WebCrawler::Parsers::Url.new(response.url.host, url: response.url.request_uri, same_host: @options[:same_host])
-      parser.parse(response.body)
-
-
-
+      parser.parse(response.body, &block)
+    end.flatten
+    urls = urls.select { |url| url =~ @options[:only] } if @options[:only]
+    urls
   end
 
 end
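collect now accepts an optional filter block (forwarded to Parsers::Url#parse, whose exact block contract lives in parsers/url.rb and is not shown in this diff) plus an :only regexp applied to the collected URLs afterwards; the Application#test task above combines both ideas. A hedged sketch with placeholder URLs:

    responses = WebCrawler::BatchRequest.new('http://www.example.com/').process

    follower  = WebCrawler::Follower.new(responses, :same_host => true,
                                         :only     => /\/articles\//)
    follower.collect                        # followed links, filtered by :only
    follower.collect { |url| url =~ /\d+/ } # block handed through to the URL parser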