web_crawler 0.3.1 → 0.5.0

data/Gemfile CHANGED
@@ -2,6 +2,8 @@ source :gemcutter
 
  gem 'thor', '>=0.14.6'
  gem 'mime-types', '>=1.16'
+ gem 'parallel', '>=0.5.5'
+ gem 'activesupport'
 
  # Specify your gem's dependencies in web_crawler.gemspec
  gemspec
data/README CHANGED
@@ -1 +1,22 @@
- Web crawler helps you parse and collect data from the web
+ Web crawler helps you parse and collect data from the web
+
+ #TODO
+
+ A base web crawler class provides the crawling API.
+ It should work like this:
+
+ class MyCrawler < WebCrawler::Base
+
+   target "www.example.com"
+   target "www.example.com/page2"
+   target %w[www.example.com/contacts www.example.com/about]
+   target "www.example.com/category_:category/page:page/", :category => [1,2,3,4], :page => 1..100
+
+   target { call_advanced_logic_for_url_generating }
+
+   logger "path/to/log/file" # or Logger.new(...)
+
+
+
+
+ end
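
The TODO sketch above maps onto the WebCrawler::Base class added in this release (the new 113-line file further below). A minimal, hedged sketch of driving such a crawler, assuming MyCrawler is defined roughly as above:

    results = MyCrawler.run          # with no mappers defined, returns the raw page bodies (responses.map(&:to_s))
    json    = MyCrawler.run(:json)   # or render the collected data through WebCrawler::View
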
data/lib/web_crawler.rb CHANGED
@@ -6,6 +6,7 @@ require 'forwardable'
  require "ext/hash"
  require "ext/array"
  require "ext/http_response"
+ require "active_support/core_ext"
 
  module WebCrawler
    autoload :Request, 'web_crawler/request'
@@ -26,6 +27,7 @@ module WebCrawler
    autoload :View, 'web_crawler/view'
    autoload :CLI, 'web_crawler/cli'
    autoload :Application, 'web_crawler/application'
+   autoload :Base, 'web_crawler/base'
 
    include Configurable
    extend Utility
@@ -2,7 +2,35 @@ module WebCrawler
  class Application < CLI
 
    desc "test", "Test task"
+
    def test
+     urls = FactoryUrl.new('http://www.superjob.ru/rabota/554/veb-programmist/?from=$1', [[140]]).factory
+
+     logger.info "start requests with #{urls.join(' ')} in 4 processes"
+
+     targets = BatchRequest.new(urls).process
+
+     logger.info "#{targets.size} targets collected"
+
+     urls = Follower.new(targets, same_host: false).collect { |url| url =~ /vacancy\/\?id=\d+/ }
+
+     logger.info "#{urls.size} urls collected"
+     logger.info "start requests in 4 processes"
+
+     puts BatchRequest.new(urls).process.inspect
+
+     ""
+   end
+
+   desc "runner CLASS", "Run crawler class"
+   method_option :lib, type: :array, desc: "lib directories"
+   def runner(name)
+     $:.unshift './'
+     Array.wrap(@options[:lib]).each { |l| $:.unshift l }
+     require name.underscore
+
+     klass = name.classify.constantize
+     klass.run allow_format(:json, :yaml)
    end
 
    desc "get <URL...>", "Get pages from passed urls"
@@ -36,8 +64,7 @@ module WebCrawler
    def factory(pattern, *params)
      params.map! { |param| eval(param) }
      urls = FactoryUrl.new(pattern, params)
-     puts options.inspect
-     sep = options[:list] ? "\n" : ' '
+     sep = options[:list] ? "\n" : ' '
      if options[:output] || options[:list]
        puts urls.factory.map { |u| u.inspect }.join(sep).gsub('"', "'")
      else
@@ -45,5 +72,9 @@ module WebCrawler
      end
    end
 
+   protected
+   def allow_format(*allow)
+     allow.flatten.select { |f| f == @options[:format] }.first
+   end
  end
end
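
A hedged sketch of what the new runner task boils down to. The executable name and the --format option are assumptions here (only --lib is defined in this hunk); an invocation might look like `web_crawler runner MyCrawler --lib lib`:

    # Roughly what runner("MyCrawler") does (sketch, not the shipped code):
    $:.unshift './'                 # plus any --lib directories
    require "my_crawler"            # name.underscore
    MyCrawler.run(:json)            # name.classify.constantize, format narrowed by allow_format(:json, :yaml)
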
@@ -0,0 +1,113 @@
+ require "set"
+
+ module WebCrawler
+   class Base
+     class_attribute :targets, :logger, :mappers, :cache, :follower, :responses
+
+     self.targets, self.logger, self.mappers = Set.new, WebCrawler.config.logger, []
+
+     delegate :run, :to => :'self.class'
+
+     class << self
+
+       include ActiveSupport::Callbacks
+
+       def run(format=nil, format_options={ })
+         compile_targets
+         self.responses = WebCrawler::BatchRequest.new(targets.to_a).process
+         if format
+           formated(process(responses), format, format_options)
+         else
+           process(responses)
+         end
+       end
+
+       protected
+
+       def after(&block)
+         @after_callback = block
+       end
+
+       def compile_targets
+         following = targets.select { |target| target.is_a?(Array) && target.first.is_a?(Proc) }
+         self.targets = targets - following
+
+         following.each do |target|
+           target.first.call(target.last)
+         end
+       end
+
+       def log_to(logger_or_path)
+         case logger_or_path
+         when Logger
+           WebCrawler.config.logger = self.logger = logger_or_path
+         when nil
+           WebCrawler.config.logger = self.logger = Logger.new('/dev/null')
+         else
+           WebCrawler.config.logger = self.logger = Logger.new(logger_or_path)
+         end
+       end
+
+       def cache_to(path_or_cache_adapter)
+         adapter = nil
+         adapter = path_or_cache_adapter if path_or_cache_adapter.is_a? WebCrawler::CacheAdapter::Base
+         adapter = WebCrawler::CacheAdapter::File.new(path_or_cache_adapter) if File.directory? path_or_cache_adapter
+
+         WebCrawler.configure do
+           config.cache.adapter = adapter
+         end if adapter
+       end
+
+       def follow(*targets)
+         options = targets.extract_options!
+         responses = WebCrawler::BatchRequest.new(targets).process
+         self.target WebCrawler::Follower.new(responses, options).collect
+       end
+
+       def context(selector, name=selector, &block)
+         mapper = WebCrawler::Parsers::Mapper.new(name, self, selector)
+         if block.arity.zero?
+           mapper.instance_exec(&block)
+         else
+           mapper.callback(&block)
+         end
+         self.mappers += [mapper]
+       end
+
+       def target(*targets, &block)
+         options = targets.extract_options!
+         unless options.empty?
+           raise ArgumentError, 'target accept only one pattern if options given' if targets.size > 1
+           targets = generate_urls(targets.first, options)
+         end
+         if block_given?
+           self.targets << [block, targets]
+         else
+           self.targets += targets.flatten
+         end
+       end
+
+       def generate_urls(pattern, options)
+         WebCrawler::FactoryUrl.new(pattern, options).factory
+       end
+
+       def formated(data, format, options)
+         require "active_support/core_ext/string"
+         WebCrawler::View.factory(format, data, options).render
+       end
+
+       def process(responses)
+         return responses.map(&:to_s) if mappers.empty?
+
+         { }.tap do |results|
+           mappers.each do |mapper|
+             results[mapper.name] = responses.map do |response|
+               mapper.collect(response)
+             end.flatten
+           end
+         end
+       end
+     end
+
+   end
+ end
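
A hedged example of a crawler built on the new Base class. The class name, URLs and paths are placeholders, and the context/Parsers::Mapper side is omitted because Mapper is not shown in this diff:

    class JobsCrawler < WebCrawler::Base
      log_to   "crawler.log"      # or a Logger instance; nil silences logging
      cache_to "tmp/pages"        # an existing directory, wrapped in CacheAdapter::File

      target "http://www.example.com/jobs"
      target "http://www.example.com/jobs/page:page/", :page => 1..3   # expanded through FactoryUrl
    end

    JobsCrawler.run          # fetches all targets through BatchRequest; raw bodies when no mappers are defined
    JobsCrawler.run(:json)   # or render the result with WebCrawler::View
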
@@ -1,10 +1,15 @@
+ require "parallel"
+
  module WebCrawler
 
+   # Usage:
+   #   BatchRequest.new(urls).process #=> array of Responses
+   #
    class BatchRequest
 
      attr_reader :urls, :responses, :requests
      attr_writer :requests
-
+
      include Enumerable
 
      def initialize(*urls)
@@ -19,10 +24,11 @@ module WebCrawler
        if @handler
          block_given? ? yield(@handler.process) : @handler.process
        else
-         @responses ||= requests.map do |req|
+         ready = requests.select { |r| r.ready? }
+         @responses ||= Parallel.map(requests - ready) do |req|
            WebCrawler.logger.info "start request to #{req.url.to_s}"
            block_given? ? yield(req.process) : req.process
-         end
+         end.compact + ready.map(&:process)
        end
      end
 
@@ -57,7 +63,7 @@ module WebCrawler
      end
 
      def request_class
-       @options[:cached] ? CachedRequest : Request
+       !@options[:no_cached] && WebCrawler.config.cache.adapter.is_a?(WebCrawler::CacheAdapter::Base) ? CachedRequest : Request
      end
    end
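
A hedged usage sketch (the example.com URLs are placeholders): requests whose responses are already cached report ready? and are answered from the cache, while the remaining ones are fetched concurrently through the parallel gem:

    urls      = ["http://www.example.com/", "http://www.example.com/about"]
    responses = WebCrawler::BatchRequest.new(urls).process   # => array of Response objects
    # Passing no_cached: true presumably forces plain Request even when a cache adapter is configured:
    WebCrawler::BatchRequest.new(urls, no_cached: true).process
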
@@ -5,23 +5,32 @@ module WebCrawler
 
    def initialize(url, options = { })
      super(url)
-     @cache = options[:cache] || WebCrawler.config.cache_adapter
+     @cache = options[:cache] || WebCrawler.config.cache.adapter
+     @ready = true if @cache.exist? url
    end
 
    def process
-     cached do
+     @response || cached do
        Response.new *fetch(url)
      end
    end
 
    protected
 
+   def load_response
+     @response = @cache.get url
+   end
+
+   def put_response(response)
+     @response = @cache.put(response)
+   end
+
    def cached
-     @response = if @cache.exist? url
-       @cache.get url
-     else
-       @cache.put yield
-     end
+     if @cache.exist? url
+       load_response
+     else
+       put_response(yield)
+     end
      @response
    end
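
A hedged sketch of the cached path used directly; the adapter path is a placeholder and CacheAdapter::File is assumed to take a directory, as in Base.cache_to above:

    adapter = WebCrawler::CacheAdapter::File.new("tmp/pages")
    request = WebCrawler::CachedRequest.new("http://www.example.com/", cache: adapter)
    request.process   # a cold cache fetches and stores the response; a warm cache is served via load_response
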
@@ -3,8 +3,8 @@ require "logger"
  module WebCrawler
    class BaseConfiguration
 
-     def initialize(options = {})
-       @@options ||= {}
+     def initialize(options = { })
+       @@options ||= { }
        @@options.merge! options
      end
    end
@@ -15,7 +15,7 @@ module WebCrawler
    def config
      self
    end
-
+
    private
 
    def method_missing(name, *args, &blk)
@@ -48,7 +48,7 @@ module WebCrawler
    end
 
    def cache(&block)
-     @cache ||= BaseConfiguration.new expire_within: 60
+     @cache ||= BaseConfiguration.new(expire_within: 60, adapter: self.cache_adapter)
      if block_given?
        @cache.instance_eval(block)
      else
@@ -58,7 +58,7 @@ module WebCrawler
 
    def logger
      @logger ||= Logger.new(STDOUT).tap do |log|
-       log.level = Logger.const_get log_level.to_s.upcase
+       log.level = Logger.const_get log_level.to_s.upcase
      end
    end
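
With the cache sub-configuration now seeded from cache_adapter, a global setup might look like the hedged sketch below (the directory is a placeholder, and setting log_level through the configuration's method_missing is an assumption):

    WebCrawler.configure do
      config.log_level = :info
      config.cache.adapter = WebCrawler::CacheAdapter::File.new("tmp/pages")
    end

Once an adapter of kind CacheAdapter::Base is configured, BatchRequest#request_class above selects CachedRequest automatically.
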
@@ -1,4 +1,13 @@
  module WebCrawler
+   #
+   # p = FactoryUrl.new "http://www.somehost.com/:second/:first/", :first => 0..10, :second => "a".."z"
+   # p.urls #=> ["http://www.somehost.com/a/1",
+   # # "http://www.somehost.com/b/1",
+   # # "http://www.somehost.com/c/1",
+   # # ...
+   # # "http://www.somehost.com/x/10",
+   # # "http://www.somehost.com/y/10",
+   # # "http://www.somehost.com/z/10/"]
    #
    # p = FactoryUrl.new "http://www.somehost.com/$1/$2?param=$3", 0..10, "a".."z", [3,7,34,876,92]
    # p.urls #=> ["http://www.somehost.com/1/a?param=3",
@@ -8,6 +17,7 @@ module WebCrawler
    # # "http://www.somehost.com/10/x?param=34",
    # # "http://www.somehost.com/10/y?param=876",
    # # "http://www.somehost.com/10/z?param=92"]
+   #
    # p = FactoryUrl.new 0..10, "a".."z", [3,7,34,876,92] do |first, second, third|
    #   "http://www.somehost.com/#{first}/#{second}?param=#{third}"
    # end
@@ -18,21 +28,31 @@ module WebCrawler
    attr_reader :urls, :params, :pattern
 
    def initialize(*args, &block)
+     @options = args.extract_options!
      if block_given?
        @block = block
      else
        @pattern = args.shift
        raise ArgumentError, "first argument must be an url pattern(String)" unless pattern.is_a? String
      end
-     @params = normalize_arguments(args)
+
+     if @options.empty?
+       @params = normalize_arguments(args)
+     else
+       values, keys = @options.values.map(&:to_a), @options.keys
+       values = values.shift.product(*values)
+       @params = values.map { |a| Hash[keys.zip(a)] }
+     end
    end
 
    def factory
-     if pattern
-       @urls ||= params.map { |opts| pattern.gsub(/\$(\d+)/) { opts[$1.to_i - 1] } }
-     else
-       @urls ||= params.map { |opts| @block.call *opts }
-     end
+     @urls ||= if pattern && params.first.is_a?(Hash)
+       params.map { |opts| pattern.gsub(/:([a-z_]+)/) { opts[$1.to_sym] } }
+     elsif pattern
+       params.map { |opts| pattern.gsub(/\$(\d+)/) { opts[$1.to_i - 1] } }
+     else
+       params.map { |opts| @block.call *opts }
+     end
    end
 
    def each
@@ -46,7 +66,7 @@ module WebCrawler
    def normalize_arguments(args)
      args = args.first if args.size == 1 && args.first.is_a?(Enumerable)
      args.shift if args.first.is_a? String
-     params = args.map { |arg| convert_to_a(arg) }
+     params = args.map { |arg| convert_to_a(arg) }
      @params = params.shift.product(*params)
    end
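
The new named-placeholder form can be exercised with a short sketch (example.com is a placeholder): each :name in the pattern is substituted from the keyword options, whose values are expanded as a cartesian product:

    factory = WebCrawler::FactoryUrl.new("http://www.example.com/category_:category/page:page/",
                                         :category => 1..2, :page => 1..2)
    factory.factory
    # => ["http://www.example.com/category_1/page1/",
    #     "http://www.example.com/category_1/page2/",
    #     "http://www.example.com/category_2/page1/",
    #     "http://www.example.com/category_2/page2/"]
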
@@ -1,26 +1,28 @@
  class WebCrawler::Follower
 
+   attr_reader :options
+
    def initialize(*responses)
-     @options = responses.last.is_a?(Hash) ? responses.pop : {}
+     @options = responses.extract_options!
      @responses = responses.flatten
    end
 
-   def process(options = {})
+   def process(options = { })
      WebCrawler::BatchRequest.new(collect, options).process
    end
 
    def follow(response)
-     @responses << response
+     @responses += Array.wrap(response)
      self
    end
 
-   def collect
-     @responses.map do |response|
+   def collect(&block)
+     urls = @responses.map do |response|
        parser = WebCrawler::Parsers::Url.new(response.url.host, url: response.url.request_uri, same_host: @options[:same_host])
-       parser.parse(response.body) do |url|
-         url
-       end
-     end
+       parser.parse(response.body, &block)
+     end.flatten
+     urls = urls.select { |url| url =~ @options[:only] } if @options[:only]
+     urls
    end
 
  end
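
A hedged sketch of the reworked Follower (example.com is a placeholder): links are extracted from each response, optionally restricted to the same host and filtered by the :only regexp, and a block is forwarded to the URL parser:

    responses = WebCrawler::BatchRequest.new("http://www.example.com/").process
    follower  = WebCrawler::Follower.new(responses, same_host: true, only: %r{/articles/})
    follower.collect   # => array of matching URLs found on the fetched pages
    follower.process   # or immediately fetch the followed pages as a new BatchRequest
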
@@ -2,4 +2,5 @@ require "hpricot"
 
  module WebCrawler::Parsers
    autoload :Url, 'web_crawler/parsers/url'
+   autoload :Mapper, 'web_crawler/parsers/mapper'
  end