web_crawler 0.3.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -2,6 +2,8 @@ source :gemcutter
 
 gem 'thor', '>=0.14.6'
 gem 'mime-types', '>=1.16'
+gem 'parallel', '>=0.5.5'
+gem 'activesupport'
 
 # Specify your gem's dependencies in web_crawler.gemspec
 gemspec
data/README CHANGED
@@ -1 +1,22 @@
-Web crawler help you with parse and collect data from the web
+Web crawler help you with parse and collect data from the web
+
+#TODO
+
+Base web crawler class for API present
+Its showld work like this:
+
+class MyCrawler < WebCrawler::Base
+
+  target "www.example.com"
+  target "www.example.com/page2"
+  target %[www.example.com/contacts www.example.com/about]
+  target "www.example.com/category_:category/page:page/", :categories => [1,2,3,4], :page => 1..100
+
+  target { call_advanced_logic_for_url_generating }
+
+  logger "path/to/log/file" # or Logger.new(...)
+
+
+
+
+end
data/lib/web_crawler.rb CHANGED
@@ -6,6 +6,7 @@ require 'forwardable'
 require "ext/hash"
 require "ext/array"
 require "ext/http_response"
+require "active_support/core_ext"
 
 module WebCrawler
   autoload :Request, 'web_crawler/request'
@@ -26,6 +27,7 @@ module WebCrawler
   autoload :View, 'web_crawler/view'
   autoload :CLI, 'web_crawler/cli'
   autoload :Application, 'web_crawler/application'
+  autoload :Base, 'web_crawler/base'
 
   include Configurable
   extend Utility
@@ -2,7 +2,35 @@ module WebCrawler
   class Application < CLI
 
     desc "test", "Test task"
+
     def test
+      urls = FactoryUrl.new('http://www.superjob.ru/rabota/554/veb-programmist/?from=$1', [[140]]).factory
+
+      logger.info "start requests with #{urls.join(' ')} in 4 processes"
+
+      targets = BatchRequest.new(urls).process
+
+      logger.info "#{targets.size} targets collected"
+
+      urls = Follower.new(targets, same_host: false).collect { |url| url =~ /vacancy\/\?id=\d+/ }
+
+      logger.info "#{urls.size} urls collected"
+      logger.info "start requests with in 4 processes"
+
+      puts BatchRequest.new(urls).process.inspect
+
+      ""
+    end
+
+    desc "runner CLASS", "Run crawler class"
+    method_option :lib, type: :array, desc: "lib directories"
+    def runner(name)
+      $:.unshift './'
+      Array.wrap(@options[:lib]).each { |l| $:.unshift l }
+      require name.underscore
+
+      klass = name.classify.constantize
+      klass.run allow_format(:json, :yaml)
     end
 
     desc "get <URL...>", "Get pages from passed urls"
@@ -36,8 +64,7 @@ module WebCrawler
     def factory(pattern, *params)
       params.map! { |param| eval(param) }
       urls = FactoryUrl.new(pattern, params)
-      puts options.inspect
-      sep = options[:list] ? "\n" : ' '
+      sep = options[:list] ? "\n" : ' '
       if options[:output] || options[:list]
         puts urls.factory.map { |u| u.inspect }.join(sep).gsub('"', "'")
       else
@@ -45,5 +72,9 @@ module WebCrawler
       end
     end
 
+    protected
+    def allow_format(*allow)
+      allow.flatten.select { |f| f == @options[:format] }.first
+    end
   end
 end
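Note: the new runner task turns a crawler class into a CLI entry point. It prepends ./ and any --lib directories to the load path, requires the underscored file name, constantizes the class and calls its run method with whichever of :json/:yaml matches the :format option. Roughly, for a hypothetical MyCrawler defined in ./lib/my_crawler.rb, the task boils down to:

    # illustrative sketch of what `runner MyCrawler --lib lib` ends up doing
    $:.unshift './'
    $:.unshift 'lib'                 # from --lib
    require 'my_crawler'             # "MyCrawler".underscore
    MyCrawler.run(:json)             # "MyCrawler".classify.constantize, then run with the allowed format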
@@ -0,0 +1,113 @@
+require "set"
+
+module WebCrawler
+  class Base
+    class_attribute :targets, :logger, :mappers, :cache, :follower, :responses
+
+    self.targets, self.logger, self.mappers = Set.new, WebCrawler.config.logger, []
+
+    delegate :run, :to => :'self.class'
+
+    class << self
+
+      include ActiveSupport::Callbacks
+
+      def run(format=nil, format_options={ })
+        compile_targets
+        self.responses = WebCrawler::BatchRequest.new(targets.to_a).process
+        if format
+          formated(process(responses), format, format_options)
+        else
+          process(responses)
+        end
+      end
+
+      protected
+
+      def after(&block)
+        @after_callback = block
+      end
+
+      def compile_targets
+        following = targets.select { |target| target.is_a?(Array) && target.first.is_a?(Proc) }
+        self.targets = targets - following
+
+        following.each do |target|
+          target.first.call(target.last)
+        end
+      end
+
+      def log_to(logger_or_path)
+        case logger_or_path
+        when Logger
+          WebCrawler.config.logger = self.logger = logger_or_path
+        when nil
+          WebCrawler.config.logger = self.logger = Logger.new('/dev/null')
+        else
+          WebCrawler.config.logger = self.logger = Logger.new(logger_or_path)
+        end
+      end
+
+      def cache_to(path_or_cache_adapter)
+        adapter = nil
+        adapter = path_or_cache_adapter if path_or_cache_adapter.is_a? WebCrawler::CacheAdapter::Base
+        adapter = WebCrawler::CacheAdapter::File.new(path_or_cache_adapter) if File.directory? path_or_cache_adapter
+
+        WebCrawler.configure do
+          config.cache.adapter = adapter
+        end if adapter
+      end
+
+      def follow(*targets)
+        options = targets.extract_options!
+        responses = WebCrawler::BatchRequest.new(targets).process
+        self.target WebCrawler::Follower.new(responses, options).collect
+      end
+
+      def context(selector, name=selector, &block)
+        mapper = WebCrawler::Parsers::Mapper.new(name, self, selector)
+        if block.arity.zero?
+          mapper.instance_exec(&block)
+        else
+          mapper.callback(&block)
+        end
+        self.mappers += [mapper]
+      end
+
+      def target(*targets, &block)
+        options = targets.extract_options!
+        unless options.empty?
+          raise ArgumentError, 'target accept only one pattern if options given' if targets.size > 1
+          targets = generate_urls(targets.first, options)
+        end
+        if block_given?
+          self.targets << [block, targets]
+        else
+          self.targets += targets.flatten
+        end
+      end
+
+      def generate_urls(pattern, options)
+        WebCrawler::FactoryUrl.new(pattern, options).factory
+      end
+
+      def formated(data, format, options)
+        require "active_support/core_ext/string"
+        WebCrawler::View.factory(format, data, options).render
+      end
+
+      def process(responses)
+        return responses.map(&:to_s) if mappers.empty?
+
+        { }.tap do |results|
+          mappers.each do |mapper|
+            results[mapper.name] = responses.map do |response|
+              mapper.collect(response)
+            end.flatten
+          end
+        end
+      end
+    end
+
+  end
+end
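Note: the class methods above form a small declarative DSL on the new WebCrawler::Base: target accumulates URLs (expanding :placeholder patterns through FactoryUrl when options are given), log_to and cache_to adjust the global configuration, context registers Parsers::Mapper instances, and run compiles the targets, fetches them with BatchRequest and optionally renders the result through WebCrawler::View. A minimal subclass sketch, using placeholder example.com URLs and a hypothetical log file path:

    require 'web_crawler'

    class ExampleCrawler < WebCrawler::Base
      log_to 'crawler.log'                                        # path gets wrapped in a Logger
      target 'http://www.example.com/catalog'                     # literal target
      target 'http://www.example.com/page:page/', :page => 1..3   # expanded via FactoryUrl
    end

    # run without a format returns the processed responses;
    # ExampleCrawler.run(:json) would render them with WebCrawler::View
    puts ExampleCrawler.run.inspect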
@@ -1,10 +1,15 @@
+require "parallel"
+
 module WebCrawler
 
+  # Usage:
+  #   BatchRequest.new(urls).process #=> array of Responses
+  #
   class BatchRequest
 
     attr_reader :urls, :responses, :requests
     attr_writer :requests
-
+
     include Enumerable
 
     def initialize(*urls)
@@ -19,10 +24,11 @@ module WebCrawler
       if @handler
         block_given? ? yield(@handler.process) : @handler.process
       else
-        @responses ||= requests.map do |req|
+        ready = requests.select{|r| r.ready? }
+        @responses ||= Parallel.map(requests - ready) do |req|
           WebCrawler.logger.info "start request to #{req.url.to_s}"
           block_given? ? yield(req.process) : req.process
-        end
+        end.compact + ready.map(&:process)
       end
     end
 
@@ -57,7 +63,7 @@ module WebCrawler
     end
 
     def request_class
-      @options[:cached] ? CachedRequest : Request
+      !@options[:no_cached] && WebCrawler.config.cache.adapter.is_a?(WebCrawler::CacheAdapter::Base) ? CachedRequest : Request
    end
  end
 
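Note: process now short-circuits requests that are already ready? (for example, already cached) and fans the remaining ones out through Parallel.map from the newly added parallel gem. Basic usage stays as the new comment describes; a sketch with placeholder URLs:

    require 'web_crawler'

    urls      = ['http://www.example.com/', 'http://www.example.com/about']
    responses = WebCrawler::BatchRequest.new(urls).process   # => array of Response objects
    puts responses.map(&:to_s)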
@@ -5,23 +5,32 @@ module WebCrawler
 
     def initialize(url, options = { })
       super(url)
-      @cache = options[:cache] || WebCrawler.config.cache_adapter
+      @cache = options[:cache] || WebCrawler.config.cache.adapter
+      @ready = true if @cache.exist? url
     end
 
     def process
-      cached do
+      @response || cached do
         Response.new *fetch(url)
       end
     end
 
     protected
 
+    def load_response
+      @response = @cache.get url
+    end
+
+    def put_response(response)
+      @response = @cache.put(response)
+    end
+
     def cached
-      @response = if @cache.exist? url
-        @cache.get url
-      else
-        @cache.put yield
-      end
+      if @cache.exist? url
+        load_response
+      else
+        put_response(yield)
+      end
       @response
     end
 
@@ -3,8 +3,8 @@ require "logger"
 module WebCrawler
   class BaseConfiguration
 
-    def initialize(options = {})
-      @@options ||= {}
+    def initialize(options = { })
+      @@options ||= { }
       @@options.merge! options
     end
 
@@ -15,7 +15,7 @@ module WebCrawler
     def config
       self
     end
-
+
     private
 
     def method_missing(name, *args, &blk)
@@ -48,7 +48,7 @@ module WebCrawler
     end
 
     def cache(&block)
-      @cache ||= BaseConfiguration.new expire_within: 60
+      @cache ||= BaseConfiguration.new(expire_within: 60, adapter: self.cache_adapter)
       if block_given?
         @cache.instance_eval(block)
       else
@@ -58,7 +58,7 @@ module WebCrawler
     end
     def logger
       @logger ||= Logger.new(STDOUT).tap do |log|
-        log.level = Logger.const_get log_level.to_s.upcase
+        log.level = Logger.const_get log_level.to_s.upcase
       end
     end
 
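Note: the cache configuration now exposes the adapter directly (config.cache.adapter), and BatchRequest#request_class switches to CachedRequest whenever that adapter is a CacheAdapter::Base. A sketch of enabling the file-based cache, assuming ./tmp/cache is an existing directory:

    require 'web_crawler'

    WebCrawler.configure do
      config.cache.adapter = WebCrawler::CacheAdapter::File.new('./tmp/cache')
    end

    # later batches now reuse cached responses instead of refetching;
    # CachedRequest marks a request as ready when its URL is already in the cache
    WebCrawler::BatchRequest.new(['http://www.example.com/']).process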
@@ -1,4 +1,13 @@
 module WebCrawler
+  #
+  # p = FactoryUrl.new "http://www.somehost.com/:second/:first/", :first => 0..10, :second => "a".."z"
+  # p.urls #=> ["http://www.somehost.com/a/1",
+  #            # "http://www.somehost.com/b/1",
+  #            # "http://www.somehost.com/c/1",
+  #            # ...
+  #            # "http://www.somehost.com/x/10",
+  #            # "http://www.somehost.com/y/10",
+  #            # "http://www.somehost.com/z/10/"]
   #
   # p = FactoryUrl.new "http://www.somehost.com/$1/$2?param=$3", 0..10, "a".."z", [3,7,34,876,92]
   # p.urls #=> ["http://www.somehost.com/1/a?param=3",
@@ -8,6 +17,7 @@ module WebCrawler
   #            # "http://www.somehost.com/10/x?param=34",
   #            # "http://www.somehost.com/10/y?param=876",
   #            # "http://www.somehost.com/10/z?param=92"]
+  #
   # p = FactoryUrl.new 0..10, "a".."z", [3,7,34,876,92] do |first, second, third|
   #   "http://www.somehost.com/#{first}/#{second}?param=#{third}"
   # end
@@ -18,21 +28,31 @@ module WebCrawler
     attr_reader :urls, :params, :pattern
 
     def initialize(*args, &block)
+      @options = args.extract_options!
       if block_given?
         @block = block
       else
         @pattern = args.shift
         raise ArgumentError, "first argument must be an url pattern(String)" unless pattern.is_a? String
       end
-      @params = normalize_arguments(args)
+
+      if @options.empty?
+        @params = normalize_arguments(args)
+      else
+        values, keys = @options.values.map(&:to_a), @options.keys
+        values = values.shift.product(*values)
+        @params = values.map{|a| Hash[keys.zip(a)]}
+      end
     end
 
     def factory
-      if pattern
-        @urls ||= params.map { |opts| pattern.gsub(/\$(\d+)/) { opts[$1.to_i - 1] } }
-      else
-        @urls ||= params.map { |opts| @block.call *opts }
-      end
+      @urls ||= if pattern && params.first.is_a?(Hash)
+        params.map { |opts| pattern.gsub(/:([a-z_]+)/) { opts[$1.to_sym] } }
+      elsif pattern
+        params.map { |opts| pattern.gsub(/\$(\d+)/) { opts[$1.to_i - 1] } }
+      else
+        params.map { |opts| @block.call *opts }
+      end
     end
 
     def each
@@ -46,7 +66,7 @@ module WebCrawler
     def normalize_arguments(args)
       args = args.first if args.size == 1 && args.first.is_a?(Enumerable)
       args.shift if args.first.is_a? String
-      params = args.map { |arg| convert_to_a(arg) }
+      params = args.map { |arg| convert_to_a(arg) }
       @params = params.shift.product(*params)
     end
 
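Note: in addition to the positional $1/$2 placeholders, FactoryUrl now expands named :placeholder segments from an options hash (this is what Base#target uses under the hood). A short sketch with a made-up pattern:

    require 'web_crawler'

    factory = WebCrawler::FactoryUrl.new('http://www.example.com/category_:category/page:page/',
                                         :category => [1, 2], :page => 1..3)
    factory.factory
    # => ["http://www.example.com/category_1/page1/",
    #     "http://www.example.com/category_1/page2/",
    #     ...
    #     "http://www.example.com/category_2/page3/"]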
@@ -1,26 +1,28 @@
 class WebCrawler::Follower
 
+  attr_reader :options
+
   def initialize(*responses)
-    @options = responses.last.is_a?(Hash) ? responses.pop : {}
+    @options = responses.extract_options!
     @responses = responses.flatten
   end
 
-  def process(options = {})
+  def process(options = { })
     WebCrawler::BatchRequest.new(collect, options).process
   end
 
   def follow(response)
-    @responses << response
+    @responses += Array.wrap(response)
     self
   end
 
-  def collect
-    @responses.map do |response|
+  def collect(&block)
+    urls = @responses.map do |response|
       parser = WebCrawler::Parsers::Url.new(response.url.host, url: response.url.request_uri, same_host: @options[:same_host])
-      parser.parse(response.body) do |url|
-        url
-      end
-    end
+      parser.parse(response.body, &block)
+    end.flatten
+    urls = urls.select { |url| url =~ @options[:only] } if @options[:only]
+    urls
   end
 
 end
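Note: Follower#collect now flattens the parsed links, forwards an optional block to the URL parser, and can filter the result with an :only regexp. A minimal sketch, assuming the responses come from an earlier BatchRequest and the article pattern is purely illustrative:

    require 'web_crawler'

    responses = WebCrawler::BatchRequest.new(['http://www.example.com/']).process
    follower  = WebCrawler::Follower.new(responses, same_host: true, only: /\/articles\/\d+/)
    puts follower.collect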
@@ -2,4 +2,5 @@ require "hpricot"
 
 module WebCrawler::Parsers
   autoload :Url, 'web_crawler/parsers/url'
+  autoload :Mapper, 'web_crawler/parsers/mapper'
 end