web_crawler 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. data/.gitignore +5 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +11 -0
  4. data/README +1 -0
  5. data/Rakefile +2 -0
  6. data/bin/wcrawler +13 -0
  7. data/lib/ext/array.rb +100 -0
  8. data/lib/ext/hash.rb +45 -0
  9. data/lib/ext/http_response.rb +19 -0
  10. data/lib/web_crawler/application.rb +49 -0
  11. data/lib/web_crawler/batch_request.rb +63 -0
  12. data/lib/web_crawler/cache_adapter/base.rb +33 -0
  13. data/lib/web_crawler/cache_adapter/file.rb +52 -0
  14. data/lib/web_crawler/cache_adapter/memory.rb +23 -0
  15. data/lib/web_crawler/cache_adapter.rb +11 -0
  16. data/lib/web_crawler/cached_request.rb +30 -0
  17. data/lib/web_crawler/cli/thor_hooks.rb +94 -0
  18. data/lib/web_crawler/cli/thor_inherited_options.rb +26 -0
  19. data/lib/web_crawler/cli.rb +122 -0
  20. data/lib/web_crawler/configuration.rb +87 -0
  21. data/lib/web_crawler/factory_url.rb +58 -0
  22. data/lib/web_crawler/follower.rb +26 -0
  23. data/lib/web_crawler/handler.rb +45 -0
  24. data/lib/web_crawler/parsers/url.rb +52 -0
  25. data/lib/web_crawler/parsers.rb +5 -0
  26. data/lib/web_crawler/request.rb +59 -0
  27. data/lib/web_crawler/response.rb +45 -0
  28. data/lib/web_crawler/utility.rb +65 -0
  29. data/lib/web_crawler/version.rb +9 -0
  30. data/lib/web_crawler/view/csv.rb +20 -0
  31. data/lib/web_crawler/view/json.rb +9 -0
  32. data/lib/web_crawler/view/plain.rb +9 -0
  33. data/lib/web_crawler/view/runner.rb +20 -0
  34. data/lib/web_crawler/view/table.rb +69 -0
  35. data/lib/web_crawler/view/xml.rb +38 -0
  36. data/lib/web_crawler/view.rb +44 -0
  37. data/lib/web_crawler.rb +38 -0
  38. data/spec/fake_web_generator.rb +44 -0
  39. data/spec/spec_helper.rb +17 -0
  40. data/spec/web_crawler/batch_request_spec.rb +45 -0
  41. data/spec/web_crawler/cached_request_spec.rb +31 -0
  42. data/spec/web_crawler/factory_url_spec.rb +34 -0
  43. data/spec/web_crawler/follow_spec.rb +32 -0
  44. data/spec/web_crawler/request_spec.rb +29 -0
  45. data/spec/web_crawler/response_spec.rb +27 -0
  46. data/spec/web_crawler/url_parser_spec.rb +41 -0
  47. data/spec/web_crawler/view_spec.rb +95 -0
  48. data/web_crawler.gemspec +30 -0
  49. metadata +151 -0
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ .idea
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,11 @@
1
# Gem source. The symbolic `:gemcutter` shortcut is deprecated and is
# rejected by current Bundler releases, so the URL is spelled out.
source "https://rubygems.org"

# Specify your gem's dependencies in web_crawler.gemspec
gemspec

# Tooling needed only for developing and testing the gem.
group :development, :test do
  gem "rspec", ">=2.6"
  gem "autotest"
  gem "autotest-growl"
  gem "fakeweb"
end
data/README ADDED
@@ -0,0 +1 @@
1
+ Web crawler helps you parse and collect data from the web
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
data/bin/wcrawler ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $:.unshift File.expand_path("../../lib", __FILE__)
4
+
5
+ # Load the crawler library and its command-line application
6
+ require 'web_crawler'
7
+ require 'web_crawler/application'
8
+
9
+ begin
10
+ WebCrawler::Application.start
11
+ end
12
+
13
+
data/lib/ext/array.rb ADDED
@@ -0,0 +1,100 @@
1
+ require 'enumerator'
2
+
3
class Array
  # Splits or iterates over the array in groups of size +number+,
  # padding any remaining slots with +fill_with+ unless it is +false+.
  #
  #   %w(1 2 3 4 5 6 7).in_groups_of(3) {|group| p group}
  #   ["1", "2", "3"]
  #   ["4", "5", "6"]
  #   ["7", nil, nil]
  #
  #   %w(1 2 3).in_groups_of(2, ' ') {|group| p group}
  #   ["1", "2"]
  #   ["3", " "]
  #
  #   %w(1 2 3).in_groups_of(2, false) {|group| p group}
  #   ["1", "2"]
  #   ["3"]
  #
  # Without a block the groups are returned as an array of arrays.
  def in_groups_of(number, fill_with = nil)
    if fill_with == false
      collection = self
    else
      # size % number gives how many extra we have;
      # subtracting from number gives how many to add;
      # modulo number ensures we don't add a group of just fill.
      padding = (number - size % number) % number
      collection = dup.concat([fill_with] * padding)
    end

    if block_given?
      collection.each_slice(number) { |slice| yield(slice) }
    else
      collection.each_slice(number).to_a
    end
  end

  # Splits or iterates over the array in +number+ of groups, padding any
  # remaining slots with +fill_with+ unless it is +false+.
  #
  #   %w(1 2 3 4 5 6 7 8 9 10).in_groups(3) {|group| p group}
  #   ["1", "2", "3", "4"]
  #   ["5", "6", "7", nil]
  #   ["8", "9", "10", nil]
  #
  #   %w(1 2 3 4 5 6 7).in_groups(3, '&nbsp;') {|group| p group}
  #   ["1", "2", "3"]
  #   ["4", "5", "&nbsp;"]
  #   ["6", "7", "&nbsp;"]
  #
  #   %w(1 2 3 4 5 6 7).in_groups(3, false) {|group| p group}
  #   ["1", "2", "3"]
  #   ["4", "5"]
  #   ["6", "7"]
  def in_groups(number, fill_with = nil)
    # size / number gives the minor group size;
    # size % number gives how many objects need extra accommodation;
    # each group holds either division or division + 1 items.
    division = size / number
    modulo   = size % number

    # Build the groups from slices so the receiver is never duplicated.
    groups = []
    start  = 0

    number.times do |index|
      length  = division + (modulo > 0 && modulo > index ? 1 : 0)
      # Only the shorter groups get one filler element, and only when
      # padding was requested and the split is uneven.
      padding = (fill_with != false && modulo > 0 && length == division) ? 1 : 0
      groups << slice(start, length).concat([fill_with] * padding)
      start += length
    end

    if block_given?
      groups.each { |g| yield(g) }
    else
      groups
    end
  end

  # Divides the array into one or more subarrays based on a delimiting +value+
  # or the result of an optional block.
  #
  #   [1, 2, 3, 4, 5].split(3)                # => [[1, 2], [4, 5]]
  #   (1..10).to_a.split { |i| i % 3 == 0 }   # => [[1, 2], [4, 5], [7, 8], [10]]
  #
  # When a block is given, the default +nil+ value is no longer compared
  # against the elements, so +nil+ elements are kept instead of being treated
  # as delimiters. An explicit non-nil +value+ is still honoured alongside
  # the block.
  def split(value = nil)
    results = [[]]

    each do |element|
      delimiter =
        if block_given?
          yield(element) || (!value.nil? && value == element)
        else
          value == element
        end

      if delimiter
        results << []
      else
        results.last << element
      end
    end

    results
  end
end
data/lib/ext/hash.rb ADDED
@@ -0,0 +1,45 @@
1
class Hash
  # Return a new hash with all keys converted to strings.
  def stringify_keys
    dup.stringify_keys!
  end

  # Destructively convert all keys to strings. Returns the receiver.
  def stringify_keys!
    # Iterate over a snapshot of the keys: the hash is modified in the loop.
    keys.each do |key|
      self[key.to_s] = delete(key)
    end
    self
  end

  # Return a new hash with all keys converted to symbols, as long as
  # they respond to +to_sym+.
  def symbolize_keys
    dup.symbolize_keys!
  end

  # Destructively convert all keys to symbols, as long as they respond
  # to +to_sym+. Keys that do not respond to +to_sym+ (or whose +to_sym+
  # yields nil) are left untouched. Returns the receiver.
  def symbolize_keys!
    keys.each do |key|
      # Explicit capability check instead of a blanket inline `rescue`,
      # which would silently hide unrelated errors.
      symbol = key.respond_to?(:to_sym) ? key.to_sym : key
      self[symbol || key] = delete(key)
    end
    self
  end

  alias_method :to_options,  :symbolize_keys
  alias_method :to_options!, :symbolize_keys!

  # Validate all keys in a hash match *valid keys, raising ArgumentError on a mismatch.
  # Note that keys are NOT treated indifferently, meaning if you use strings for keys but assert symbols
  # as keys, this will fail.
  #
  # ==== Examples
  #   { :name => "Rob", :years => "28" }.assert_valid_keys(:name, :age) # => raises "ArgumentError: Unknown key(s): years"
  #   { :name => "Rob", :age => "28" }.assert_valid_keys("name", "age") # => raises "ArgumentError: Unknown key(s): name, age"
  #   { :name => "Rob", :age => "28" }.assert_valid_keys(:name, :age) # => passes, raises nothing
  def assert_valid_keys(*valid_keys)
    unknown_keys = keys - valid_keys.flatten
    raise ArgumentError, "Unknown key(s): #{unknown_keys.join(", ")}" unless unknown_keys.empty?
  end
end
@@ -0,0 +1,19 @@
1
# Net::HTTPResponse is reopened below, so make sure it is loaded.
require 'net/http'

# Mixin that stores a normalized list of redirect path entries on an object.
module RedirectPath

  # The normalized list assigned via #redirect_path=, or nil if never set.
  attr_reader :redirect_path

  # Stores +path+ as a flat array of non-empty strings.
  #
  # path - a single entry or a (possibly nested) array of entries; nil and
  #        empty entries are discarded, everything else is converted
  #        with +to_s+.
  def redirect_path=(path)
    # Array() lets callers pass a single value as well as an array.
    @redirect_path = Array(path).flatten.compact.map(&:to_s).reject(&:empty?)
  end

  # True only when at least one redirect entry has been recorded.
  # An empty list does not count as a redirect.
  def redirect?
    !redirect_path.nil? && !redirect_path.empty?
  end

end

class Net::HTTPResponse
  include RedirectPath
end
@@ -0,0 +1,49 @@
1
module WebCrawler
  # Command-line tasks of the crawler.
  #
  # NOTE(review): +desc+, +method_option+, +map+, +inherited_method_options+,
  # +options+ and +symbolized_options+ are not defined in this class; they
  # presumably come from CLI / Thor -- confirm against web_crawler/cli.rb.
  class Application < CLI

    desc "test", "Test task"
    def test
    end

    desc "get <URL...>", "Get pages from passed urls"
    method_option :parser, type: :array, desc: "first item is a parser class, second item is a path to parser file"
    method_option 'same-host', type: :boolean, desc: "find urls with same host only"

    # Builds a BatchRequest for all given urls and returns its #process result.
    def get(url, *urls)
      urls.unshift url

      batch = BatchRequest.new(*urls, symbolized_options)
      batch.process
    end

    map 'show-urls' => :show_urls
    desc "show-urls <URL...>", "Get pages from passed urls"
    method_option 'same-host', type: :boolean, desc: "find urls with same host only"
    method_option 'cols', type: :numeric, desc: "output columns size"

    # Processes the given urls, collects links through Follower and returns
    # the first collected list split into rows of +cols+ entries (default 1),
    # padded with empty strings.
    def show_urls(url, *urls)
      urls.unshift url
      batch = BatchRequest.new(*urls, symbolized_options)
      options[:cols] ||= 1
      Follower.new(batch.process, same_host: options['same-host']).collect.first.in_groups_of(options[:cols], "")
    end

    desc "factory URL_PATTERN [params,...]", "Generate urls and run get action"
    inherited_method_options :get
    method_option :output, type: :boolean, desc: "show output and exit"
    method_option :list, type: :boolean, desc: "show output like a list and exit"

    # Generates urls from +pattern+ and +params+ through FactoryUrl, then
    # either prints them (--output / --list) or passes them on to #get.
    def factory(pattern, *params)
      # SECURITY NOTE(review): every parameter is evaluated as Ruby code so
      # that ranges/arrays can be typed on the command line. This executes
      # arbitrary code from the command line; consider a restricted parser.
      params.map! { |param| eval(param) }
      urls = FactoryUrl.new(pattern, params)
      sep = options[:list] ? "\n" : ' '
      if options[:output] || options[:list]
        puts urls.factory.map { |u| u.inspect }.join(sep).gsub('"', "'")
      else
        # Parenthesized to avoid Ruby's "ambiguous splat" interpretation.
        get(*urls.factory)
      end
    end

  end
end
@@ -0,0 +1,63 @@
1
module WebCrawler

  # A group of requests built from a list of urls and processed together.
  #
  # The last constructor argument may be an options hash. The options read
  # here are +:parser+ (the batch is then processed by a HandlerParser) and
  # +:cached+ (requests are built as CachedRequest instead of Request).
  class BatchRequest

    include Enumerable

    attr_reader :urls, :responses
    attr_accessor :requests

    # urls - any number of urls or (nested) arrays of urls, optionally
    #        followed by an options hash.
    def initialize(*urls)
      @options = urls.last.is_a?(Hash) ? urls.pop : { }
      set_handler

      @urls, @requests = urls.flatten, []
      init_requests!
    end

    # Runs the batch.
    #
    # With a handler configured, returns the handler's result (yielded to
    # the block first, if one is given). Otherwise processes every request
    # once, memoizes the results in #responses and returns them; a given
    # block maps each individual response.
    def process
      if @handler
        block_given? ? yield(@handler.process) : @handler.process
      else
        @responses ||= requests.map do |req|
          block_given? ? yield(req.process) : req.process
        end
      end
    end

    # Processes every request anew, yielding each response as it arrives.
    # Resets #responses. Returns an Enumerator when no block is given
    # instead of failing on a nil block.
    def each(&block)
      return enum_for(:each) unless block_given?

      @responses = []
      requests.each do |req|
        @responses << req.process
        block.call(@responses.last)
      end
    end

    # Appends the (flattened) +value+ to the stored responses. Works even
    # when no responses have been collected yet.
    def responses=(value)
      @responses = (@responses || []) + value.flatten
    end

    # First stored response. Requires responses to be present already.
    def response
      responses.first
    end

    # Builds a single request object for +url+.
    def build_request(url)
      request_class.new(url)
    end

    protected

    # Installs a parser handler when the :parser option is present.
    def set_handler
      @handler = WebCrawler::HandlerParser.new(@options[:parser], self) if @options[:parser]
    end

    def init_requests!
      @requests = @urls.map { |url| build_request(url) }
    end

    def request_class
      @options[:cached] ? CachedRequest : Request
    end
  end

end
@@ -0,0 +1,33 @@
1
# Common behaviour for cache adapters. Concrete adapters are expected to
# provide #get and #exist?; #put here only prepares a copy of the response.
class WebCrawler::CacheAdapter::Base

  # Tells whether +response+ should be considered expired.
  #
  # The optional block is evaluated first; its result is returned when the
  # built-in check (response.foul? and dated before the cut-off time) does
  # not hold.
  # NOTE(review): the meaning of `foul?` is defined on the response class,
  # which is not visible here.
  def expired?(response, &block)
    extra_condition = block ? block.call : false
    stale = response.foul? && response.date < expire_within
    stale || extra_condition
  end

  # Cut-off time: now minus +seconds+, or minus the configured cache
  # lifetime when +seconds+ is omitted.
  def expire_within(seconds = nil)
    now = Time.now
    now - (seconds || WebCrawler.config.cache.expire_within)
  end

  # Marks +response+ as cached and hands it back.
  def prepare_response(response)
    response.tap { |prepared| prepared.set_cached_flag }
  end

  # Returns a duplicate of +response+ with the cached flag set.
  def put(response)
    copy = response.dup
    prepare_response(copy)
  end

  # Same as #put (dispatches to the adapter's own #put).
  def set(response)
    put(response)
  end

  # Must be provided by concrete adapters.
  def get(uri)
    raise NotImplementedError
  end

  # Must be provided by concrete adapters.
  def exist?(uri)
    raise NotImplementedError
  end

end
@@ -0,0 +1,52 @@
1
+ require "pathname"
2
+
3
module WebCrawler::CacheAdapter

  # Cache adapter that keeps each response as a Marshal dump in its own
  # file inside +dir+. File names are derived from the response url.
  class File < Base

    attr_reader :dir

    # dir - path of the cache directory (String or Pathname).
    def initialize(dir)
      @dir = Pathname.new dir
    end

    # Writes the prepared copy (see Base#put) to disk and returns the
    # original +response+.
    def put response
      response.tap { write(super) }
    end

    # Reads the cached response for +uri+. An expired entry is removed
    # from disk.
    # NOTE(review): the stale response is still returned after its file
    # has been deleted -- confirm this "serve once more" behaviour is wanted.
    def get uri
      response = read(uri)
      expire!(response) if expired?(response)
      response
    end

    def exist? uri
      file(uri).exist?
    end

    # Pathname of the cache file for a response (anything responding to
    # +url+) or for a plain url.
    def file(response_or_url)
      # Capability check instead of an inline `rescue`, which would also
      # swallow genuine errors raised inside #url.
      url = response_or_url.respond_to?(:url) ? response_or_url.url : response_or_url
      dir.join(uri_to_filename(url))
    end

    # Deletes the cache file belonging to +response+.
    def expire!(response)
      file(response).delete
    end

    protected

    # SECURITY NOTE(review): Marshal.load must only ever see files written
    # by this adapter; never point +dir+ at untrusted data.
    def read(uri)
      # Binary mode: Marshal data is not text.
      Marshal.load(file(uri).open('rb') { |f| f.read })
    end

    def write(response)
      # Create the cache directory on first use instead of failing.
      dir.mkpath
      # Binary mode keeps the dump byte-exact on every platform.
      file(response).open('wb') { |f| f << Marshal.dump(response) }
    end

    # Replaces every run of non-word characters with a single underscore.
    def uri_to_filename(uri)
      uri.to_s.gsub(/\W/, '_').gsub(/_+/, '_')
    end

  end

end
@@ -0,0 +1,23 @@
1
module WebCrawler::CacheAdapter

  # Cache adapter that keeps responses in a class-level Hash keyed by the
  # string form of the url.
  class Memory < Base
    # Class-level accessor pair for the shared store.
    singleton_class.send(:attr_accessor, :cache)

    self.cache = {}

    # Stores the prepared copy (see Base#put) under the response url and
    # returns the original +response+.
    def put(response)
      store = self.class.cache
      key   = response.url.to_s
      store[key] = super
      response
    end

    # Cached response for +uri+, or nil when nothing is stored.
    def get(uri)
      store = self.class.cache
      store[uri.to_s]
    end

    # Whether an entry for +uri+ is present.
    def exist?(uri)
      self.class.cache.include?(uri.to_s)
    end
  end

end
@@ -0,0 +1,11 @@
1
module WebCrawler

  # Namespace of the cache adapters. Each adapter constant is registered
  # for autoloading, so its file is required on first reference.
  module CacheAdapter

    {
      Base:   'web_crawler/cache_adapter/base',
      Memory: 'web_crawler/cache_adapter/memory',
      File:   'web_crawler/cache_adapter/file'
    }.each { |const_name, path| autoload const_name, path }

  end

end
@@ -0,0 +1,30 @@
1
module WebCrawler

  # Request whose response is looked up in a cache adapter before any
  # fetch is attempted.
  class CachedRequest < Request
    extend ::Forwardable

    # url     - passed on to Request#initialize.
    # options - :cache may carry the adapter to use; without it the
    #           globally configured adapter is taken.
    def initialize(url, options = { })
      super(url)
      @cache = options[:cache] || WebCrawler.config.cache_adapter
    end

    # Returns the cached response for the url, or fetches, caches and
    # returns a new one.
    def process
      cached { Response.new(*fetch(url)) }
    end

    protected

    # Cache lookup with fallback: the block is only evaluated when the
    # adapter has no entry for the url; its result is put into the cache.
    def cached
      @response = @cache.exist?(url) ? @cache.get(url) : @cache.put(yield)
    end

  end

end
@@ -0,0 +1,94 @@
1
class Thor
  # Before/after callbacks around task invocation. Including this module
  # adds the instance-level wrapper and the class-level registration DSL.
  module Hooks

    def self.included(base)
      base.send(:include, InstanceMethods)
      base.extend(ClassMethods)
    end

    module InstanceMethods
      attr_reader :response

      def before_hooks
        self.class.before_hooks
      end

      def after_hooks
        self.class.after_hooks
      end

      # Invokes the given task with the given args, running the registered
      # before hooks first and the after hooks afterwards. The task result
      # is kept in @task_result and returned.
      def invoke_task(task, *args) #:nodoc:
        hook_owner = self.class
        hook_owner.run_hooks(:before, self, task)
        result = @task_result = super(task, *args)
        hook_owner.run_hooks(:after, self, task)
        result
      end
    end

    module ClassMethods
      # Registry of all hooks, grouped by place (:before / :after).
      def hooks
        @@hooks ||= { before: [], after: [] }
      end

      def before_hooks
        hooks[:before]
      end

      def after_hooks
        hooks[:after]
      end

      # Registers +block+ to run before tasks. A trailing hash may carry
      # :only or :except (task names) to restrict the hook.
      def before_action(*args, &block)
        register_hook(:before, args, &block)
      end

      # Registers +block+ to run after tasks; same options as #before_action.
      def after_action(*args, &block)
        register_hook(:after, args, &block)
      end

      # Registers an after hook that calls +block+ with the task result and
      # the instance's @options.
      def render(*args, &block)
        renderer = block
        after_action(*args) { renderer.call(@task_result, @options) }
      end

      # Runs every hook registered for +place+ that applies to +task+.
      def run_hooks(place, instance, task)
        hooks[place].each { |hook| self.run_hook(instance, task, hook) }
      end


      protected

      # Shared implementation of before_action / after_action.
      def register_hook(place, args, &block)
        options = args.last.is_a?(Hash) ? args.pop : { }
        check_hooks_options! options
        add_hook place, args, options, &block
      end

      # :only and :except are mutually exclusive.
      def check_hooks_options!(options)
        return unless options.key?(:only) && options.key?(:except)

        raise ArgumentError, <<-M.gsub(/^\s+/, '')
          both ":only" and ":except" given. You should use alone option ":only" or ":except"
        M
      end

      # Normalizes both filters to arrays and stores the hook.
      def add_hook(place, args, options, &block)
        [:only, :except].each do |filter|
          options[filter] = [*(options[filter] || [])]
        end
        hooks[place] << { block: block, options: options, args: args }
      end

      # Evaluates the hook block in the context of +instance+.
      def run_hook(instance, task, hook)
        instance.instance_eval(&hook[:block]) if runnable?(task, hook)
      end

      # A hook applies when the task passes the :only filter (empty means
      # "all tasks") and is not listed in :except.
      def runnable?(task, hook)
        task_name = task.name.to_sym
        only, except = hook[:options].values_at(:only, :except)
        (only.empty? || only.include?(task_name)) && !except.include?(task_name)
      end
    end

  end
end
@@ -0,0 +1,26 @@
1
class Thor
  # Lets one task re-declare the method options of another task.
  module InheritedOptions

    def self.included(base)
      base.extend(ClassMethods)
    end

    module ClassMethods
      # Converts an option object into a Hash of its instance variables,
      # keyed by variable name without the leading "@".
      def option_to_hash(option)
        option.instance_variables.each_with_object({}) do |ivar, attributes|
          attributes[ivar.to_s.sub('@', '')] = option.instance_variable_get(ivar)
        end
      end

      # Re-declares every option of task +from_action+ through
      # +method_option+; when +for_action+ is given it is passed along as
      # the :for setting. The option's :description is copied to :desc.
      def inherited_method_options(from_action, for_action = nil)
        tasks[from_action.to_s].options.each do |name, option|
          settings = option_to_hash(option).symbolize_keys
          settings[:for] = for_action.to_s if for_action
          settings[:desc] = settings[:description]
          method_option name, settings
        end
      end
    end

  end
end