web_crawler 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. data/.gitignore +5 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +11 -0
  4. data/README +1 -0
  5. data/Rakefile +2 -0
  6. data/bin/wcrawler +13 -0
  7. data/lib/ext/array.rb +100 -0
  8. data/lib/ext/hash.rb +45 -0
  9. data/lib/ext/http_response.rb +19 -0
  10. data/lib/web_crawler/application.rb +49 -0
  11. data/lib/web_crawler/batch_request.rb +63 -0
  12. data/lib/web_crawler/cache_adapter/base.rb +33 -0
  13. data/lib/web_crawler/cache_adapter/file.rb +52 -0
  14. data/lib/web_crawler/cache_adapter/memory.rb +23 -0
  15. data/lib/web_crawler/cache_adapter.rb +11 -0
  16. data/lib/web_crawler/cached_request.rb +30 -0
  17. data/lib/web_crawler/cli/thor_hooks.rb +94 -0
  18. data/lib/web_crawler/cli/thor_inherited_options.rb +26 -0
  19. data/lib/web_crawler/cli.rb +122 -0
  20. data/lib/web_crawler/configuration.rb +87 -0
  21. data/lib/web_crawler/factory_url.rb +58 -0
  22. data/lib/web_crawler/follower.rb +26 -0
  23. data/lib/web_crawler/handler.rb +45 -0
  24. data/lib/web_crawler/parsers/url.rb +52 -0
  25. data/lib/web_crawler/parsers.rb +5 -0
  26. data/lib/web_crawler/request.rb +59 -0
  27. data/lib/web_crawler/response.rb +45 -0
  28. data/lib/web_crawler/utility.rb +65 -0
  29. data/lib/web_crawler/version.rb +9 -0
  30. data/lib/web_crawler/view/csv.rb +20 -0
  31. data/lib/web_crawler/view/json.rb +9 -0
  32. data/lib/web_crawler/view/plain.rb +9 -0
  33. data/lib/web_crawler/view/runner.rb +20 -0
  34. data/lib/web_crawler/view/table.rb +69 -0
  35. data/lib/web_crawler/view/xml.rb +38 -0
  36. data/lib/web_crawler/view.rb +44 -0
  37. data/lib/web_crawler.rb +38 -0
  38. data/spec/fake_web_generator.rb +44 -0
  39. data/spec/spec_helper.rb +17 -0
  40. data/spec/web_crawler/batch_request_spec.rb +45 -0
  41. data/spec/web_crawler/cached_request_spec.rb +31 -0
  42. data/spec/web_crawler/factory_url_spec.rb +34 -0
  43. data/spec/web_crawler/follow_spec.rb +32 -0
  44. data/spec/web_crawler/request_spec.rb +29 -0
  45. data/spec/web_crawler/response_spec.rb +27 -0
  46. data/spec/web_crawler/url_parser_spec.rb +41 -0
  47. data/spec/web_crawler/view_spec.rb +95 -0
  48. data/web_crawler.gemspec +30 -0
  49. metadata +151 -0
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ .idea
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,11 @@
1
# Fetch gems from the canonical RubyGems host over HTTPS. The symbolic
# `:gemcutter` source used previously is a legacy alias that current Bundler
# releases no longer accept.
source 'https://rubygems.org'

# Specify your gem's dependencies in web_crawler.gemspec
gemspec

# Tooling needed only to develop and test the gem.
group :development, :test do
  gem "rspec", ">=2.6"
  gem "autotest"
  gem "autotest-growl"
  gem "fakeweb"
end
data/README ADDED
@@ -0,0 +1 @@
1
+ Web crawler helps you parse and collect data from the web
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
data/bin/wcrawler ADDED
@@ -0,0 +1,13 @@
1
#!/usr/bin/env ruby

# Put the gem's own lib/ directory at the front of the load path so the
# executable runs against the sources it ships with.
lib_dir = File.expand_path("../../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir)

# Load the library and its command-line application.
require 'web_crawler'
require 'web_crawler/application'

# Hand control to the command-line application.
WebCrawler::Application.start
12
+
13
+
data/lib/ext/array.rb ADDED
@@ -0,0 +1,100 @@
1
+ require 'enumerator'
2
+
3
# Grouping and splitting helpers added to Array.
class Array
  # Splits or iterates over the array in groups of size +number+,
  # padding any remaining slots with +fill_with+ unless it is +false+.
  #
  #   %w(1 2 3 4 5 6 7).in_groups_of(3) {|group| p group}
  #   ["1", "2", "3"]
  #   ["4", "5", "6"]
  #   ["7", nil, nil]
  #
  #   %w(1 2 3).in_groups_of(2, ' ') {|group| p group}
  #   ["1", "2"]
  #   ["3", " "]
  #
  #   %w(1 2 3).in_groups_of(2, false) {|group| p group}
  #   ["1", "2"]
  #   ["3"]
  #
  # Returns the array of groups when no block is given.
  # Raises ArgumentError when +number+ is not a positive integer, instead of
  # leaking a ZeroDivisionError from the padding arithmetic.
  def in_groups_of(number, fill_with = nil)
    unless number.to_i > 0
      raise ArgumentError, "Group size must be a positive integer, was #{number.inspect}"
    end

    collection =
      if fill_with == false
        self
      else
        # size % number gives how many extra we have;
        # subtracting from number gives how many to add;
        # modulo number ensures we don't add group of just fill.
        padding = (number - size % number) % number
        dup.concat([fill_with] * padding)
      end

    return collection.each_slice(number).to_a unless block_given?

    collection.each_slice(number) { |slice| yield(slice) }
  end

  # Splits or iterates over the array in +number+ of groups, padding any
  # remaining slots with +fill_with+ unless it is +false+.
  #
  #   %w(1 2 3 4 5 6 7 8 9 10).in_groups(3) {|group| p group}
  #   ["1", "2", "3", "4"]
  #   ["5", "6", "7", nil]
  #   ["8", "9", "10", nil]
  #
  #   %w(1 2 3 4 5 6 7).in_groups(3, '&nbsp;') {|group| p group}
  #   ["1", "2", "3"]
  #   ["4", "5", "&nbsp;"]
  #   ["6", "7", "&nbsp;"]
  #
  #   %w(1 2 3 4 5 6 7).in_groups(3, false) {|group| p group}
  #   ["1", "2", "3"]
  #   ["4", "5"]
  #   ["6", "7"]
  #
  # Returns the array of groups when no block is given.
  def in_groups(number, fill_with = nil)
    # size / number gives minor group size;
    # size % number gives how many objects need extra accommodation;
    # each group hold either division or division + 1 items.
    division = size / number
    modulo   = size % number

    # create a new array avoiding dup
    groups = []
    start  = 0

    number.times do |index|
      length  = division + (modulo > 0 && modulo > index ? 1 : 0)
      # Short groups get exactly one filler so every group ends up the same size.
      padding = fill_with != false && modulo > 0 && length == division ? 1 : 0
      groups << slice(start, length).concat([fill_with] * padding)
      start += length
    end

    if block_given?
      groups.each { |group| yield(group) }
    else
      groups
    end
  end

  # Divides the array into one or more subarrays based on a delimiting +value+
  # or the result of an optional block.
  #
  #   [1, 2, 3, 4, 5].split(3)                # => [[1, 2], [4, 5]]
  #   (1..10).to_a.split { |i| i % 3 == 0 }   # => [[1, 2], [4, 5], [7, 8], [10]]
  #
  # When a block is given it alone decides where to split. Comparing against
  # +value+ as well would make every +nil+ element a delimiter, because
  # +value+ defaults to +nil+.
  def split(value = nil)
    inject([[]]) do |results, element|
      delimiter = block_given? ? yield(element) : value == element

      if delimiter
        results << []
      else
        results.last << element
      end

      results
    end
  end
end
data/lib/ext/hash.rb ADDED
@@ -0,0 +1,45 @@
1
# Key-conversion and key-validation helpers added to Hash.
class Hash
  # Builds a copy of the hash whose keys have all been converted to strings.
  def stringify_keys
    copy = dup
    copy.stringify_keys!
  end

  # Converts every key to a string in place and returns the receiver.
  def stringify_keys!
    keys.each { |key| store(key.to_s, delete(key)) }
    self
  end

  # Builds a copy of the hash whose keys have been converted to symbols
  # wherever +to_sym+ succeeds.
  def symbolize_keys
    copy = dup
    copy.symbolize_keys!
  end

  # Converts keys to symbols in place wherever +to_sym+ succeeds; keys that
  # cannot be converted are kept unchanged. Returns the receiver.
  def symbolize_keys!
    keys.each do |key|
      symbol = begin
        key.to_sym
      rescue StandardError
        key
      end
      store(symbol || key, delete(key))
    end
    self
  end

  alias to_options symbolize_keys
  alias to_options! symbolize_keys!

  # Checks that every key of the hash is listed in +valid_keys+ and raises
  # ArgumentError naming the offenders otherwise. Keys are compared as-is:
  # a string key never matches a symbol in +valid_keys+, or vice versa.
  #
  # ==== Examples
  #   { :name => "Rob", :years => "28" }.assert_valid_keys(:name, :age) # => raises "ArgumentError: Unknown key(s): years"
  #   { :name => "Rob", :age => "28" }.assert_valid_keys("name", "age") # => raises "ArgumentError: Unknown key(s): name, age"
  #   { :name => "Rob", :age => "28" }.assert_valid_keys(:name, :age)   # => passes, raises nothing
  def assert_valid_keys(*valid_keys)
    unknown_keys = keys - valid_keys.flatten
    return if unknown_keys.empty?

    raise ArgumentError, "Unknown key(s): #{unknown_keys.join(", ")}"
  end
end
@@ -0,0 +1,19 @@
1
# Mixin that lets an object remember a list of redirect steps as strings.
module RedirectPath
  # The stored redirect steps, or +nil+ when none were ever assigned.
  attr_reader :redirect_path

  # Stores +path+ flattened, with +nil+ entries dropped, every entry converted
  # to a string and empty strings removed.
  def redirect_path=(path)
    steps = path.flatten.compact
    @redirect_path = steps.map { |step| step.to_s }.reject { |step| step.empty? }
  end

  # True once a redirect path has been assigned (even an empty one).
  def redirect?
    redirect_path ? true : false
  end
end

# Every HTTP response object can carry a redirect path.
Net::HTTPResponse.send(:include, RedirectPath)
@@ -0,0 +1,49 @@
1
module WebCrawler
  # Command-line tasks of the crawler, declared with the Thor-style DSL
  # (+desc+, +method_option+, +map+) provided by the CLI superclass.
  class Application < CLI

    desc "test", "Test task"
    # Placeholder task; intentionally does nothing.
    def test
    end

    desc "get <URL...>", "Get pages from passed urls"
    method_option :parser, type: :array, desc: "first item is a parser class, second item is a path to parser file"
    method_option 'same-host', type: :boolean, desc: "find urls with same host only"

    # Builds a BatchRequest for the given urls and the symbolized command-line
    # options, and returns the result of processing it.
    def get(url, *urls)
      urls.unshift url

      batch = BatchRequest.new(*urls, symbolized_options)
      batch.process
    end

    map 'show-urls' => :show_urls
    desc "show-urls <URL...>", "Get pages from passed urls"
    method_option 'same-host', type: :boolean, desc: "find urls with same host only"
    method_option 'cols', type: :numeric, desc: "output columns size"

    # Processes the given urls, collects urls with Follower and returns the
    # first collected list arranged in rows of +cols+ entries (default 1),
    # padded with empty strings.
    def show_urls(url, *urls)
      urls.unshift url
      batch = BatchRequest.new(*urls, symbolized_options)
      # Read the default into a local instead of writing it back into
      # +options+. NOTE(review): Thor is believed to hand tasks a frozen
      # options hash, in which case the former `options[:cols] ||= 1` raised
      # whenever --cols was omitted; confirm against the Thor version in use.
      cols = options[:cols] || 1
      Follower.new(batch.process, same_host: options['same-host']).collect.first.in_groups_of(cols, "")
    end

    desc "factory URL_PATTERN [params,...]", "Generate urls and run get action"
    inherited_method_options :get
    method_option :output, type: :boolean, desc: "show output and exit"
    method_option :list, type: :boolean, desc: "show output like a list and exit"

    # Expands +pattern+ with +params+ through FactoryUrl. With --output or
    # --list the generated urls are printed (space- or newline-separated);
    # otherwise they are passed to #get.
    def factory(pattern, *params)
      # SECURITY: every parameter is evaluated as Ruby source so that ranges
      # and arrays can be written on the command line. This executes arbitrary
      # code and must never be fed input from an untrusted source.
      params.map! { |param| eval(param) }
      urls = FactoryUrl.new(pattern, params)
      # The leftover debug dump of +options+ was removed here: it polluted
      # the url listing printed below.
      sep = options[:list] ? "\n" : ' '
      if options[:output] || options[:list]
        puts urls.factory.map { |u| u.inspect }.join(sep).gsub('"', "'")
      else
        get(*urls.factory)
      end
    end

  end
end
@@ -0,0 +1,63 @@
1
module WebCrawler

  # A set of requests built from a list of urls and processed together.
  #
  # A trailing Hash argument to the constructor is taken as options:
  # +:parser+ routes processing through a HandlerParser, +:cached+ selects
  # CachedRequest instead of Request for the individual requests.
  class BatchRequest

    include Enumerable

    attr_reader :urls, :responses
    attr_accessor :requests

    # urls - any number of urls or (nested) arrays of urls, optionally
    #        followed by an options Hash.
    def initialize(*urls)
      @options = urls.last.is_a?(Hash) ? urls.pop : { }
      set_handler

      @urls = urls.flatten
      init_requests!
    end

    # Runs the batch. With a handler configured its result is returned (or
    # yielded to the block); otherwise every request is processed once and
    # the memoized list of results is returned.
    def process
      if @handler
        block_given? ? yield(@handler.process) : @handler.process
      else
        @responses ||= requests.map do |req|
          block_given? ? yield(req.process) : req.process
        end
      end
    end

    # Processes the requests one by one, yielding each response and recording
    # it in #responses. Returns an Enumerator when called without a block
    # instead of failing on a missing block.
    def each(&block)
      return enum_for(:each) unless block

      @responses = []
      requests.each do |req|
        @responses << req.process
        block.call(@responses.last)
      end
    end

    # Appends the given (possibly nested) responses to the recorded ones.
    # Works on a fresh batch too, where no responses were recorded yet.
    def responses=(value)
      @responses = (@responses || []) + value.flatten
    end

    # The first recorded response, or +nil+ when nothing was processed yet.
    def response
      responses.first if responses
    end

    # Builds a single request object for +url+.
    def build_request(url)
      request_class.new(url)
    end

    protected

    # Installs the parser handler when the :parser option is present.
    def set_handler
      @handler = WebCrawler::HandlerParser.new(@options[:parser], self) if @options[:parser]
    end

    # (Re)builds one request per url.
    def init_requests!
      @requests = @urls.map { |url| build_request(url) }
    end

    # Request class to instantiate, depending on the :cached option.
    def request_class
      @options[:cached] ? CachedRequest : Request
    end
  end

end
@@ -0,0 +1,33 @@
1
# Common behaviour of cache adapters. Subclasses supply +get+ and +exist?+
# and normally extend +put+ with the actual storage step.
class WebCrawler::CacheAdapter::Base

  # A response counts as expired when it is foul and dated before the expiry
  # threshold, or when the optional block returns a truthy value.
  # The block, if any, is always called, and called first.
  def expired?(response, &block)
    forced = block ? block.call : false
    stale  = response.foul? && response.date < expire_within
    stale || forced
  end

  # Point in time before which cached entries are outdated: now minus
  # +seconds+, or minus the configured cache lifetime when +seconds+ is nil.
  def expire_within(seconds = nil)
    Time.now - (seconds ? seconds : WebCrawler.config.cache.expire_within)
  end

  # Flags +response+ as cached and returns it.
  def prepare_response(response)
    response.tap { |prepared| prepared.set_cached_flag }
  end

  # Returns a copy of +response+ flagged as cached.
  def put(response)
    copy = response.dup
    prepare_response(copy)
  end

  # Same as #put.
  def set(response)
    put(response)
  end

  # Must be provided by subclasses.
  def get(uri)
    raise NotImplementedError
  end

  # Must be provided by subclasses.
  def exist?(uri)
    raise NotImplementedError
  end

end
@@ -0,0 +1,52 @@
1
+ require "pathname"
2
+
3
module WebCrawler::CacheAdapter

  # Cache adapter that keeps every response as a marshalled file inside one
  # directory. The file name is derived from the response url.
  class File < Base

    attr_reader :dir

    # dir - directory (String or Pathname) that holds the cache files.
    def initialize(dir)
      @dir = Pathname.new dir
    end

    # Writes the cache-flagged copy produced by Base#put to disk and returns
    # the response that was passed in.
    def put response
      response.tap { write(super) }
    end

    # Reads the response cached for +uri+. An expired entry is removed from
    # disk, but the response that was read is still returned.
    # NOTE(review): returning the stale response looks deliberate (it is
    # refetched on the next miss) -- confirm with callers.
    def get uri
      response = read(uri)
      expire!(response) if expired?(response)
      response
    end

    # Whether a cache file exists for +uri+.
    def exist? uri
      file(uri).exist?
    end

    # Pathname of the cache file for a response (anything responding to
    # +url+) or for a plain url.
    def file(response_or_url)
      # Ask the object instead of rescuing: the former inline +rescue+ also
      # swallowed genuine errors raised inside +url+.
      url = response_or_url.respond_to?(:url) ? response_or_url.url : response_or_url
      dir.join(uri_to_filename(url))
    end

    # Deletes the cache file of +response+.
    def expire!(response)
      file(response).delete
    end

    protected

    # Loads the marshalled response stored for +uri+.
    #
    # SECURITY: Marshal.load can instantiate arbitrary objects; the cache
    # directory must only ever contain files written by this adapter.
    def read(uri)
      # Marshal data is binary, so read it without newline or encoding
      # translation.
      Marshal.load(file(uri).binread)
    end

    # Marshals +response+ into its cache file, creating the cache directory
    # on first use.
    def write(response)
      dir.mkpath
      # Binary mode keeps the dump byte-exact on every platform.
      file(response).open('wb') { |f| f << Marshal.dump(response) }
    end

    # Turns a uri into a file name: every run of non-word characters becomes
    # a single underscore.
    def uri_to_filename(uri)
      uri.to_s.gsub(/\W/, '_').gsub(/_+/, '_')
    end

  end

end
@@ -0,0 +1,23 @@
1
module WebCrawler::CacheAdapter

  # Cache adapter that keeps responses in a Hash stored on the class itself,
  # keyed by the url converted to a string.
  class Memory < Base
    @cache = {}

    class << self
      # The Hash holding the cached responses.
      attr_accessor :cache
    end

    # Stores the cache-flagged copy produced by Base#put under the response
    # url and returns the response that was passed in.
    def put response
      self.class.cache[response.url.to_s] = super
      response
    end

    # The response stored for +uri+, or +nil+.
    def get uri
      store = self.class.cache
      store[uri.to_s]
    end

    # Whether a response is stored for +uri+.
    def exist? uri
      store = self.class.cache
      store.key?(uri.to_s)
    end
  end

end
@@ -0,0 +1,11 @@
1
module WebCrawler

  # Namespace of the cache adapters; each adapter is loaded on first use.
  module CacheAdapter

    {
      :Base   => 'web_crawler/cache_adapter/base',
      :Memory => 'web_crawler/cache_adapter/memory',
      :File   => 'web_crawler/cache_adapter/file'
    }.each { |const_name, path| autoload const_name, path }

  end

end
@@ -0,0 +1,30 @@
1
module WebCrawler

  # Request that consults a cache adapter before fetching its url.
  class CachedRequest < Request
    extend ::Forwardable

    # url     - the url to request.
    # options - Hash; :cache supplies the cache adapter, falling back to the
    #           adapter from the global configuration.
    def initialize(url, options = { })
      super(url)
      @cache = options[:cache] || WebCrawler.config.cache_adapter
    end

    # Returns the cached response for the url, fetching and caching it when
    # the cache has no entry.
    def process
      cached { Response.new(*fetch(url)) }
    end

    protected

    # Returns the cache entry for the url when there is one; otherwise stores
    # the block's result in the cache. Either way the outcome is remembered
    # in @response and returned.
    def cached
      @response = @cache.exist?(url) ? @cache.get(url) : @cache.put(yield)
    end

  end

end
@@ -0,0 +1,94 @@
1
class Thor
  # Adds before/after hooks around task invocation. Including the module
  # gives a class the hook-declaring class methods and wraps +invoke_task+.
  module Hooks

    def self.included(base)
      base.class_eval do
        include InstanceMethods
        extend ClassMethods
      end
    end

    module InstanceMethods
      attr_reader :response

      # Hooks registered to run before a task.
      def before_hooks
        self.class.before_hooks
      end

      # Hooks registered to run after a task.
      def after_hooks
        self.class.after_hooks
      end

      # Runs the before hooks, invokes the task, runs the after hooks and
      # returns the task's result. The result is also kept in @task_result so
      # that after hooks can read it.
      def invoke_task(task, *args) #:nodoc:
        owner = self.class
        owner.run_hooks :before, self, task
        result = @task_result = super(task, *args)
        owner.run_hooks :after, self, task
        result
      end
    end

    module ClassMethods
      # Registry of hooks by place. It lives in a class variable of this
      # module, so it is shared by every class that uses the hooks.
      def hooks
        @@hooks ||= { before: [], after: [] }
      end

      def before_hooks
        hooks[:before]
      end

      def after_hooks
        hooks[:after]
      end

      # Registers a block to run before tasks. A trailing Hash may restrict it
      # with :only or :except (task names), but not both.
      def before_action(*args, &block)
        options = args.last.is_a?(Hash) ? args.pop : { }
        check_hooks_options! options
        add_hook :before, args, options, &block
      end

      # Registers a block to run after tasks. A trailing Hash may restrict it
      # with :only or :except (task names), but not both.
      def after_action(*args, &block)
        options = args.last.is_a?(Hash) ? args.pop : { }
        check_hooks_options! options
        add_hook :after, args, options, &block
      end

      # Registers an after hook that calls the block with the task result and
      # the options of the instance the task ran on.
      def render(*args, &block)
        after_action(*args) { block.call @task_result, @options }
      end

      # Runs every hook registered for +place+ that applies to +task+, in the
      # context of +instance+.
      def run_hooks(place, instance, task)
        hooks[place].each { |hook| run_hook(instance, task, hook) }
      end


      protected

      # Rejects option sets that carry both :only and :except.
      def check_hooks_options!(options)
        return unless options.key?(:only) && options.key?(:except)

        raise ArgumentError, <<-M.gsub(/^\s+/, '')
          both ":only" and ":except" given. You should use alone option ":only" or ":except"
        M
      end

      # Normalizes :only/:except to arrays and appends the hook to the registry.
      def add_hook(place, args, options, &block)
        [:only, :except].each do |key|
          options[key] = [*(options[key] || [])]
        end
        hooks[place] << { block: block, options: options, args: args }
      end

      # Evaluates the hook's block on +instance+ when the hook applies to +task+.
      def run_hook(instance, task, hook)
        return unless runnable?(task, hook)

        instance.instance_eval(&hook[:block])
      end

      # A hook applies when the task is listed in :only (or :only is empty)
      # and the task is not listed in :except.
      def runnable?(task, hook)
        task_name    = task.name.to_sym
        only, except = hook[:options].values_at(:only, :except)
        (only.empty? || only.include?(task_name)) && !except.include?(task_name)
      end
    end

  end
end
@@ -0,0 +1,26 @@
1
class Thor
  # Lets a task re-declare the method options of another task.
  module InheritedOptions

    def self.included(base)
      base.extend(ClassMethods)
    end

    module ClassMethods
      # Turns an option object into a Hash that maps each instance variable
      # name (without the leading "@") to its value.
      def option_to_hash(option)
        option.instance_variables.each_with_object({}) do |ivar, attributes|
          attributes[ivar.to_s.sub('@', '')] = option.instance_variable_get(ivar)
        end
      end

      # Declares every option of the task +from_action+ again through
      # +method_option+. When +for_action+ is given it is passed along as the
      # :for setting. The option's description is copied to :desc.
      def inherited_method_options(from_action, for_action = nil)
        tasks[from_action.to_s].options.each do |name, option|
          settings = option_to_hash(option).symbolize_keys
          settings[:for] = for_action.to_s if for_action
          settings[:desc] = settings[:description]
          method_option name, settings
        end
      end
    end

  end
end