macaron 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 Dale Ma
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,13 @@
1
+ # Macaron
2
+ Macaron is a simple web scraper implemented in Ruby. It is used for service liveness testing.
3
+
4
+ ## Install
5
+ gem install e-macaron
6
+
7
+ ## Example
8
+ ```ruby
9
+ macaron http://www.google.com/
10
+ ```
11
+
12
+ ## License
13
+ MIT LICENSE, please refer to the LICENSE file.
data/bin/macaron ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require File.dirname(__FILE__) + '/../lib/macaron'
4
+
5
+ mother = Spawner.new({
6
+ :thread_timeout_seconds => 999,
7
+ :in_site_crawling => true,
8
+ :with_waltir => false
9
+ })
10
+ mother.dig('http://rubyconf.tw/2012/', 2)
11
+ # mother.dig('http://www.sakura.idv.tw/', 2) # url, depth
12
+ puts "Success times: #{mother.success_times}"
13
+ puts "Fail times: #{mother.fail_times}"
data/lib/macaron.rb ADDED
@@ -0,0 +1,6 @@
1
+ $: << File.dirname(__FILE__)
2
+
3
+ require 'macaron/processor'
4
+ require 'macaron/spawner'
5
+ require 'macaron/scraper'
6
+ include Macaron
@@ -0,0 +1,50 @@
1
+ require 'threadpool'
2
+
3
+ module Macaron
4
+ class Processor < Job
5
+ @@output_lock = Mutex.new
6
+
7
+ def run
8
+ begin
9
+ url = @args.shift
10
+ depth = @args.shift
11
+ html = @args.shift
12
+ return if depth <= 0
13
+ scraper = Scraper.new
14
+ scraper.analyze(url, html)
15
+
16
+ @@result[url] = {:anchors => scraper.anchors}
17
+
18
+ # do some additional analyzes
19
+ run_sub_tasks(scraper)
20
+
21
+ links = nil
22
+ if @@options[:in_site_crawling]
23
+ links = scraper.internal_anchors
24
+ else
25
+ links = scraper.absolute_anchors
26
+ end
27
+
28
+ links.each { |a|
29
+ next if @@parsed_urls.include?(a)
30
+ @@task_map = @@task_map.put(a, depth - 1)
31
+ }
32
+
33
+ @@mutex.synchronize {
34
+ @@success_times += 1
35
+ }
36
+ rescue StandardError => e
37
+ @@mutex.synchronize {
38
+ @@fail_times += 1
39
+ }
40
+ print "Error on job: #{url}, msg: #{e.message}\n"
41
+ end
42
+ end
43
+
44
+ private
45
+ def run_sub_tasks(scraper)
46
+ # p scraper.image_urls
47
+ end
48
+
49
+ end
50
+ end
@@ -0,0 +1,78 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+ require 'benchmark'
5
+ require 'timeout'
6
+ require 'watir-webdriver'
7
+
8
+ module Macaron
9
+ class Scraper
10
+
11
+ def analyze(host, html)
12
+ @host = host
13
+ @html = html
14
+
15
+ elapsed_seconds = 0
16
+ begin
17
+ Timeout.timeout(@@options[:nokogiri_timeout_seconds]) do
18
+ elapsed_seconds = Benchmark.realtime { fetch_dom }
19
+ end
20
+ rescue Timeout::Error
21
+ print "Timeout on #{host}\n"
22
+ @@mutex.synchronize {
23
+ @@fail_times += 1
24
+ }
25
+ end
26
+
27
+ @all_links = absolute_anchors
28
+
29
+ print ">> elapsed #{elapsed_seconds} seconds to get '#{host}'\n"
30
+ end
31
+
32
+ def anchors
33
+ @dom.css('a')
34
+ end
35
+
36
+ def absolute_anchors
37
+ make_absolute_anchors(anchors)
38
+ end
39
+
40
+ def internal_anchors
41
+ root = URI.parse(@host).host
42
+ @all_links.select {|l| URI.parse(l).host == root}
43
+ end
44
+
45
+ def external_anchors
46
+ root = URI.parse(@host).host
47
+ @all_links.select {|l| URI.parse(l).host != root}
48
+ end
49
+
50
+ def images
51
+ @dom.css('img')
52
+ end
53
+
54
+ def image_urls
55
+ images.map { |img| make_absolute(img['src']) }.compact
56
+ end
57
+
58
+ def fetch_dom
59
+ if @@options[:with_waltir]
60
+ @dom = Nokogiri::HTML(@html)
61
+ else
62
+ @dom = Nokogiri::HTML(open(@host))
63
+ end
64
+ end
65
+
66
+ def make_absolute_anchors(nodes)
67
+ nodes.map {|n| make_absolute(n['href']) }.compact
68
+ end
69
+
70
+ def make_absolute(href)
71
+ begin
72
+ URI.parse(@host).merge(URI.parse(href)).to_s
73
+ rescue
74
+ nil
75
+ end
76
+ end
77
+ end
78
+ end
@@ -0,0 +1,77 @@
1
+ require 'rubygems'
2
+ require 'threadpool'
3
+ require 'hamster'
4
+
5
+ module Macaron
6
+ @@result = {}
7
+ @@parsed_urls = Hamster.set
8
+ @@task_map = Hamster.hash
9
+ @@options = {}
10
+ @@success_times = 0
11
+ @@fail_times = 0
12
+ @@mutex = Mutex.new
13
+
14
+ class Spawner
15
+ DEFAULT_OPTIONS = {
16
+ :nokogiri_timeout_seconds => 30,
17
+ :thread_timeout_seconds => 40,
18
+ :pages => 1000,
19
+ :initial_workers => 1,
20
+ :maximum_workers => 1,
21
+ :in_site_crawling => true,
22
+ :with_waltir => false
23
+ }.freeze
24
+
25
+ def initialize(options = {})
26
+ @@options = DEFAULT_OPTIONS.merge(options)
27
+ @threadpool = Threadpool.new(
28
+ @@options[:initial_workers],
29
+ @@options[:maximum_workers],
30
+ @@options[:thread_timeout_seconds]
31
+ )
32
+ end
33
+
34
+ def success_times
35
+ @@success_times
36
+ end
37
+
38
+ def fail_times
39
+ @@fail_times
40
+ end
41
+
42
+ def dig(url, init_depth)
43
+ @@task_map = @@task_map.put(url, init_depth)
44
+ loop do
45
+ @@task_map = @@task_map.remove {|url, depth|
46
+ @@parsed_urls = @@parsed_urls.add(url)
47
+
48
+ if @@options[:with_waltir]
49
+ html = get_html_via_waltir(url)
50
+ @threadpool.load(Processor.new(url, depth, html))
51
+ else
52
+ @threadpool.load(Processor.new(url, depth))
53
+ end
54
+ }
55
+
56
+ break if @threadpool.busy_workers_count == 0 && @@task_map.empty?
57
+
58
+ if @@success_times > @@options[:pages]
59
+ print "Fetched pages exceeds the limit #{@@options[:pages]}\n"
60
+ break
61
+ end
62
+ end
63
+
64
+ @bot.close unless @bot.nil?
65
+
66
+ # puts "result: #{@@result.size}, #{@@result.keys}"
67
+ end
68
+
69
+ private
70
+ def get_html_via_waltir(url)
71
+ @bot ||= Watir::Browser.new
72
+ @bot.goto(url)
73
+ @bot.html
74
+ end
75
+
76
+ end
77
+ end
metadata ADDED
@@ -0,0 +1,51 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: macaron
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Dale Ma
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-11-17 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description:
15
+ email: dalema22@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - bin/macaron
21
+ - lib/macaron/processor.rb
22
+ - lib/macaron/scraper.rb
23
+ - lib/macaron/spawner.rb
24
+ - lib/macaron.rb
25
+ - LICENSE
26
+ - README.md
27
+ homepage: http://github.com/eguitarz/macaron
28
+ licenses: []
29
+ post_install_message:
30
+ rdoc_options: []
31
+ require_paths:
32
+ - lib
33
+ required_ruby_version: !ruby/object:Gem::Requirement
34
+ none: false
35
+ requirements:
36
+ - - ! '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ! '>='
43
+ - !ruby/object:Gem::Version
44
+ version: '0'
45
+ requirements: []
46
+ rubyforge_project:
47
+ rubygems_version: 1.8.19
48
+ signing_key:
49
+ specification_version: 3
50
+ summary: Ruby based web scraper
51
+ test_files: []