macaron 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2012 Dale Ma
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,13 @@
+ # Macaron
+ Macaron is a simple web scraper implemented in Ruby. It's used for service liveness testing.
+
+ ## Install
+     gem install macaron
+
+ ## Example
+ ```sh
+ macaron http://www.google.com/
+ ```
+
+ ## License
+ MIT LICENSE, please refer to the LICENSE file.
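The bundled `bin/macaron` script (below) drives the crawler from Ruby rather than through a CLI; a minimal sketch of the same library-level usage, assuming the option names from `Spawner::DEFAULT_OPTIONS` and a placeholder start URL:

```ruby
require 'macaron'

# Crawl the start URL and pages up to 2 levels deep, staying on the same site.
# :with_watir => false means pages are fetched with open-uri instead of a browser.
spawner = Spawner.new(
  :in_site_crawling => true,
  :with_watir => false
)
spawner.dig('http://example.com/', 2) # url, depth
puts "Success times: #{spawner.success_times}"
puts "Fail times: #{spawner.fail_times}"
```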
data/bin/macaron ADDED
@@ -0,0 +1,13 @@
+ #!/usr/bin/env ruby
+
+ require File.dirname(__FILE__) + '/../lib/macaron'
+
+ mother = Spawner.new({
+   :thread_timeout_seconds => 999,
+   :in_site_crawling => true,
+   :with_watir => false
+ })
+ mother.dig('http://rubyconf.tw/2012/', 2) # url, depth
+ # mother.dig('http://www.sakura.idv.tw/', 2)
+ puts "Success times: #{mother.success_times}"
+ puts "Fail times: #{mother.fail_times}"
data/lib/macaron.rb ADDED
@@ -0,0 +1,6 @@
+ $: << File.dirname(__FILE__)
+
+ require 'macaron/processor'
+ require 'macaron/spawner'
+ require 'macaron/scraper'
+ include Macaron
data/lib/macaron/processor.rb ADDED
@@ -0,0 +1,52 @@
+ require 'threadpool'
+
+ module Macaron
+   class Processor < Job
+     @@output_lock = Mutex.new
+
+     def run
+       begin
+         url = @args.shift
+         depth = @args.shift
+         html = @args.shift
+         return if depth <= 0
+         scraper = Scraper.new
+         scraper.analyze(url, html)
+
+         @@result[url] = {:anchors => scraper.anchors}
+
+         # do some additional analyses
+         run_sub_tasks(scraper)
+
+         # Stay on the starting site unless cross-site crawling is enabled.
+         links = nil
+         if @@options[:in_site_crawling]
+           links = scraper.internal_anchors
+         else
+           links = scraper.absolute_anchors
+         end
+
+         # Queue unvisited links for the next round, one level shallower.
+         links.each { |a|
+           next if @@parsed_urls.include?(a)
+           @@task_map = @@task_map.put(a, depth - 1)
+         }
+
+         @@mutex.synchronize {
+           @@success_times += 1
+         }
+       rescue Exception => e
+         @@mutex.synchronize {
+           @@fail_times += 1
+         }
+         print "Error on job: #{url}, msg: #{e.message}\n"
+       end
+     end
+
+     private
+     def run_sub_tasks(scraper)
+       # p scraper.image_urls
+     end
+   end
+ end
data/lib/macaron/scraper.rb ADDED
@@ -0,0 +1,82 @@
+ require 'rubygems'
+ require 'nokogiri'
+ require 'open-uri'
+ require 'benchmark'
+ require 'timeout'
+ require 'watir-webdriver'
+
+ module Macaron
+   class Scraper
+
+     def analyze(host, html)
+       @host = host
+       @html = html
+
+       elapsed_seconds = 0
+       begin
+         Timeout.timeout(@@options[:nokogiri_timeout_seconds]) do
+           elapsed_seconds = Benchmark.realtime { fetch_dom }
+         end
+       rescue Timeout::Error
+         print "Timeout on #{host}\n"
+         @@mutex.synchronize {
+           @@fail_times += 1
+         }
+       end
+
+       @all_links = absolute_anchors
+
+       print ">> elapsed #{elapsed_seconds} seconds to get '#{host}'\n"
+     end
+
+     def anchors
+       @dom.css('a')
+     end
+
+     def absolute_anchors
+       make_absolute_anchors(anchors)
+     end
+
+     # Links whose host matches the page being analyzed.
+     def internal_anchors
+       root = URI.parse(@host).host
+       @all_links.select {|l| URI.parse(l).host == root}
+     end
+
+     def external_anchors
+       root = URI.parse(@host).host
+       @all_links.select {|l| URI.parse(l).host != root}
+     end
+
+     def images
+       @dom.css('img')
+     end
+
+     def image_urls
+       images.map { |img| make_absolute(img['src']) }.compact
+     end
+
+     # With Watir the rendered HTML is handed in by the Spawner;
+     # otherwise fetch the page directly through open-uri.
+     def fetch_dom
+       if @@options[:with_watir]
+         @dom = Nokogiri::HTML(@html)
+       else
+         @dom = Nokogiri::HTML(open(@host))
+       end
+     end
+
+     def make_absolute_anchors(nodes)
+       nodes.map {|n| make_absolute(n['href']) }.compact
+     end
+
+     # Resolve a possibly relative href against the page URL; nil if unparseable.
+     def make_absolute(href)
+       begin
+         URI.parse(@host).merge(URI.parse(href)).to_s
+       rescue
+         nil
+       end
+     end
+   end
+ end
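`Scraper#make_absolute` above is plain stdlib `URI` resolution; a self-contained sketch of the same behavior, with hypothetical example URLs:

```ruby
require 'uri'

# Resolve a (possibly relative) href against the page URL; nil if unparseable,
# mirroring Scraper#make_absolute.
def make_absolute(base, href)
  URI.parse(base).merge(URI.parse(href)).to_s
rescue URI::Error
  nil
end

puts make_absolute('http://rubyconf.tw/2012/', 'speakers.html')
# => http://rubyconf.tw/2012/speakers.html
puts make_absolute('http://rubyconf.tw/2012/', 'http://example.com/')
# => http://example.com/
p make_absolute('http://rubyconf.tw/2012/', 'java script:void(0)')
# => nil (the embedded space makes URI.parse raise)
```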
data/lib/macaron/spawner.rb ADDED
@@ -0,0 +1,78 @@
+ require 'rubygems'
+ require 'threadpool'
+ require 'hamster'
+
+ module Macaron
+   @@result = {}
+   @@parsed_urls = Hamster.set
+   @@task_map = Hamster.hash
+   @@options = {}
+   @@success_times = 0
+   @@fail_times = 0
+   @@mutex = Mutex.new
+
+   class Spawner
+     DEFAULT_OPTIONS = {
+       :nokogiri_timeout_seconds => 30,
+       :thread_timeout_seconds => 40,
+       :pages => 1000,
+       :initial_workers => 1,
+       :maximum_workers => 1,
+       :in_site_crawling => true,
+       :with_watir => false
+     }.freeze
+
+     def initialize(options = {})
+       @@options = DEFAULT_OPTIONS.merge(options)
+       @threadpool = Threadpool.new(
+         @@options[:initial_workers],
+         @@options[:maximum_workers],
+         @@options[:thread_timeout_seconds]
+       )
+     end
+
+     def success_times
+       @@success_times
+     end
+
+     def fail_times
+       @@fail_times
+     end
+
+     def dig(url, init_depth)
+       @@task_map = @@task_map.put(url, init_depth)
+       loop do
+         # Drain the frontier: schedule every queued url and drop it from the map.
+         @@task_map = @@task_map.remove {|task_url, depth|
+           @@parsed_urls = @@parsed_urls.add(task_url)
+
+           if @@options[:with_watir]
+             html = get_html_via_watir(task_url)
+             @threadpool.load(Processor.new(task_url, depth, html))
+           else
+             @threadpool.load(Processor.new(task_url, depth))
+           end
+         }
+
+         break if @threadpool.busy_workers_count == 0 && @@task_map.empty?
+
+         if @@success_times > @@options[:pages]
+           print "Fetched page count exceeds the limit #{@@options[:pages]}\n"
+           break
+         end
+       end
+
+       @bot.close unless @bot.nil?
+
+       # puts "result: #{@@result.size}, #{@@result.keys}"
+     end
+
+     private
+     def get_html_via_watir(url)
+       @bot ||= Watir::Browser.new
+       @bot.goto(url)
+       @bot.html
+     end
+   end
+ end
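`Spawner#dig` above repeatedly drains the immutable `@@task_map`: each pass schedules every queued URL on the thread pool, the processors enqueue newly discovered links at `depth - 1`, and the loop ends once the pool is idle and the map is empty (or the page limit is hit). A rough single-threaded sketch of that frontier pattern, using stdlib types rather than the Hamster structures the gem uses:

```ruby
require 'set'

# frontier maps url => remaining depth; seen guards against revisits.
frontier = { 'http://example.com/' => 2 }
seen = Set.new

until frontier.empty?
  url, depth = frontier.shift
  next unless seen.add?(url)   # add? returns nil if url was already processed
  next if depth <= 0
  links = []                   # stand-in for Scraper#internal_anchors
  links.each do |link|
    frontier[link] = depth - 1 unless seen.include?(link)
  end
end
```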
metadata ADDED
@@ -0,0 +1,51 @@
+ --- !ruby/object:Gem::Specification
+ name: macaron
+ version: !ruby/object:Gem::Version
+   version: 1.0.0
+ prerelease:
+ platform: ruby
+ authors:
+ - Dale Ma
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-11-17 00:00:00.000000000 Z
+ dependencies: []
+ description:
+ email: dalema22@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/macaron
+ - lib/macaron/processor.rb
+ - lib/macaron/scraper.rb
+ - lib/macaron/spawner.rb
+ - lib/macaron.rb
+ - LICENSE
+ - README.md
+ homepage: http://github.com/eguitarz/macaron
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.19
+ signing_key:
+ specification_version: 3
+ summary: Ruby based web scraper
+ test_files: []