macaron 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README.md +13 -0
- data/bin/macaron +13 -0
- data/lib/macaron.rb +6 -0
- data/lib/macaron/processor.rb +50 -0
- data/lib/macaron/scraper.rb +78 -0
- data/lib/macaron/spawner.rb +77 -0
- metadata +51 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Dale Ma
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Macaron
|
2
|
+
Macaron is a simple web scraper implemented in Ruby. It can be used for service liveness testing.
|
3
|
+
|
4
|
+
## Install
|
5
|
+
gem install e-macaron
|
6
|
+
|
7
|
+
## Example
|
8
|
+
```sh
|
9
|
+
macaron http://www.google.com/
|
10
|
+
```
|
11
|
+
|
12
|
+
## License
|
13
|
+
MIT LICENSE, please refer to the LICENSE file.
|
data/bin/macaron
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line entry point: crawls a start URL to the given depth and
# reports how many pages were fetched successfully vs. failed.
# (Fixed shebang: original read "#!/user/bin/ruby", a typo for /usr.)

require File.dirname(__FILE__) + '/../lib/macaron'

mother = Spawner.new({
  :thread_timeout_seconds => 999,
  :in_site_crawling => true,
  :with_waltir => false
})
mother.dig('http://rubyconf.tw/2012/', 2)
# mother.dig('http://www.sakura.idv.tw/', 2) # url, depth
puts "Success times: #{mother.success_times}"
puts "Fail times: #{mother.fail_times}"
|
data/lib/macaron.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'threadpool'

module Macaron
  # A threadpool Job that analyzes one fetched page and schedules the links
  # it finds as new crawl tasks (breadth-first, bounded by remaining depth).
  #
  # Relies on module-level shared state (@@result, @@options, @@parsed_urls,
  # @@task_map, @@mutex, counters) declared alongside Spawner.
  # NOTE(review): class-variable lookup across the enclosing module is
  # version-sensitive in Ruby — confirm these resolve as intended.
  class Processor < Job
    # Serializes output writes (currently unused in this block).
    @@output_lock = Mutex.new

    # Job arguments (consumed from @args): url, remaining depth,
    # and optionally pre-fetched HTML (when crawling via Watir).
    def run
      url = @args.shift
      depth = @args.shift
      html = @args.shift
      return if depth <= 0

      scraper = Scraper.new
      scraper.analyze(url, html)

      @@result[url] = { :anchors => scraper.anchors }

      # do some additional analyses
      run_sub_tasks(scraper)

      # Honor the in-site-only setting when deciding which links to follow.
      links = @@options[:in_site_crawling] ? scraper.internal_anchors : scraper.absolute_anchors

      links.each { |a|
        next if @@parsed_urls.include?(a)
        @@task_map = @@task_map.put(a, depth - 1)
      }

      @@mutex.synchronize {
        @@success_times += 1
      }
    rescue StandardError => e
      # Was `rescue Exception`, which also swallows signals and SystemExit;
      # StandardError keeps the intended "count the failure and keep
      # crawling" behavior without masking process-level errors.
      @@mutex.synchronize {
        @@fail_times += 1
      }
      print "Error on job: #{url}, msg: #{e.message}\n"
    end

    private

    # Hook for extra per-page analysis (e.g. image URLs); intentionally a no-op.
    def run_sub_tasks(scraper)
      # p scraper.image_urls
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'benchmark'
require 'timeout'
require 'watir-webdriver'

module Macaron
  # Wraps a parsed Nokogiri document for a single page and answers
  # questions about its links and images.
  class Scraper
    # Fetches/parses the page at `host` (or the pre-fetched `html` when
    # Watir is in use), caching all absolute links in @all_links.
    # A parse timeout is counted as a failure and the crawl continues.
    def analyze(host, html)
      @host = host
      @html = html

      elapsed_seconds = 0
      begin
        # Kernel#timeout is deprecated (and removed in modern Rubies);
        # the module method Timeout.timeout is the supported form.
        Timeout.timeout(@@options[:nokogiri_timeout_seconds]) do
          elapsed_seconds = Benchmark.realtime { fetch_dom }
        end
      rescue Timeout::Error
        print "Timeout on #{host}\n"
        @@mutex.synchronize {
          @@fail_times += 1
        }
      end

      @all_links = absolute_anchors

      print ">> elapsed #{elapsed_seconds} seconds to get '#{host}'\n"
    end

    # All <a> nodes in the document.
    def anchors
      @dom.css('a')
    end

    # Anchor hrefs resolved to absolute URLs (unresolvable ones dropped).
    def absolute_anchors
      make_absolute_anchors(anchors)
    end

    # Links whose host matches the page being analyzed.
    def internal_anchors
      root = URI.parse(@host).host
      @all_links.select { |l| URI.parse(l).host == root }
    end

    # Links pointing at a different host.
    def external_anchors
      root = URI.parse(@host).host
      @all_links.select { |l| URI.parse(l).host != root }
    end

    # All <img> nodes in the document.
    def images
      @dom.css('img')
    end

    # Absolute URLs of every image (unresolvable ones dropped).
    def image_urls
      images.map { |img| make_absolute(img['src']) }.compact
    end

    # Parses the pre-fetched HTML when Watir supplied it; otherwise
    # fetches the page over HTTP via open-uri.
    def fetch_dom
      if @@options[:with_waltir]
        @dom = Nokogiri::HTML(@html)
      else
        @dom = Nokogiri::HTML(open(@host))
      end
    end

    # Maps every node's href through make_absolute, dropping failures.
    def make_absolute_anchors(nodes)
      nodes.map { |n| make_absolute(n['href']) }.compact
    end

    # Resolves `href` against the page URL; returns nil when unparsable.
    def make_absolute(href)
      URI.parse(@host).merge(URI.parse(href)).to_s
    rescue
      nil
    end
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'rubygems'
require 'threadpool'
require 'hamster'

module Macaron
  # Crawl-wide shared state, read and mutated by Spawner and the worker jobs.
  @@result = {}
  @@parsed_urls = Hamster.set
  @@task_map = Hamster.hash
  @@options = {}
  @@success_times = 0
  @@fail_times = 0
  @@mutex = Mutex.new

  # Drives a crawl: maintains a (url => remaining-depth) task map and feeds
  # pending tasks into a thread pool until the pool is idle and no tasks
  # remain, or the configured page limit is exceeded.
  class Spawner
    DEFAULT_OPTIONS = {
      :nokogiri_timeout_seconds => 30,
      :thread_timeout_seconds => 40,
      :pages => 1000,
      :initial_workers => 1,
      :maximum_workers => 1,
      :in_site_crawling => true,
      :with_waltir => false
    }.freeze
    # Backward-compatible alias for the historical misspelling.
    DEFALUT_OPTIONS = DEFAULT_OPTIONS

    # Merges caller options over the defaults and builds the thread pool.
    def initialize(options = {})
      @@options = DEFAULT_OPTIONS.merge(options)
      @threadpool = Threadpool.new(
        @@options[:initial_workers],
        @@options[:maximum_workers],
        @@options[:thread_timeout_seconds]
      )
    end

    # Number of pages processed successfully so far.
    def success_times
      @@success_times
    end

    # Number of pages that failed (errors or timeouts).
    def fail_times
      @@fail_times
    end

    # Crawls starting from `url` down to `init_depth` levels, blocking
    # until the crawl finishes or the page limit is exceeded.
    def dig(url, init_depth)
      @@task_map = @@task_map.put(url, init_depth)
      loop do
        # Block param renamed from `url` to avoid shadowing the method argument.
        @@task_map = @@task_map.remove { |task_url, depth|
          @@parsed_urls = @@parsed_urls.add(task_url)

          if @@options[:with_waltir]
            html = get_html_via_waltir(task_url)
            @threadpool.load(Processor.new(task_url, depth, html))
          else
            @threadpool.load(Processor.new(task_url, depth))
          end
        }

        break if @threadpool.busy_workers_count == 0 && @@task_map.empty?

        if @@success_times > @@options[:pages]
          print "Fetched pages exceeds the limit #{@@options[:pages]}\n"
          break
        end
      end

      # Shut down the Watir browser if one was ever started.
      @bot.close unless @bot.nil?

      # puts "result: #{@@result.size}, #{@@result.keys}"
    end

    private

    # Lazily starts a Watir browser and returns the rendered HTML for `url`.
    def get_html_via_waltir(url)
      @bot ||= Watir::Browser.new
      @bot.goto(url)
      @bot.html
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: macaron
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Dale Ma
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-11-17 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description:
|
15
|
+
email: dalema22@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- bin/macaron
|
21
|
+
- lib/macaron/processor.rb
|
22
|
+
- lib/macaron/scraper.rb
|
23
|
+
- lib/macaron/spawner.rb
|
24
|
+
- lib/macaron.rb
|
25
|
+
- LICENSE
|
26
|
+
- README.md
|
27
|
+
homepage: http://github.com/eguitarz/macaron
|
28
|
+
licenses: []
|
29
|
+
post_install_message:
|
30
|
+
rdoc_options: []
|
31
|
+
require_paths:
|
32
|
+
- lib
|
33
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
34
|
+
none: false
|
35
|
+
requirements:
|
36
|
+
- - ! '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ! '>='
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: '0'
|
45
|
+
requirements: []
|
46
|
+
rubyforge_project:
|
47
|
+
rubygems_version: 1.8.19
|
48
|
+
signing_key:
|
49
|
+
specification_version: 3
|
50
|
+
summary: Ruby based web scraper
|
51
|
+
test_files: []
|