macaron 1.0.2 → 2.0.1
- data/bin/macaron +3 -6
- data/lib/macaron.rb +2 -2
- data/lib/macaron/crawler.rb +23 -0
- data/lib/macaron/page.rb +73 -0
- data/lib/macaron/spawner.rb +53 -59
- metadata +18 -26
- data/lib/macaron/processor.rb +0 -62
- data/lib/macaron/scraper.rb +0 -79
data/bin/macaron
CHANGED

@@ -17,7 +17,7 @@ options_parser = OptionParser.new do |opts|
   end
 
   opts.on("-j", "--javascript", "Open javascript support mode") do |j|
-    options[:
+    options[:with_watir] = j
   end
 
   opts.on("-s", "--save", "Save html") do |s|
@@ -38,9 +38,6 @@ if ARGV.length != 1
 end
 
 url = ARGV.first
-puts "
+puts "Started"
 
-
-mother.dig(url, options[:depth])
-puts "Success times: #{mother.success_times}"
-puts "Fail times: #{mother.fail_times}"
+Spawner.new(url, options)
data/lib/macaron/crawler.rb
ADDED

@@ -0,0 +1,23 @@
+require 'observer'
+require 'timeout'
+require 'threadpool'
+
+module Macaron
+  class Crawler < Job
+    include Observable
+
+    def run
+      url, bot = @args
+      page = Page.new(url, bot)
+      links = []
+      begin
+        links = page.fetch.inner_links
+      rescue
+      end
+      changed
+      notify_observers(links)
+      print "#{url} >> #{page.title}\n"
+      delete_observers
+    end
+  end
+end
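The new Crawler is a threadpool Job that fetches one Page, notifies its observers with the links it found, and prints the page title. A minimal sketch of consuming it outside of Spawner, assuming threadpool's Job stores its constructor arguments in @args and the pool invokes #run (LinkLogger and the URL are hypothetical):

# Hypothetical observer; anything responding to #update(links) can subscribe.
class LinkLogger
  def update(links)
    links.each { |l| puts "found #{l}" }
  end
end

job = Macaron::Crawler.new('http://example.com', nil)  # nil bot => plain open-uri fetch
job.add_observer(LinkLogger.new)
# In the gem this job is then loaded into a Threadpool, which calls #run.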
data/lib/macaron/page.rb
ADDED

@@ -0,0 +1,73 @@
+require 'open-uri'
+require 'nokogiri'
+require 'thread'
+
+module Macaron
+  class Page
+    def initialize(url, bot=nil)
+      @url = url
+      @bot = bot
+      @@bot_lock = Mutex.new
+    end
+
+    def fetch
+      document
+      base(@url)
+      self
+    end
+
+    def inner_links
+      anchors = links.select {|link|
+        URI.parse(link).host == @base.host
+      }.compact
+    end
+
+    def title
+      @doc.title
+    end
+
+    private
+    def document
+      @doc ||= Nokogiri::HTML(content)
+    end
+
+    def base(href)
+      base = @doc.css('base')
+      header_base_url = base.attr('href').text unless base.empty?
+      base_url = header_base_url || @url
+      @base ||= URI.parse(base_url)
+    end
+
+    def content
+      if @bot
+        # only activate one browser, needs to be thread safe.
+        @@bot_lock.synchronize {
+          @bot.goto(@url)
+          @bot.html
+        }
+      else
+        open(@url)
+      end
+    end
+
+    def links
+      @doc.css('a').map {|a|
+        href = a['href']
+        if href.start_with? 'http'
+          href
+        else
+          make_absolute(href)
+        end
+      }.compact
+    end
+
+    def make_absolute(href)
+      begin
+        @base.merge(URI.parse(href)).to_s
+      rescue
+        nil
+      end
+    end
+
+  end
+end
data/lib/macaron/spawner.rb
CHANGED

@@ -1,79 +1,73 @@
-require '
-require '
-require '
+require 'timeout'
+require 'observer'
+require 'watir-webdriver'
 
 module Macaron
-  @@result = {}
-  @@parsed_urls = Hamster.set
-  @@task_map = Hamster.hash
-  @@options = {}
-  @@success_times = 0
-  @@fail_times = 0
-  @@mutex = Mutex.new
-
   class Spawner
-
-
-      :thread_timeout_seconds => 40,
-      :pages => 1000,
-      :initial_workers => 4,
-      :maximum_workers => 4,
-      :in_site_crawling => true,
-      :with_waltir => false,
-      :debug => false
-    }.freeze
+    def initialize(url, options)
+      @options = options
 
-
-
-      @threadpool = Threadpool.new(
-        @@options[:initial_workers],
-        @@options[:maximum_workers],
-        @@options[:thread_timeout_seconds]
-      )
-    end
+      # threadpool(init workers, max workers, job timeout)
+      threadpool = Threadpool.new(10, 10, job_timeout)
 
-
-
-
+      # tasks saves the on-processing urls
+      @tasks = Queue.new
+      @tasks << url
 
-
-
-
+      # parsed_urls used to prevent loop crawling
+      @parsed_urls = [url]
+
+      # awaiting_counter saves the awaiting task number
+      @awaiting_counter = 1
+
+      # bot is a webdriver
+      bot = Watir::Browser.new if @options[:with_watir]
 
-    def dig(url, init_depth=3)
-      @@task_map = @@task_map.put(url, init_depth)
       loop do
-
-
+        break if @awaiting_counter == 0
+
+        begin
+          Timeout::timeout(task_timeout) { url = @tasks.shift }
+        rescue
+          next
+        end
 
-
-
-          @threadpool.load(Processor.new(url, depth, html))
-        else
-          @threadpool.load(Processor.new(url, depth))
-        end
+        job = Macaron::Crawler.new(url, bot)
+        job.add_observer(self)
 
-
-
+        threadpool.load(job)
+      end
 
-
+      bot.close unless bot.nil?
+    end
 
-
-
-
+    def update(links)
+      @awaiting_counter -= 1
+      links.each do |link|
+        unless @parsed_urls.include?(link)
+          @tasks << link
+          @awaiting_counter += 1
        end
+        @parsed_urls << link
      end
+    end
 
-
-
-
+    private
+    def task_timeout
+      # webdriver is slow, it takes more time to wait the result.
+      if @options[:with_watir]
+        10
+      else
+        2
+      end
    end
 
-
-
-
-
-
+    def job_timeout
+      if @options[:with_watir]
+        20
+      else
+        10
+      end
    end
 
 end
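Spawner now runs the whole crawl from its constructor, exactly as bin/macaron does after this change: it drains the task queue, re-enqueues unseen same-host links reported by each Crawler, and stops when no tasks are left awaiting. A hedged sketch of driving it programmatically; the option key comes from the diffs above, while the entry require and the URL are assumptions:

require 'macaron'   # assumed to load Spawner, Crawler and Page

options = {
  :with_watir => false   # true would start a real browser via watir-webdriver
}

# Blocks until @awaiting_counter drops to zero, i.e. every reachable in-site link has been crawled.
Macaron::Spawner.new('http://example.com', options)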
metadata
CHANGED

@@ -1,61 +1,53 @@
 --- !ruby/object:Gem::Specification
 name: macaron
 version: !ruby/object:Gem::Version
-
-
+  version: 2.0.1
+  prerelease:
 platform: ruby
 authors:
 - Dale Ma
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-
+date: 2012-12-11 00:00:00.000000000 Z
 dependencies: []
-description:
+description:
 email: dalema22@gmail.com
 executables:
 - macaron
 extensions: []
 extra_rdoc_files: []
 files:
--
-
--
-
--
-
-- !binary |-
-  bGliL21hY2Fyb24vc2NyYXBlci5yYg==
-- !binary |-
-  bGliL21hY2Fyb24vc3Bhd25lci5yYg==
-- !binary |-
-  bGliL21hY2Fyb24vdmVyc2lvbi5yYg==
+- bin/macaron
+- lib/macaron/crawler.rb
+- lib/macaron/page.rb
+- lib/macaron/spawner.rb
+- lib/macaron/version.rb
+- lib/macaron.rb
 - LICENSE
 - README.md
 homepage: http://github.com/eguitarz/macaron
 licenses: []
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
 requirements:
 - - ! '>='
   - !ruby/object:Gem::Version
-    version:
-      MA==
-  none: false
+    version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
 requirements:
 - - ! '>='
   - !ruby/object:Gem::Version
-    version:
-      MA==
-  none: false
+    version: '0'
 requirements: []
-rubyforge_project:
+rubyforge_project:
 rubygems_version: 1.8.24
-signing_key:
+signing_key:
 specification_version: 3
 summary: Ruby based web scraper
 test_files: []
data/lib/macaron/processor.rb
DELETED

@@ -1,62 +0,0 @@
-require 'rubygems'
-require 'threadpool'
-
-module Macaron
-  class Processor < Job
-    @@output_lock = Mutex.new
-
-    def run
-      begin
-        url = @args.shift
-        depth = @args.shift
-        html = @args.shift
-        return if depth <= 0
-        scraper = Scraper.new
-        scraper.analyze(url, html)
-
-        # @@result[url] = {:anchors => scraper.anchors}
-        @@result[url] = true;
-
-        # do some additional analyzes
-        run_sub_tasks(scraper)
-
-        links = nil
-        if @@options[:in_site_crawling]
-          links = scraper.internal_anchors
-        else
-          links = scraper.absolute_anchors
-        end
-        puts "found #{links.size} links on #{url}" if @@options[:debug]
-
-        links.each { |a|
-          next if @@parsed_urls.include?(a)
-          p "put #{a} into tasks" if @@options[:debug]
-          @@task_map = @@task_map.put(a, depth - 1)
-        }
-
-        @@mutex.synchronize {
-          @@success_times += 1
-        }
-      rescue Exception => e
-        @@mutex.synchronize {
-          @@fail_times += 1
-        }
-        print "Error on job: #{url}, msg: #{e.message}\n"
-      end
-    end
-
-    private
-    def run_sub_tasks(scraper)
-      # p scraper.image_urls
-
-      if @@options[:save]
-        dir = @@options[:dir] || '/tmp'
-        filename = scraper.host.gsub('/', '\\')
-        File.open(File.join(dir, filename), "w+") do |f|
-          f.write(scraper.dom)
-        end
-      end
-    end
-
-  end
-end
data/lib/macaron/scraper.rb
DELETED

@@ -1,79 +0,0 @@
-require 'rubygems'
-require 'nokogiri'
-require 'open-uri'
-require 'benchmark'
-require 'timeout'
-require 'watir-webdriver'
-
-module Macaron
-  class Scraper
-    attr_reader :dom, :host
-
-    def analyze(host, html)
-      @host = host
-      @html = html
-
-      elapsed_seconds = 0
-      begin
-        timeout(@@options[:nokogiri_timeout_seconds]) do
-          elapsed_seconds = Benchmark.realtime { fetch_dom }
-        end
-      rescue Timeout::Error
-        print "Timeout on #{host}\n"
-        @@mutex.synchronize {
-          @@fail_times += 1
-        }
-      end
-
-      @all_links = absolute_anchors
-
-      print ">> elapsed #{elapsed_seconds} seconds to get '#{host}'\n"
-    end
-
-    def anchors
-      @dom.css('a')
-    end
-
-    def absolute_anchors
-      make_absolute_anchors(anchors)
-    end
-
-    def internal_anchors
-      root = URI.parse(@host).host
-      @all_links.select {|l| URI.parse(l).host == root}
-    end
-
-    def external_anchors
-      root = URI.parse(@host).host
-      @all_links.select {|l| URI.parse(l).host != root}
-    end
-
-    def images
-      @dom.css('img')
-    end
-
-    def image_urls
-      images.map { |img| make_absolute(img['src']) }.compact
-    end
-
-    def fetch_dom
-      unless @@options[:with_waltir]
-        @html = open(@host)
-      end
-
-      @dom = Nokogiri::HTML(@html)
-    end
-
-    def make_absolute_anchors(nodes)
-      nodes.map {|n| make_absolute(n['href']) }.compact
-    end
-
-    def make_absolute(href)
-      begin
-        URI.parse(@host).merge(URI.parse(href)).to_s
-      rescue
-        nil
-      end
-    end
-  end
-end