macaron 1.0.2 → 2.0.1
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- data/bin/macaron +3 -6
- data/lib/macaron.rb +2 -2
- data/lib/macaron/crawler.rb +23 -0
- data/lib/macaron/page.rb +73 -0
- data/lib/macaron/spawner.rb +53 -59
- metadata +18 -26
- data/lib/macaron/processor.rb +0 -62
- data/lib/macaron/scraper.rb +0 -79
data/bin/macaron
CHANGED
|
@@ -17,7 +17,7 @@ options_parser = OptionParser.new do |opts|
|
|
|
17
17
|
end
|
|
18
18
|
|
|
19
19
|
opts.on("-j", "--javascript", "Open javascript support mode") do |j|
|
|
20
|
-
options[:
|
|
20
|
+
options[:with_watir] = j
|
|
21
21
|
end
|
|
22
22
|
|
|
23
23
|
opts.on("-s", "--save", "Save html") do |s|
|
|
@@ -38,9 +38,6 @@ if ARGV.length != 1
|
|
|
38
38
|
end
|
|
39
39
|
|
|
40
40
|
url = ARGV.first
|
|
41
|
-
puts "
|
|
41
|
+
puts "Started"
|
|
42
42
|
|
|
43
|
-
|
|
44
|
-
mother.dig(url, options[:depth])
|
|
45
|
-
puts "Success times: #{mother.success_times}"
|
|
46
|
-
puts "Fail times: #{mother.fail_times}"
|
|
43
|
+
Spawner.new(url, options)
|
data/lib/macaron.rb
CHANGED
data/lib/macaron/crawler.rb
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'observer'
|
|
2
|
+
require 'timeout'
|
|
3
|
+
require 'threadpool'
|
|
4
|
+
|
|
5
|
+
module Macaron
|
|
6
|
+
class Crawler < Job
|
|
7
|
+
include Observable
|
|
8
|
+
|
|
9
|
+
def run
|
|
10
|
+
url, bot = @args
|
|
11
|
+
page = Page.new(url, bot)
|
|
12
|
+
links = []
|
|
13
|
+
begin
|
|
14
|
+
links = page.fetch.inner_links
|
|
15
|
+
rescue
|
|
16
|
+
end
|
|
17
|
+
changed
|
|
18
|
+
notify_observers(links)
|
|
19
|
+
print "#{url} >> #{page.title}\n"
|
|
20
|
+
delete_observers
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
data/lib/macaron/page.rb
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
require 'open-uri'
|
|
2
|
+
require 'nokogiri'
|
|
3
|
+
require 'thread'
|
|
4
|
+
|
|
5
|
+
module Macaron
|
|
6
|
+
class Page
|
|
7
|
+
def initialize(url, bot=nil)
|
|
8
|
+
@url = url
|
|
9
|
+
@bot = bot
|
|
10
|
+
@@bot_lock = Mutex.new
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def fetch
|
|
14
|
+
document
|
|
15
|
+
base(@url)
|
|
16
|
+
self
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def inner_links
|
|
20
|
+
anchors = links.select {|link|
|
|
21
|
+
URI.parse(link).host == @base.host
|
|
22
|
+
}.compact
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def title
|
|
26
|
+
@doc.title
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
private
|
|
30
|
+
def document
|
|
31
|
+
@doc ||= Nokogiri::HTML(content)
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def base(href)
|
|
35
|
+
base = @doc.css('base')
|
|
36
|
+
header_base_url = base.attr('href').text unless base.empty?
|
|
37
|
+
base_url = header_base_url || @url
|
|
38
|
+
@base ||= URI.parse(base_url)
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def content
|
|
42
|
+
if @bot
|
|
43
|
+
# only activate one browser, needs to be thread safe.
|
|
44
|
+
@@bot_lock.synchronize {
|
|
45
|
+
@bot.goto(@url)
|
|
46
|
+
@bot.html
|
|
47
|
+
}
|
|
48
|
+
else
|
|
49
|
+
open(@url)
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def links
|
|
54
|
+
@doc.css('a').map {|a|
|
|
55
|
+
href = a['href']
|
|
56
|
+
if href.start_with? 'http'
|
|
57
|
+
href
|
|
58
|
+
else
|
|
59
|
+
make_absolute(href)
|
|
60
|
+
end
|
|
61
|
+
}.compact
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def make_absolute(href)
|
|
65
|
+
begin
|
|
66
|
+
@base.merge(URI.parse(href)).to_s
|
|
67
|
+
rescue
|
|
68
|
+
nil
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
end
|
|
73
|
+
end
|
data/lib/macaron/spawner.rb
CHANGED
|
@@ -1,79 +1,73 @@
|
|
|
1
|
-
require '
|
|
2
|
-
require '
|
|
3
|
-
require '
|
|
1
|
+
require 'timeout'
|
|
2
|
+
require 'observer'
|
|
3
|
+
require 'watir-webdriver'
|
|
4
4
|
|
|
5
5
|
module Macaron
|
|
6
|
-
@@result = {}
|
|
7
|
-
@@parsed_urls = Hamster.set
|
|
8
|
-
@@task_map = Hamster.hash
|
|
9
|
-
@@options = {}
|
|
10
|
-
@@success_times = 0
|
|
11
|
-
@@fail_times = 0
|
|
12
|
-
@@mutex = Mutex.new
|
|
13
|
-
|
|
14
6
|
class Spawner
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
:thread_timeout_seconds => 40,
|
|
18
|
-
:pages => 1000,
|
|
19
|
-
:initial_workers => 4,
|
|
20
|
-
:maximum_workers => 4,
|
|
21
|
-
:in_site_crawling => true,
|
|
22
|
-
:with_waltir => false,
|
|
23
|
-
:debug => false
|
|
24
|
-
}.freeze
|
|
7
|
+
def initialize(url, options)
|
|
8
|
+
@options = options
|
|
25
9
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
@threadpool = Threadpool.new(
|
|
29
|
-
@@options[:initial_workers],
|
|
30
|
-
@@options[:maximum_workers],
|
|
31
|
-
@@options[:thread_timeout_seconds]
|
|
32
|
-
)
|
|
33
|
-
end
|
|
10
|
+
# threadpool(init workers, max workers, job timeout)
|
|
11
|
+
threadpool = Threadpool.new(10, 10, job_timeout)
|
|
34
12
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
13
|
+
# tasks saves the on-processing urls
|
|
14
|
+
@tasks = Queue.new
|
|
15
|
+
@tasks << url
|
|
38
16
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
17
|
+
# parsed_urls used to prevent loop crawling
|
|
18
|
+
@parsed_urls = [url]
|
|
19
|
+
|
|
20
|
+
# awaiting_counter saves the awaiting task number
|
|
21
|
+
@awaiting_counter = 1
|
|
22
|
+
|
|
23
|
+
# bot is a webdriver
|
|
24
|
+
bot = Watir::Browser.new if @options[:with_watir]
|
|
42
25
|
|
|
43
|
-
def dig(url, init_depth=3)
|
|
44
|
-
@@task_map = @@task_map.put(url, init_depth)
|
|
45
26
|
loop do
|
|
46
|
-
|
|
47
|
-
|
|
27
|
+
break if @awaiting_counter == 0
|
|
28
|
+
|
|
29
|
+
begin
|
|
30
|
+
Timeout::timeout(task_timeout) { url = @tasks.shift }
|
|
31
|
+
rescue
|
|
32
|
+
next
|
|
33
|
+
end
|
|
48
34
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
@threadpool.load(Processor.new(url, depth, html))
|
|
52
|
-
else
|
|
53
|
-
@threadpool.load(Processor.new(url, depth))
|
|
54
|
-
end
|
|
35
|
+
job = Macaron::Crawler.new(url, bot)
|
|
36
|
+
job.add_observer(self)
|
|
55
37
|
|
|
56
|
-
|
|
57
|
-
|
|
38
|
+
threadpool.load(job)
|
|
39
|
+
end
|
|
58
40
|
|
|
59
|
-
|
|
41
|
+
bot.close unless bot.nil?
|
|
42
|
+
end
|
|
60
43
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
44
|
+
def update(links)
|
|
45
|
+
@awaiting_counter -= 1
|
|
46
|
+
links.each do |link|
|
|
47
|
+
unless @parsed_urls.include?(link)
|
|
48
|
+
@tasks << link
|
|
49
|
+
@awaiting_counter += 1
|
|
64
50
|
end
|
|
51
|
+
@parsed_urls << link
|
|
65
52
|
end
|
|
53
|
+
end
|
|
66
54
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
55
|
+
private
|
|
56
|
+
def task_timeout
|
|
57
|
+
# webdriver is slow, it takes more time to wait the result.
|
|
58
|
+
if @options[:with_watir]
|
|
59
|
+
10
|
|
60
|
+
else
|
|
61
|
+
2
|
|
62
|
+
end
|
|
70
63
|
end
|
|
71
64
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
65
|
+
def job_timeout
|
|
66
|
+
if @options[:with_watir]
|
|
67
|
+
20
|
|
68
|
+
else
|
|
69
|
+
10
|
|
70
|
+
end
|
|
77
71
|
end
|
|
78
72
|
|
|
79
73
|
end
|
metadata
CHANGED
|
@@ -1,61 +1,53 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: macaron
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
version: 2.0.1
|
|
5
|
+
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
8
8
|
- Dale Ma
|
|
9
|
-
autorequire:
|
|
9
|
+
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2012-12-
|
|
12
|
+
date: 2012-12-11 00:00:00.000000000 Z
|
|
13
13
|
dependencies: []
|
|
14
|
-
description:
|
|
14
|
+
description:
|
|
15
15
|
email: dalema22@gmail.com
|
|
16
16
|
executables:
|
|
17
17
|
- macaron
|
|
18
18
|
extensions: []
|
|
19
19
|
extra_rdoc_files: []
|
|
20
20
|
files:
|
|
21
|
-
-
|
|
22
|
-
|
|
23
|
-
-
|
|
24
|
-
|
|
25
|
-
-
|
|
26
|
-
|
|
27
|
-
- !binary |-
|
|
28
|
-
bGliL21hY2Fyb24vc2NyYXBlci5yYg==
|
|
29
|
-
- !binary |-
|
|
30
|
-
bGliL21hY2Fyb24vc3Bhd25lci5yYg==
|
|
31
|
-
- !binary |-
|
|
32
|
-
bGliL21hY2Fyb24vdmVyc2lvbi5yYg==
|
|
21
|
+
- bin/macaron
|
|
22
|
+
- lib/macaron/crawler.rb
|
|
23
|
+
- lib/macaron/page.rb
|
|
24
|
+
- lib/macaron/spawner.rb
|
|
25
|
+
- lib/macaron/version.rb
|
|
26
|
+
- lib/macaron.rb
|
|
33
27
|
- LICENSE
|
|
34
28
|
- README.md
|
|
35
29
|
homepage: http://github.com/eguitarz/macaron
|
|
36
30
|
licenses: []
|
|
37
|
-
post_install_message:
|
|
31
|
+
post_install_message:
|
|
38
32
|
rdoc_options: []
|
|
39
33
|
require_paths:
|
|
40
34
|
- lib
|
|
41
35
|
required_ruby_version: !ruby/object:Gem::Requirement
|
|
36
|
+
none: false
|
|
42
37
|
requirements:
|
|
43
38
|
- - ! '>='
|
|
44
39
|
- !ruby/object:Gem::Version
|
|
45
|
-
version:
|
|
46
|
-
MA==
|
|
47
|
-
none: false
|
|
40
|
+
version: '0'
|
|
48
41
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
42
|
+
none: false
|
|
49
43
|
requirements:
|
|
50
44
|
- - ! '>='
|
|
51
45
|
- !ruby/object:Gem::Version
|
|
52
|
-
version:
|
|
53
|
-
MA==
|
|
54
|
-
none: false
|
|
46
|
+
version: '0'
|
|
55
47
|
requirements: []
|
|
56
|
-
rubyforge_project:
|
|
48
|
+
rubyforge_project:
|
|
57
49
|
rubygems_version: 1.8.24
|
|
58
|
-
signing_key:
|
|
50
|
+
signing_key:
|
|
59
51
|
specification_version: 3
|
|
60
52
|
summary: Ruby based web scraper
|
|
61
53
|
test_files: []
|
data/lib/macaron/processor.rb
DELETED
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
require 'rubygems'
|
|
2
|
-
require 'threadpool'
|
|
3
|
-
|
|
4
|
-
module Macaron
|
|
5
|
-
class Processor < Job
|
|
6
|
-
@@output_lock = Mutex.new
|
|
7
|
-
|
|
8
|
-
def run
|
|
9
|
-
begin
|
|
10
|
-
url = @args.shift
|
|
11
|
-
depth = @args.shift
|
|
12
|
-
html = @args.shift
|
|
13
|
-
return if depth <= 0
|
|
14
|
-
scraper = Scraper.new
|
|
15
|
-
scraper.analyze(url, html)
|
|
16
|
-
|
|
17
|
-
# @@result[url] = {:anchors => scraper.anchors}
|
|
18
|
-
@@result[url] = true;
|
|
19
|
-
|
|
20
|
-
# do some additional analyzes
|
|
21
|
-
run_sub_tasks(scraper)
|
|
22
|
-
|
|
23
|
-
links = nil
|
|
24
|
-
if @@options[:in_site_crawling]
|
|
25
|
-
links = scraper.internal_anchors
|
|
26
|
-
else
|
|
27
|
-
links = scraper.absolute_anchors
|
|
28
|
-
end
|
|
29
|
-
puts "found #{links.size} links on #{url}" if @@options[:debug]
|
|
30
|
-
|
|
31
|
-
links.each { |a|
|
|
32
|
-
next if @@parsed_urls.include?(a)
|
|
33
|
-
p "put #{a} into tasks" if @@options[:debug]
|
|
34
|
-
@@task_map = @@task_map.put(a, depth - 1)
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
@@mutex.synchronize {
|
|
38
|
-
@@success_times += 1
|
|
39
|
-
}
|
|
40
|
-
rescue Exception => e
|
|
41
|
-
@@mutex.synchronize {
|
|
42
|
-
@@fail_times += 1
|
|
43
|
-
}
|
|
44
|
-
print "Error on job: #{url}, msg: #{e.message}\n"
|
|
45
|
-
end
|
|
46
|
-
end
|
|
47
|
-
|
|
48
|
-
private
|
|
49
|
-
def run_sub_tasks(scraper)
|
|
50
|
-
# p scraper.image_urls
|
|
51
|
-
|
|
52
|
-
if @@options[:save]
|
|
53
|
-
dir = @@options[:dir] || '/tmp'
|
|
54
|
-
filename = scraper.host.gsub('/', '\\')
|
|
55
|
-
File.open(File.join(dir, filename), "w+") do |f|
|
|
56
|
-
f.write(scraper.dom)
|
|
57
|
-
end
|
|
58
|
-
end
|
|
59
|
-
end
|
|
60
|
-
|
|
61
|
-
end
|
|
62
|
-
end
|
data/lib/macaron/scraper.rb
DELETED
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
require 'rubygems'
|
|
2
|
-
require 'nokogiri'
|
|
3
|
-
require 'open-uri'
|
|
4
|
-
require 'benchmark'
|
|
5
|
-
require 'timeout'
|
|
6
|
-
require 'watir-webdriver'
|
|
7
|
-
|
|
8
|
-
module Macaron
|
|
9
|
-
class Scraper
|
|
10
|
-
attr_reader :dom, :host
|
|
11
|
-
|
|
12
|
-
def analyze(host, html)
|
|
13
|
-
@host = host
|
|
14
|
-
@html = html
|
|
15
|
-
|
|
16
|
-
elapsed_seconds = 0
|
|
17
|
-
begin
|
|
18
|
-
timeout(@@options[:nokogiri_timeout_seconds]) do
|
|
19
|
-
elapsed_seconds = Benchmark.realtime { fetch_dom }
|
|
20
|
-
end
|
|
21
|
-
rescue Timeout::Error
|
|
22
|
-
print "Timeout on #{host}\n"
|
|
23
|
-
@@mutex.synchronize {
|
|
24
|
-
@@fail_times += 1
|
|
25
|
-
}
|
|
26
|
-
end
|
|
27
|
-
|
|
28
|
-
@all_links = absolute_anchors
|
|
29
|
-
|
|
30
|
-
print ">> elapsed #{elapsed_seconds} seconds to get '#{host}'\n"
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
def anchors
|
|
34
|
-
@dom.css('a')
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
def absolute_anchors
|
|
38
|
-
make_absolute_anchors(anchors)
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
def internal_anchors
|
|
42
|
-
root = URI.parse(@host).host
|
|
43
|
-
@all_links.select {|l| URI.parse(l).host == root}
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
def external_anchors
|
|
47
|
-
root = URI.parse(@host).host
|
|
48
|
-
@all_links.select {|l| URI.parse(l).host != root}
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
def images
|
|
52
|
-
@dom.css('img')
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
def image_urls
|
|
56
|
-
images.map { |img| make_absolute(img['src']) }.compact
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
def fetch_dom
|
|
60
|
-
unless @@options[:with_waltir]
|
|
61
|
-
@html = open(@host)
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
@dom = Nokogiri::HTML(@html)
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
def make_absolute_anchors(nodes)
|
|
68
|
-
nodes.map {|n| make_absolute(n['href']) }.compact
|
|
69
|
-
end
|
|
70
|
-
|
|
71
|
-
def make_absolute(href)
|
|
72
|
-
begin
|
|
73
|
-
URI.parse(@host).merge(URI.parse(href)).to_s
|
|
74
|
-
rescue
|
|
75
|
-
nil
|
|
76
|
-
end
|
|
77
|
-
end
|
|
78
|
-
end
|
|
79
|
-
end
|