macaron 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/macaron +5 -1
- data/lib/macaron/processor.rb +14 -2
- data/lib/macaron/scraper.rb +5 -4
- data/lib/macaron/spawner.rb +8 -5
- metadata +27 -19
data/bin/macaron
CHANGED
@@ -17,7 +17,11 @@ options_parser = OptionParser.new do |opts|
|
|
17
17
|
end
|
18
18
|
|
19
19
|
opts.on("-j", "--javascript", "Open javascript support mode") do |j|
|
20
|
-
options[:
|
20
|
+
options[:with_waltir] = j
|
21
|
+
end
|
22
|
+
|
23
|
+
opts.on("-s", "--save", "Save html") do |s|
|
24
|
+
options[:save] = s
|
21
25
|
end
|
22
26
|
|
23
27
|
opts.on_tail("-h", "--help", "Show this message") do
|
data/lib/macaron/processor.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'rubygems'
|
1
2
|
require 'threadpool'
|
2
3
|
|
3
4
|
module Macaron
|
@@ -13,7 +14,8 @@ module Macaron
|
|
13
14
|
scraper = Scraper.new
|
14
15
|
scraper.analyze(url, html)
|
15
16
|
|
16
|
-
@@result[url] = {:anchors => scraper.anchors}
|
17
|
+
# @@result[url] = {:anchors => scraper.anchors}
|
18
|
+
@@result[url] = true;
|
17
19
|
|
18
20
|
# do some additional analyzes
|
19
21
|
run_sub_tasks(scraper)
|
@@ -24,9 +26,11 @@ module Macaron
|
|
24
26
|
else
|
25
27
|
links = scraper.absolute_anchors
|
26
28
|
end
|
29
|
+
puts "found #{links.size} links on #{url}" if @@options[:debug]
|
27
30
|
|
28
|
-
links.each { |a|
|
31
|
+
links.each { |a|
|
29
32
|
next if @@parsed_urls.include?(a)
|
33
|
+
p "put #{a} into tasks" if @@options[:debug]
|
30
34
|
@@task_map = @@task_map.put(a, depth - 1)
|
31
35
|
}
|
32
36
|
|
@@ -44,6 +48,14 @@ module Macaron
|
|
44
48
|
private
|
45
49
|
def run_sub_tasks(scraper)
|
46
50
|
# p scraper.image_urls
|
51
|
+
|
52
|
+
if @@options[:save]
|
53
|
+
dir = @@options[:dir] || '/tmp'
|
54
|
+
filename = scraper.host.gsub('/', '\\')
|
55
|
+
File.open(File.join(dir, filename), "w+") do |f|
|
56
|
+
f.write(scraper.dom)
|
57
|
+
end
|
58
|
+
end
|
47
59
|
end
|
48
60
|
|
49
61
|
end
|
data/lib/macaron/scraper.rb
CHANGED
@@ -7,6 +7,7 @@ require 'watir-webdriver'
|
|
7
7
|
|
8
8
|
module Macaron
|
9
9
|
class Scraper
|
10
|
+
attr_reader :dom, :host
|
10
11
|
|
11
12
|
def analyze(host, html)
|
12
13
|
@host = host
|
@@ -56,11 +57,11 @@ module Macaron
|
|
56
57
|
end
|
57
58
|
|
58
59
|
def fetch_dom
|
59
|
-
|
60
|
-
@
|
61
|
-
else
|
62
|
-
@dom = Nokogiri::HTML(open(@host))
|
60
|
+
unless @@options[:with_waltir]
|
61
|
+
@html = open(@host)
|
63
62
|
end
|
63
|
+
|
64
|
+
@dom = Nokogiri::HTML(@html)
|
64
65
|
end
|
65
66
|
|
66
67
|
def make_absolute_anchors(nodes)
|
data/lib/macaron/spawner.rb
CHANGED
@@ -16,10 +16,11 @@ module Macaron
|
|
16
16
|
:nokogiri_timeout_seconds => 30,
|
17
17
|
:thread_timeout_seconds => 40,
|
18
18
|
:pages => 1000,
|
19
|
-
:initial_workers =>
|
20
|
-
:maximum_workers =>
|
19
|
+
:initial_workers => 4,
|
20
|
+
:maximum_workers => 4,
|
21
21
|
:in_site_crawling => true,
|
22
|
-
:with_waltir => false
|
22
|
+
:with_waltir => false,
|
23
|
+
:debug => false
|
23
24
|
}.freeze
|
24
25
|
|
25
26
|
def initialize(options = {})
|
@@ -42,7 +43,7 @@ module Macaron
|
|
42
43
|
def dig(url, init_depth=3)
|
43
44
|
@@task_map = @@task_map.put(url, init_depth)
|
44
45
|
loop do
|
45
|
-
@@task_map
|
46
|
+
@@task_map.each {|url, depth|
|
46
47
|
@@parsed_urls = @@parsed_urls.add(url)
|
47
48
|
|
48
49
|
if @@options[:with_waltir]
|
@@ -50,7 +51,9 @@ module Macaron
|
|
50
51
|
@threadpool.load(Processor.new(url, depth, html))
|
51
52
|
else
|
52
53
|
@threadpool.load(Processor.new(url, depth))
|
53
|
-
end
|
54
|
+
end
|
55
|
+
|
56
|
+
@@task_map = @@task_map.delete(url)
|
54
57
|
}
|
55
58
|
|
56
59
|
break if @threadpool.busy_workers_count == 0 && @@task_map.empty?
|
metadata
CHANGED
@@ -1,53 +1,61 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: macaron
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
4
|
+
prerelease:
|
5
|
+
version: 1.0.2
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Dale Ma
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-12-06 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
|
-
description:
|
14
|
+
description:
|
15
15
|
email: dalema22@gmail.com
|
16
16
|
executables:
|
17
17
|
- macaron
|
18
18
|
extensions: []
|
19
19
|
extra_rdoc_files: []
|
20
20
|
files:
|
21
|
-
-
|
22
|
-
|
23
|
-
-
|
24
|
-
|
25
|
-
-
|
26
|
-
|
21
|
+
- !binary |-
|
22
|
+
YmluL21hY2Fyb24=
|
23
|
+
- !binary |-
|
24
|
+
bGliL21hY2Fyb24ucmI=
|
25
|
+
- !binary |-
|
26
|
+
bGliL21hY2Fyb24vcHJvY2Vzc29yLnJi
|
27
|
+
- !binary |-
|
28
|
+
bGliL21hY2Fyb24vc2NyYXBlci5yYg==
|
29
|
+
- !binary |-
|
30
|
+
bGliL21hY2Fyb24vc3Bhd25lci5yYg==
|
31
|
+
- !binary |-
|
32
|
+
bGliL21hY2Fyb24vdmVyc2lvbi5yYg==
|
27
33
|
- LICENSE
|
28
34
|
- README.md
|
29
35
|
homepage: http://github.com/eguitarz/macaron
|
30
36
|
licenses: []
|
31
|
-
post_install_message:
|
37
|
+
post_install_message:
|
32
38
|
rdoc_options: []
|
33
39
|
require_paths:
|
34
40
|
- lib
|
35
41
|
required_ruby_version: !ruby/object:Gem::Requirement
|
36
|
-
none: false
|
37
42
|
requirements:
|
38
43
|
- - ! '>='
|
39
44
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
41
|
-
|
45
|
+
version: !binary |-
|
46
|
+
MA==
|
42
47
|
none: false
|
48
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
43
49
|
requirements:
|
44
50
|
- - ! '>='
|
45
51
|
- !ruby/object:Gem::Version
|
46
|
-
version:
|
52
|
+
version: !binary |-
|
53
|
+
MA==
|
54
|
+
none: false
|
47
55
|
requirements: []
|
48
|
-
rubyforge_project:
|
49
|
-
rubygems_version: 1.8.
|
50
|
-
signing_key:
|
56
|
+
rubyforge_project:
|
57
|
+
rubygems_version: 1.8.24
|
58
|
+
signing_key:
|
51
59
|
specification_version: 3
|
52
60
|
summary: Ruby based web scraper
|
53
61
|
test_files: []
|