macaron 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/macaron +5 -1
- data/lib/macaron/processor.rb +14 -2
- data/lib/macaron/scraper.rb +5 -4
- data/lib/macaron/spawner.rb +8 -5
- metadata +27 -19
data/bin/macaron
CHANGED
@@ -17,7 +17,11 @@ options_parser = OptionParser.new do |opts|
|
|
17
17
|
end
|
18
18
|
|
19
19
|
opts.on("-j", "--javascript", "Open javascript support mode") do |j|
|
20
|
-
options[:
|
20
|
+
options[:with_waltir] = j
|
21
|
+
end
|
22
|
+
|
23
|
+
opts.on("-s", "--save", "Save html") do |s|
|
24
|
+
options[:save] = s
|
21
25
|
end
|
22
26
|
|
23
27
|
opts.on_tail("-h", "--help", "Show this message") do
|
data/lib/macaron/processor.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'rubygems'
|
1
2
|
require 'threadpool'
|
2
3
|
|
3
4
|
module Macaron
|
@@ -13,7 +14,8 @@ module Macaron
|
|
13
14
|
scraper = Scraper.new
|
14
15
|
scraper.analyze(url, html)
|
15
16
|
|
16
|
-
@@result[url] = {:anchors => scraper.anchors}
|
17
|
+
# @@result[url] = {:anchors => scraper.anchors}
|
18
|
+
@@result[url] = true;
|
17
19
|
|
18
20
|
# do some additional analyzes
|
19
21
|
run_sub_tasks(scraper)
|
@@ -24,9 +26,11 @@ module Macaron
|
|
24
26
|
else
|
25
27
|
links = scraper.absolute_anchors
|
26
28
|
end
|
29
|
+
puts "found #{links.size} links on #{url}" if @@options[:debug]
|
27
30
|
|
28
|
-
links.each { |a|
|
31
|
+
links.each { |a|
|
29
32
|
next if @@parsed_urls.include?(a)
|
33
|
+
p "put #{a} into tasks" if @@options[:debug]
|
30
34
|
@@task_map = @@task_map.put(a, depth - 1)
|
31
35
|
}
|
32
36
|
|
@@ -44,6 +48,14 @@ module Macaron
|
|
44
48
|
private
|
45
49
|
def run_sub_tasks(scraper)
|
46
50
|
# p scraper.image_urls
|
51
|
+
|
52
|
+
if @@options[:save]
|
53
|
+
dir = @@options[:dir] || '/tmp'
|
54
|
+
filename = scraper.host.gsub('/', '\\')
|
55
|
+
File.open(File.join(dir, filename), "w+") do |f|
|
56
|
+
f.write(scraper.dom)
|
57
|
+
end
|
58
|
+
end
|
47
59
|
end
|
48
60
|
|
49
61
|
end
|
data/lib/macaron/scraper.rb
CHANGED
@@ -7,6 +7,7 @@ require 'watir-webdriver'
|
|
7
7
|
|
8
8
|
module Macaron
|
9
9
|
class Scraper
|
10
|
+
attr_reader :dom, :host
|
10
11
|
|
11
12
|
def analyze(host, html)
|
12
13
|
@host = host
|
@@ -56,11 +57,11 @@ module Macaron
|
|
56
57
|
end
|
57
58
|
|
58
59
|
def fetch_dom
|
59
|
-
|
60
|
-
@
|
61
|
-
else
|
62
|
-
@dom = Nokogiri::HTML(open(@host))
|
60
|
+
unless @@options[:with_waltir]
|
61
|
+
@html = open(@host)
|
63
62
|
end
|
63
|
+
|
64
|
+
@dom = Nokogiri::HTML(@html)
|
64
65
|
end
|
65
66
|
|
66
67
|
def make_absolute_anchors(nodes)
|
data/lib/macaron/spawner.rb
CHANGED
@@ -16,10 +16,11 @@ module Macaron
|
|
16
16
|
:nokogiri_timeout_seconds => 30,
|
17
17
|
:thread_timeout_seconds => 40,
|
18
18
|
:pages => 1000,
|
19
|
-
:initial_workers =>
|
20
|
-
:maximum_workers =>
|
19
|
+
:initial_workers => 4,
|
20
|
+
:maximum_workers => 4,
|
21
21
|
:in_site_crawling => true,
|
22
|
-
:with_waltir => false
|
22
|
+
:with_waltir => false,
|
23
|
+
:debug => false
|
23
24
|
}.freeze
|
24
25
|
|
25
26
|
def initialize(options = {})
|
@@ -42,7 +43,7 @@ module Macaron
|
|
42
43
|
def dig(url, init_depth=3)
|
43
44
|
@@task_map = @@task_map.put(url, init_depth)
|
44
45
|
loop do
|
45
|
-
@@task_map
|
46
|
+
@@task_map.each {|url, depth|
|
46
47
|
@@parsed_urls = @@parsed_urls.add(url)
|
47
48
|
|
48
49
|
if @@options[:with_waltir]
|
@@ -50,7 +51,9 @@ module Macaron
|
|
50
51
|
@threadpool.load(Processor.new(url, depth, html))
|
51
52
|
else
|
52
53
|
@threadpool.load(Processor.new(url, depth))
|
53
|
-
end
|
54
|
+
end
|
55
|
+
|
56
|
+
@@task_map = @@task_map.delete(url)
|
54
57
|
}
|
55
58
|
|
56
59
|
break if @threadpool.busy_workers_count == 0 && @@task_map.empty?
|
metadata
CHANGED
@@ -1,53 +1,61 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: macaron
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
4
|
+
prerelease:
|
5
|
+
version: 1.0.2
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Dale Ma
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-12-06 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
|
-
description:
|
14
|
+
description:
|
15
15
|
email: dalema22@gmail.com
|
16
16
|
executables:
|
17
17
|
- macaron
|
18
18
|
extensions: []
|
19
19
|
extra_rdoc_files: []
|
20
20
|
files:
|
21
|
-
-
|
22
|
-
|
23
|
-
-
|
24
|
-
|
25
|
-
-
|
26
|
-
|
21
|
+
- !binary |-
|
22
|
+
YmluL21hY2Fyb24=
|
23
|
+
- !binary |-
|
24
|
+
bGliL21hY2Fyb24ucmI=
|
25
|
+
- !binary |-
|
26
|
+
bGliL21hY2Fyb24vcHJvY2Vzc29yLnJi
|
27
|
+
- !binary |-
|
28
|
+
bGliL21hY2Fyb24vc2NyYXBlci5yYg==
|
29
|
+
- !binary |-
|
30
|
+
bGliL21hY2Fyb24vc3Bhd25lci5yYg==
|
31
|
+
- !binary |-
|
32
|
+
bGliL21hY2Fyb24vdmVyc2lvbi5yYg==
|
27
33
|
- LICENSE
|
28
34
|
- README.md
|
29
35
|
homepage: http://github.com/eguitarz/macaron
|
30
36
|
licenses: []
|
31
|
-
post_install_message:
|
37
|
+
post_install_message:
|
32
38
|
rdoc_options: []
|
33
39
|
require_paths:
|
34
40
|
- lib
|
35
41
|
required_ruby_version: !ruby/object:Gem::Requirement
|
36
|
-
none: false
|
37
42
|
requirements:
|
38
43
|
- - ! '>='
|
39
44
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
41
|
-
|
45
|
+
version: !binary |-
|
46
|
+
MA==
|
42
47
|
none: false
|
48
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
43
49
|
requirements:
|
44
50
|
- - ! '>='
|
45
51
|
- !ruby/object:Gem::Version
|
46
|
-
version:
|
52
|
+
version: !binary |-
|
53
|
+
MA==
|
54
|
+
none: false
|
47
55
|
requirements: []
|
48
|
-
rubyforge_project:
|
49
|
-
rubygems_version: 1.8.
|
50
|
-
signing_key:
|
56
|
+
rubyforge_project:
|
57
|
+
rubygems_version: 1.8.24
|
58
|
+
signing_key:
|
51
59
|
specification_version: 3
|
52
60
|
summary: Ruby based web scraper
|
53
61
|
test_files: []
|