macaron 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -17,7 +17,11 @@ options_parser = OptionParser.new do |opts|
17
17
  end
18
18
 
19
19
  opts.on("-j", "--javascript", "Open javascript support mode") do |j|
20
- options[:waltir] = j
20
+ options[:with_waltir] = j
21
+ end
22
+
23
+ opts.on("-s", "--save", "Save html") do |s|
24
+ options[:save] = s
21
25
  end
22
26
 
23
27
  opts.on_tail("-h", "--help", "Show this message") do
@@ -1,3 +1,4 @@
1
+ require 'rubygems'
1
2
  require 'threadpool'
2
3
 
3
4
  module Macaron
@@ -13,7 +14,8 @@ module Macaron
13
14
  scraper = Scraper.new
14
15
  scraper.analyze(url, html)
15
16
 
16
- @@result[url] = {:anchors => scraper.anchors}
17
+ # @@result[url] = {:anchors => scraper.anchors}
18
+ @@result[url] = true;
17
19
 
18
20
  # do some additional analyzes
19
21
  run_sub_tasks(scraper)
@@ -24,9 +26,11 @@ module Macaron
24
26
  else
25
27
  links = scraper.absolute_anchors
26
28
  end
29
+ puts "found #{links.size} links on #{url}" if @@options[:debug]
27
30
 
28
- links.each { |a|
31
+ links.each { |a|
29
32
  next if @@parsed_urls.include?(a)
33
+ p "put #{a} into tasks" if @@options[:debug]
30
34
  @@task_map = @@task_map.put(a, depth - 1)
31
35
  }
32
36
 
@@ -44,6 +48,14 @@ module Macaron
44
48
  private
45
49
  def run_sub_tasks(scraper)
46
50
  # p scraper.image_urls
51
+
52
+ if @@options[:save]
53
+ dir = @@options[:dir] || '/tmp'
54
+ filename = scraper.host.gsub('/', '\\')
55
+ File.open(File.join(dir, filename), "w+") do |f|
56
+ f.write(scraper.dom)
57
+ end
58
+ end
47
59
  end
48
60
 
49
61
  end
@@ -7,6 +7,7 @@ require 'watir-webdriver'
7
7
 
8
8
  module Macaron
9
9
  class Scraper
10
+ attr_reader :dom, :host
10
11
 
11
12
  def analyze(host, html)
12
13
  @host = host
@@ -56,11 +57,11 @@ module Macaron
56
57
  end
57
58
 
58
59
  def fetch_dom
59
- if @@options[:with_waltir]
60
- @dom = Nokogiri::HTML(@html)
61
- else
62
- @dom = Nokogiri::HTML(open(@host))
60
+ unless @@options[:with_waltir]
61
+ @html = open(@host)
63
62
  end
63
+
64
+ @dom = Nokogiri::HTML(@html)
64
65
  end
65
66
 
66
67
  def make_absolute_anchors(nodes)
@@ -16,10 +16,11 @@ module Macaron
16
16
  :nokogiri_timeout_seconds => 30,
17
17
  :thread_timeout_seconds => 40,
18
18
  :pages => 1000,
19
- :initial_workers => 1,
20
- :maximum_workers => 1,
19
+ :initial_workers => 4,
20
+ :maximum_workers => 4,
21
21
  :in_site_crawling => true,
22
- :with_waltir => false
22
+ :with_waltir => false,
23
+ :debug => false
23
24
  }.freeze
24
25
 
25
26
  def initialize(options = {})
@@ -42,7 +43,7 @@ module Macaron
42
43
  def dig(url, init_depth=3)
43
44
  @@task_map = @@task_map.put(url, init_depth)
44
45
  loop do
45
- @@task_map = @@task_map.remove {|url, depth|
46
+ @@task_map.each {|url, depth|
46
47
  @@parsed_urls = @@parsed_urls.add(url)
47
48
 
48
49
  if @@options[:with_waltir]
@@ -50,7 +51,9 @@ module Macaron
50
51
  @threadpool.load(Processor.new(url, depth, html))
51
52
  else
52
53
  @threadpool.load(Processor.new(url, depth))
53
- end
54
+ end
55
+
56
+ @@task_map = @@task_map.delete(url)
54
57
  }
55
58
 
56
59
  break if @threadpool.busy_workers_count == 0 && @@task_map.empty?
metadata CHANGED
@@ -1,53 +1,61 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: macaron
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
5
- prerelease:
4
+ prerelease:
5
+ version: 1.0.2
6
6
  platform: ruby
7
7
  authors:
8
8
  - Dale Ma
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-17 00:00:00.000000000 Z
12
+ date: 2012-12-06 00:00:00.000000000 Z
13
13
  dependencies: []
14
- description:
14
+ description:
15
15
  email: dalema22@gmail.com
16
16
  executables:
17
17
  - macaron
18
18
  extensions: []
19
19
  extra_rdoc_files: []
20
20
  files:
21
- - bin/macaron
22
- - lib/macaron/processor.rb
23
- - lib/macaron/scraper.rb
24
- - lib/macaron/spawner.rb
25
- - lib/macaron/version.rb
26
- - lib/macaron.rb
21
+ - !binary |-
22
+ YmluL21hY2Fyb24=
23
+ - !binary |-
24
+ bGliL21hY2Fyb24ucmI=
25
+ - !binary |-
26
+ bGliL21hY2Fyb24vcHJvY2Vzc29yLnJi
27
+ - !binary |-
28
+ bGliL21hY2Fyb24vc2NyYXBlci5yYg==
29
+ - !binary |-
30
+ bGliL21hY2Fyb24vc3Bhd25lci5yYg==
31
+ - !binary |-
32
+ bGliL21hY2Fyb24vdmVyc2lvbi5yYg==
27
33
  - LICENSE
28
34
  - README.md
29
35
  homepage: http://github.com/eguitarz/macaron
30
36
  licenses: []
31
- post_install_message:
37
+ post_install_message:
32
38
  rdoc_options: []
33
39
  require_paths:
34
40
  - lib
35
41
  required_ruby_version: !ruby/object:Gem::Requirement
36
- none: false
37
42
  requirements:
38
43
  - - ! '>='
39
44
  - !ruby/object:Gem::Version
40
- version: '0'
41
- required_rubygems_version: !ruby/object:Gem::Requirement
45
+ version: !binary |-
46
+ MA==
42
47
  none: false
48
+ required_rubygems_version: !ruby/object:Gem::Requirement
43
49
  requirements:
44
50
  - - ! '>='
45
51
  - !ruby/object:Gem::Version
46
- version: '0'
52
+ version: !binary |-
53
+ MA==
54
+ none: false
47
55
  requirements: []
48
- rubyforge_project:
49
- rubygems_version: 1.8.19
50
- signing_key:
56
+ rubyforge_project:
57
+ rubygems_version: 1.8.24
58
+ signing_key:
51
59
  specification_version: 3
52
60
  summary: Ruby based web scraper
53
61
  test_files: []