macaron 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,7 +17,11 @@ options_parser = OptionParser.new do |opts|
17
17
  end
18
18
 
19
19
  opts.on("-j", "--javascript", "Open javascript support mode") do |j|
20
- options[:waltir] = j
20
+ options[:with_waltir] = j
21
+ end
22
+
23
+ opts.on("-s", "--save", "Save html") do |s|
24
+ options[:save] = s
21
25
  end
22
26
 
23
27
  opts.on_tail("-h", "--help", "Show this message") do
@@ -1,3 +1,4 @@
1
+ require 'rubygems'
1
2
  require 'threadpool'
2
3
 
3
4
  module Macaron
@@ -13,7 +14,8 @@ module Macaron
13
14
  scraper = Scraper.new
14
15
  scraper.analyze(url, html)
15
16
 
16
- @@result[url] = {:anchors => scraper.anchors}
17
+ # @@result[url] = {:anchors => scraper.anchors}
18
+ @@result[url] = true;
17
19
 
18
20
  # do some additional analyzes
19
21
  run_sub_tasks(scraper)
@@ -24,9 +26,11 @@ module Macaron
24
26
  else
25
27
  links = scraper.absolute_anchors
26
28
  end
29
+ puts "found #{links.size} links on #{url}" if @@options[:debug]
27
30
 
28
- links.each { |a|
31
+ links.each { |a|
29
32
  next if @@parsed_urls.include?(a)
33
+ p "put #{a} into tasks" if @@options[:debug]
30
34
  @@task_map = @@task_map.put(a, depth - 1)
31
35
  }
32
36
 
@@ -44,6 +48,14 @@ module Macaron
44
48
  private
45
49
  def run_sub_tasks(scraper)
46
50
  # p scraper.image_urls
51
+
52
+ if @@options[:save]
53
+ dir = @@options[:dir] || '/tmp'
54
+ filename = scraper.host.gsub('/', '\\')
55
+ File.open(File.join(dir, filename), "w+") do |f|
56
+ f.write(scraper.dom)
57
+ end
58
+ end
47
59
  end
48
60
 
49
61
  end
@@ -7,6 +7,7 @@ require 'watir-webdriver'
7
7
 
8
8
  module Macaron
9
9
  class Scraper
10
+ attr_reader :dom, :host
10
11
 
11
12
  def analyze(host, html)
12
13
  @host = host
@@ -56,11 +57,11 @@ module Macaron
56
57
  end
57
58
 
58
59
  def fetch_dom
59
- if @@options[:with_waltir]
60
- @dom = Nokogiri::HTML(@html)
61
- else
62
- @dom = Nokogiri::HTML(open(@host))
60
+ unless @@options[:with_waltir]
61
+ @html = open(@host)
63
62
  end
63
+
64
+ @dom = Nokogiri::HTML(@html)
64
65
  end
65
66
 
66
67
  def make_absolute_anchors(nodes)
@@ -16,10 +16,11 @@ module Macaron
16
16
  :nokogiri_timeout_seconds => 30,
17
17
  :thread_timeout_seconds => 40,
18
18
  :pages => 1000,
19
- :initial_workers => 1,
20
- :maximum_workers => 1,
19
+ :initial_workers => 4,
20
+ :maximum_workers => 4,
21
21
  :in_site_crawling => true,
22
- :with_waltir => false
22
+ :with_waltir => false,
23
+ :debug => false
23
24
  }.freeze
24
25
 
25
26
  def initialize(options = {})
@@ -42,7 +43,7 @@ module Macaron
42
43
  def dig(url, init_depth=3)
43
44
  @@task_map = @@task_map.put(url, init_depth)
44
45
  loop do
45
- @@task_map = @@task_map.remove {|url, depth|
46
+ @@task_map.each {|url, depth|
46
47
  @@parsed_urls = @@parsed_urls.add(url)
47
48
 
48
49
  if @@options[:with_waltir]
@@ -50,7 +51,9 @@ module Macaron
50
51
  @threadpool.load(Processor.new(url, depth, html))
51
52
  else
52
53
  @threadpool.load(Processor.new(url, depth))
53
- end
54
+ end
55
+
56
+ @@task_map = @@task_map.delete(url)
54
57
  }
55
58
 
56
59
  break if @threadpool.busy_workers_count == 0 && @@task_map.empty?
metadata CHANGED
@@ -1,53 +1,61 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: macaron
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
5
- prerelease:
4
+ prerelease:
5
+ version: 1.0.2
6
6
  platform: ruby
7
7
  authors:
8
8
  - Dale Ma
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-17 00:00:00.000000000 Z
12
+ date: 2012-12-06 00:00:00.000000000 Z
13
13
  dependencies: []
14
- description:
14
+ description:
15
15
  email: dalema22@gmail.com
16
16
  executables:
17
17
  - macaron
18
18
  extensions: []
19
19
  extra_rdoc_files: []
20
20
  files:
21
- - bin/macaron
22
- - lib/macaron/processor.rb
23
- - lib/macaron/scraper.rb
24
- - lib/macaron/spawner.rb
25
- - lib/macaron/version.rb
26
- - lib/macaron.rb
21
+ - !binary |-
22
+ YmluL21hY2Fyb24=
23
+ - !binary |-
24
+ bGliL21hY2Fyb24ucmI=
25
+ - !binary |-
26
+ bGliL21hY2Fyb24vcHJvY2Vzc29yLnJi
27
+ - !binary |-
28
+ bGliL21hY2Fyb24vc2NyYXBlci5yYg==
29
+ - !binary |-
30
+ bGliL21hY2Fyb24vc3Bhd25lci5yYg==
31
+ - !binary |-
32
+ bGliL21hY2Fyb24vdmVyc2lvbi5yYg==
27
33
  - LICENSE
28
34
  - README.md
29
35
  homepage: http://github.com/eguitarz/macaron
30
36
  licenses: []
31
- post_install_message:
37
+ post_install_message:
32
38
  rdoc_options: []
33
39
  require_paths:
34
40
  - lib
35
41
  required_ruby_version: !ruby/object:Gem::Requirement
36
- none: false
37
42
  requirements:
38
43
  - - ! '>='
39
44
  - !ruby/object:Gem::Version
40
- version: '0'
41
- required_rubygems_version: !ruby/object:Gem::Requirement
45
+ version: !binary |-
46
+ MA==
42
47
  none: false
48
+ required_rubygems_version: !ruby/object:Gem::Requirement
43
49
  requirements:
44
50
  - - ! '>='
45
51
  - !ruby/object:Gem::Version
46
- version: '0'
52
+ version: !binary |-
53
+ MA==
54
+ none: false
47
55
  requirements: []
48
- rubyforge_project:
49
- rubygems_version: 1.8.19
50
- signing_key:
56
+ rubyforge_project:
57
+ rubygems_version: 1.8.24
58
+ signing_key:
51
59
  specification_version: 3
52
60
  summary: Ruby based web scraper
53
61
  test_files: []