macaron 1.0.2 → 2.0.1

data/bin/macaron CHANGED
@@ -17,7 +17,7 @@ options_parser = OptionParser.new do |opts|
   end
 
   opts.on("-j", "--javascript", "Open javascript support mode") do |j|
-    options[:with_waltir] = j
+    options[:with_watir] = j
   end
 
   opts.on("-s", "--save", "Save html") do |s|
@@ -38,9 +38,6 @@ if ARGV.length != 1
 end
 
 url = ARGV.first
-puts "Starting at #{url}"
+puts "Started"
 
-mother = Spawner.new(options)
-mother.dig(url, options[:depth])
-puts "Success times: #{mother.success_times}"
-puts "Fail times: #{mother.fail_times}"
+Spawner.new(url, options)
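
With 2.0.1 the executable hands everything to Spawner.new and the crawl runs from the constructor itself, so the old success/fail counters are gone. A typical invocation would look like this (the URL is only an example):

    macaron http://example.com
    macaron -j http://example.com   # drive pages through watir-webdriver for JavaScript-heavy sites
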
data/lib/macaron.rb CHANGED
@@ -1,6 +1,6 @@
 $: << File.dirname(__FILE__)
 
-require 'macaron/processor'
 require 'macaron/spawner'
-require 'macaron/scraper'
+require 'macaron/crawler'
+require 'macaron/page'
 include Macaron
data/lib/macaron/crawler.rb ADDED
@@ -0,0 +1,23 @@
+require 'observer'
+require 'timeout'
+require 'threadpool'
+
+module Macaron
+  class Crawler < Job
+    include Observable
+
+    def run
+      url, bot = @args
+      page = Page.new(url, bot)
+      links = []
+      begin
+        links = page.fetch.inner_links
+      rescue
+      end
+      changed
+      notify_observers(links)
+      print "#{url} >> #{page.title}\n"
+      delete_observers
+    end
+  end
+end
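
Crawler is a threadpool Job that reports its results back through Ruby's Observer module. A minimal sketch of wiring up an observer, assuming (as Crawler#run implies) that the threadpool gem's Job keeps its constructor arguments in @args; the LinkCollector class and the URL below are only illustrative:

    require 'macaron'

    # Receives the array passed to notify_observers(links)
    class LinkCollector
      def update(links)
        puts "collected #{links.size} links"
      end
    end

    job = Macaron::Crawler.new('http://example.com', nil)  # nil bot => Page falls back to open-uri
    job.add_observer(LinkCollector.new)
    job.run
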
data/lib/macaron/page.rb ADDED
@@ -0,0 +1,73 @@
+require 'open-uri'
+require 'nokogiri'
+require 'thread'
+
+module Macaron
+  class Page
+    def initialize(url, bot=nil)
+      @url = url
+      @bot = bot
+      @@bot_lock = Mutex.new
+    end
+
+    def fetch
+      document
+      base(@url)
+      self
+    end
+
+    def inner_links
+      anchors = links.select {|link|
+        URI.parse(link).host == @base.host
+      }.compact
+    end
+
+    def title
+      @doc.title
+    end
+
+    private
+    def document
+      @doc ||= Nokogiri::HTML(content)
+    end
+
+    def base(href)
+      base = @doc.css('base')
+      header_base_url = base.attr('href').text unless base.empty?
+      base_url = header_base_url || @url
+      @base ||= URI.parse(base_url)
+    end
+
+    def content
+      if @bot
+        # only activate one browser, needs to be thread safe.
+        @@bot_lock.synchronize {
+          @bot.goto(@url)
+          @bot.html
+        }
+      else
+        open(@url)
+      end
+    end
+
+    def links
+      @doc.css('a').map {|a|
+        href = a['href']
+        if href.start_with? 'http'
+          href
+        else
+          make_absolute(href)
+        end
+      }.compact
+    end
+
+    def make_absolute(href)
+      begin
+        @base.merge(URI.parse(href)).to_s
+      rescue
+        nil
+      end
+    end
+
+  end
+end
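
Page wraps fetching and link extraction: without a Watir bot it reads the URL via open-uri, parses it with Nokogiri, resolves a <base> href when present, and inner_links keeps only absolute links on the same host as that base. A small sketch of direct use (the URL is only an example):

    require 'macaron'

    page = Macaron::Page.new('http://example.com')  # no bot => open-uri fetch
    page.fetch                                      # parses the document and resolves the base URL, returns the page
    puts page.title
    puts page.inner_links                           # same-host absolute links
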
data/lib/macaron/spawner.rb CHANGED
@@ -1,79 +1,73 @@
-require 'rubygems'
-require 'threadpool'
-require 'hamster'
+require 'timeout'
+require 'observer'
+require 'watir-webdriver'
 
 module Macaron
-  @@result = {}
-  @@parsed_urls = Hamster.set
-  @@task_map = Hamster.hash
-  @@options = {}
-  @@success_times = 0
-  @@fail_times = 0
-  @@mutex = Mutex.new
-
   class Spawner
-    DEFALUT_OPTIONS = {
-      :nokogiri_timeout_seconds => 30,
-      :thread_timeout_seconds => 40,
-      :pages => 1000,
-      :initial_workers => 4,
-      :maximum_workers => 4,
-      :in_site_crawling => true,
-      :with_waltir => false,
-      :debug => false
-    }.freeze
+    def initialize(url, options)
+      @options = options
 
-    def initialize(options = {})
-      @@options = DEFALUT_OPTIONS.merge(options)
-      @threadpool = Threadpool.new(
-        @@options[:initial_workers],
-        @@options[:maximum_workers],
-        @@options[:thread_timeout_seconds]
-      )
-    end
+      # threadpool(init workers, max workers, job timeout)
+      threadpool = Threadpool.new(10, 10, job_timeout)
 
-    def success_times
-      @@success_times
-    end
+      # tasks saves the on-processing urls
+      @tasks = Queue.new
+      @tasks << url
 
-    def fail_times
-      @@fail_times
-    end
+      # parsed_urls used to prevent loop crawling
+      @parsed_urls = [url]
+
+      # awaiting_counter saves the awaiting task number
+      @awaiting_counter = 1
+
+      # bot is a webdriver
+      bot = Watir::Browser.new if @options[:with_watir]
 
-    def dig(url, init_depth=3)
-      @@task_map = @@task_map.put(url, init_depth)
       loop do
-        @@task_map.each {|url, depth|
-          @@parsed_urls = @@parsed_urls.add(url)
+        break if @awaiting_counter == 0
+
+        begin
+          Timeout::timeout(task_timeout) { url = @tasks.shift }
+        rescue
+          next
+        end
 
-          if @@options[:with_waltir]
-            html = get_html_via_waltir(url)
-            @threadpool.load(Processor.new(url, depth, html))
-          else
-            @threadpool.load(Processor.new(url, depth))
-          end
+        job = Macaron::Crawler.new(url, bot)
+        job.add_observer(self)
 
-          @@task_map = @@task_map.delete(url)
-        }
+        threadpool.load(job)
+      end
 
-        break if @threadpool.busy_workers_count == 0 && @@task_map.empty?
+      bot.close unless bot.nil?
+    end
 
-        if @@success_times > @@options[:pages]
-          print "Fetched pages exceeds the limit #{@@options[:pages]}\n"
-          break
+    def update(links)
+      @awaiting_counter -= 1
+      links.each do |link|
+        unless @parsed_urls.include?(link)
+          @tasks << link
+          @awaiting_counter += 1
         end
+        @parsed_urls << link
      end
+    end
 
-      @bot.close unless @bot.nil?
-
-      puts "result: #{@@result.size}, #{@@result.keys}" if @@options[:debug]
+    private
+    def task_timeout
+      # webdriver is slow, it takes more time to wait the result.
+      if @options[:with_watir]
+        10
+      else
+        2
+      end
     end
 
-    private
-    def get_html_via_waltir(url)
-      @bot ||= Watir::Browser.new
-      @bot.goto(url)
-      @bot.html
+    def job_timeout
+      if @options[:with_watir]
+        20
+      else
+        10
+      end
    end
 
  end
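
The rewritten Spawner drives the whole crawl from initialize: it seeds a Queue with the start URL, registers itself as the observer of every Crawler job (its update method receives the extracted links), and stops once awaiting_counter drops back to zero. Used programmatically it would look roughly like this (option key as in this diff; the URL is only an example):

    require 'macaron'

    # Plain open-uri crawl; pass :with_watir => true to fetch pages through watir-webdriver instead.
    Macaron::Spawner.new('http://example.com', :with_watir => false)
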
metadata CHANGED
@@ -1,61 +1,53 @@
 --- !ruby/object:Gem::Specification
 name: macaron
 version: !ruby/object:Gem::Version
-  prerelease:
-  version: 1.0.2
+  version: 2.0.1
+  prerelease:
 platform: ruby
 authors:
 - Dale Ma
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-06 00:00:00.000000000 Z
+date: 2012-12-11 00:00:00.000000000 Z
 dependencies: []
-description:
+description:
 email: dalema22@gmail.com
 executables:
 - macaron
 extensions: []
 extra_rdoc_files: []
 files:
-- !binary |-
-  YmluL21hY2Fyb24=
-- !binary |-
-  bGliL21hY2Fyb24ucmI=
-- !binary |-
-  bGliL21hY2Fyb24vcHJvY2Vzc29yLnJi
-- !binary |-
-  bGliL21hY2Fyb24vc2NyYXBlci5yYg==
-- !binary |-
-  bGliL21hY2Fyb24vc3Bhd25lci5yYg==
-- !binary |-
-  bGliL21hY2Fyb24vdmVyc2lvbi5yYg==
+- bin/macaron
+- lib/macaron/crawler.rb
+- lib/macaron/page.rb
+- lib/macaron/spawner.rb
+- lib/macaron/version.rb
+- lib/macaron.rb
 - LICENSE
 - README.md
 homepage: http://github.com/eguitarz/macaron
 licenses: []
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
-      version: !binary |-
-        MA==
-  none: false
+      version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
-      version: !binary |-
-        MA==
-  none: false
+      version: '0'
 requirements: []
-rubyforge_project:
+rubyforge_project:
 rubygems_version: 1.8.24
-signing_key:
+signing_key:
 specification_version: 3
 summary: Ruby based web scraper
 test_files: []
data/lib/macaron/processor.rb DELETED
@@ -1,62 +0,0 @@
-require 'rubygems'
-require 'threadpool'
-
-module Macaron
-  class Processor < Job
-    @@output_lock = Mutex.new
-
-    def run
-      begin
-        url = @args.shift
-        depth = @args.shift
-        html = @args.shift
-        return if depth <= 0
-        scraper = Scraper.new
-        scraper.analyze(url, html)
-
-        # @@result[url] = {:anchors => scraper.anchors}
-        @@result[url] = true;
-
-        # do some additional analyzes
-        run_sub_tasks(scraper)
-
-        links = nil
-        if @@options[:in_site_crawling]
-          links = scraper.internal_anchors
-        else
-          links = scraper.absolute_anchors
-        end
-        puts "found #{links.size} links on #{url}" if @@options[:debug]
-
-        links.each { |a|
-          next if @@parsed_urls.include?(a)
-          p "put #{a} into tasks" if @@options[:debug]
-          @@task_map = @@task_map.put(a, depth - 1)
-        }
-
-        @@mutex.synchronize {
-          @@success_times += 1
-        }
-      rescue Exception => e
-        @@mutex.synchronize {
-          @@fail_times += 1
-        }
-        print "Error on job: #{url}, msg: #{e.message}\n"
-      end
-    end
-
-    private
-    def run_sub_tasks(scraper)
-      # p scraper.image_urls
-
-      if @@options[:save]
-        dir = @@options[:dir] || '/tmp'
-        filename = scraper.host.gsub('/', '\\')
-        File.open(File.join(dir, filename), "w+") do |f|
-          f.write(scraper.dom)
-        end
-      end
-    end
-
-  end
-end
data/lib/macaron/scraper.rb DELETED
@@ -1,79 +0,0 @@
-require 'rubygems'
-require 'nokogiri'
-require 'open-uri'
-require 'benchmark'
-require 'timeout'
-require 'watir-webdriver'
-
-module Macaron
-  class Scraper
-    attr_reader :dom, :host
-
-    def analyze(host, html)
-      @host = host
-      @html = html
-
-      elapsed_seconds = 0
-      begin
-        timeout(@@options[:nokogiri_timeout_seconds]) do
-          elapsed_seconds = Benchmark.realtime { fetch_dom }
-        end
-      rescue Timeout::Error
-        print "Timeout on #{host}\n"
-        @@mutex.synchronize {
-          @@fail_times += 1
-        }
-      end
-
-      @all_links = absolute_anchors
-
-      print ">> elapsed #{elapsed_seconds} seconds to get '#{host}'\n"
-    end
-
-    def anchors
-      @dom.css('a')
-    end
-
-    def absolute_anchors
-      make_absolute_anchors(anchors)
-    end
-
-    def internal_anchors
-      root = URI.parse(@host).host
-      @all_links.select {|l| URI.parse(l).host == root}
-    end
-
-    def external_anchors
-      root = URI.parse(@host).host
-      @all_links.select {|l| URI.parse(l).host != root}
-    end
-
-    def images
-      @dom.css('img')
-    end
-
-    def image_urls
-      images.map { |img| make_absolute(img['src']) }.compact
-    end
-
-    def fetch_dom
-      unless @@options[:with_waltir]
-        @html = open(@host)
-      end
-
-      @dom = Nokogiri::HTML(@html)
-    end
-
-    def make_absolute_anchors(nodes)
-      nodes.map {|n| make_absolute(n['href']) }.compact
-    end
-
-    def make_absolute(href)
-      begin
-        URI.parse(@host).merge(URI.parse(href)).to_s
-      rescue
-        nil
-      end
-    end
-  end
-end