macaron 1.0.2 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/macaron CHANGED
@@ -17,7 +17,7 @@ options_parser = OptionParser.new do |opts|
  end
 
  opts.on("-j", "--javascript", "Open javascript support mode") do |j|
- options[:with_waltir] = j
+ options[:with_watir] = j
  end
 
  opts.on("-s", "--save", "Save html") do |s|
@@ -38,9 +38,6 @@ if ARGV.length != 1
  end
 
  url = ARGV.first
- puts "Starting at #{url}"
+ puts "Started"
 
- mother = Spawner.new(options)
- mother.dig(url, options[:depth])
- puts "Success times: #{mother.success_times}"
- puts "Fail times: #{mother.fail_times}"
+ Spawner.new(url, options)
data/lib/macaron.rb CHANGED
@@ -1,6 +1,6 @@
  $: << File.dirname(__FILE__)
 
- require 'macaron/processor'
  require 'macaron/spawner'
- require 'macaron/scraper'
+ require 'macaron/crawler'
+ require 'macaron/page'
  include Macaron
data/lib/macaron/crawler.rb ADDED
@@ -0,0 +1,23 @@
+ require 'observer'
+ require 'timeout'
+ require 'threadpool'
+
+ module Macaron
+   class Crawler < Job
+     include Observable
+
+     def run
+       url, bot = @args
+       page = Page.new(url, bot)
+       links = []
+       begin
+         links = page.fetch.inner_links
+       rescue
+       end
+       changed
+       notify_observers(links)
+       print "#{url} >> #{page.title}\n"
+       delete_observers
+     end
+   end
+ end
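Crawler is the per-URL unit of work in 2.x: a threadpool Job that fetches a Page, extracts its same-host links, and hands them to whatever observer has registered itself (Spawner, below). A minimal sketch of driving it by hand, assuming the threadpool gem's Job stores its constructor arguments in @args (which is how Crawler#run reads them); the URL and the collector object are illustrative only:

  require 'macaron'

  collector = Object.new
  def collector.update(links)   # called by notify_observers(links) in Crawler#run
    puts "discovered #{links.size} links"
  end

  job = Macaron::Crawler.new('http://example.com', nil)  # nil bot => plain open-uri fetch
  job.add_observer(collector)
  job.run   # normally dispatched through Threadpool#load by Spawner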
data/lib/macaron/page.rb ADDED
@@ -0,0 +1,73 @@
+ require 'open-uri'
+ require 'nokogiri'
+ require 'thread'
+
+ module Macaron
+   class Page
+     def initialize(url, bot=nil)
+       @url = url
+       @bot = bot
+       @@bot_lock = Mutex.new
+     end
+
+     def fetch
+       document
+       base(@url)
+       self
+     end
+
+     def inner_links
+       anchors = links.select {|link|
+         URI.parse(link).host == @base.host
+       }.compact
+     end
+
+     def title
+       @doc.title
+     end
+
+     private
+     def document
+       @doc ||= Nokogiri::HTML(content)
+     end
+
+     def base(href)
+       base = @doc.css('base')
+       header_base_url = base.attr('href').text unless base.empty?
+       base_url = header_base_url || @url
+       @base ||= URI.parse(base_url)
+     end
+
+     def content
+       if @bot
+         # only activate one browser, needs to be thread safe.
+         @@bot_lock.synchronize {
+           @bot.goto(@url)
+           @bot.html
+         }
+       else
+         open(@url)
+       end
+     end
+
+     def links
+       @doc.css('a').map {|a|
+         href = a['href']
+         if href.start_with? 'http'
+           href
+         else
+           make_absolute(href)
+         end
+       }.compact
+     end
+
+     def make_absolute(href)
+       begin
+         @base.merge(URI.parse(href)).to_s
+       rescue
+         nil
+       end
+     end
+
+   end
+ end
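Page wraps the fetch-and-parse step: content comes either from open-uri or, when a Watir bot is supplied, from the shared browser guarded by @@bot_lock; fetch parses the HTML with Nokogiri and resolves the base URI (honouring a <base> tag), after which title and inner_links read from the parsed document. A small sketch of standalone use without a bot; the URL is illustrative:

  require 'macaron'

  page = Macaron::Page.new('http://example.com').fetch
  puts page.title         # from the Nokogiri document
  puts page.inner_links   # absolute links whose host matches the page's base URI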
data/lib/macaron/spawner.rb CHANGED
@@ -1,79 +1,73 @@
- require 'rubygems'
- require 'threadpool'
- require 'hamster'
+ require 'timeout'
+ require 'observer'
+ require 'watir-webdriver'
 
  module Macaron
-   @@result = {}
-   @@parsed_urls = Hamster.set
-   @@task_map = Hamster.hash
-   @@options = {}
-   @@success_times = 0
-   @@fail_times = 0
-   @@mutex = Mutex.new
-
    class Spawner
-     DEFALUT_OPTIONS = {
-       :nokogiri_timeout_seconds => 30,
-       :thread_timeout_seconds => 40,
-       :pages => 1000,
-       :initial_workers => 4,
-       :maximum_workers => 4,
-       :in_site_crawling => true,
-       :with_waltir => false,
-       :debug => false
-     }.freeze
+     def initialize(url, options)
+       @options = options
 
-     def initialize(options = {})
-       @@options = DEFALUT_OPTIONS.merge(options)
-       @threadpool = Threadpool.new(
-         @@options[:initial_workers],
-         @@options[:maximum_workers],
-         @@options[:thread_timeout_seconds]
-       )
-     end
+       # threadpool(init workers, max workers, job timeout)
+       threadpool = Threadpool.new(10, 10, job_timeout)
 
-     def success_times
-       @@success_times
-     end
+       # tasks saves the on-processing urls
+       @tasks = Queue.new
+       @tasks << url
 
-     def fail_times
-       @@fail_times
-     end
+       # parsed_urls used to prevent loop crawling
+       @parsed_urls = [url]
+
+       # awaiting_counter saves the awaiting task number
+       @awaiting_counter = 1
+
+       # bot is a webdriver
+       bot = Watir::Browser.new if @options[:with_watir]
 
-     def dig(url, init_depth=3)
-       @@task_map = @@task_map.put(url, init_depth)
        loop do
-         @@task_map.each {|url, depth|
-           @@parsed_urls = @@parsed_urls.add(url)
+         break if @awaiting_counter == 0
+
+         begin
+           Timeout::timeout(task_timeout) { url = @tasks.shift }
+         rescue
+           next
+         end
 
-           if @@options[:with_waltir]
-             html = get_html_via_waltir(url)
-             @threadpool.load(Processor.new(url, depth, html))
-           else
-             @threadpool.load(Processor.new(url, depth))
-           end
+         job = Macaron::Crawler.new(url, bot)
+         job.add_observer(self)
 
-           @@task_map = @@task_map.delete(url)
-         }
+         threadpool.load(job)
+       end
 
-         break if @threadpool.busy_workers_count == 0 && @@task_map.empty?
+       bot.close unless bot.nil?
+     end
 
-         if @@success_times > @@options[:pages]
-           print "Fetched pages exceeds the limit #{@@options[:pages]}\n"
-           break
+     def update(links)
+       @awaiting_counter -= 1
+       links.each do |link|
+         unless @parsed_urls.include?(link)
+           @tasks << link
+           @awaiting_counter += 1
          end
+         @parsed_urls << link
        end
+     end
 
-       @bot.close unless @bot.nil?
-
-       puts "result: #{@@result.size}, #{@@result.keys}" if @@options[:debug]
+     private
+     def task_timeout
+       # webdriver is slow, it takes more time to wait the result.
+       if @options[:with_watir]
+         10
+       else
+         2
+       end
      end
 
-     private
-     def get_html_via_waltir(url)
-       @bot ||= Watir::Browser.new
-       @bot.goto(url)
-       @bot.html
+     def job_timeout
+       if @options[:with_watir]
+         20
+       else
+         10
+       end
      end
 
    end
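In 2.x the Spawner constructor runs the whole crawl, as the updated bin/macaron shows: it seeds a task queue with the start URL, loads one Crawler job per URL into the threadpool, and, acting as each job's observer, feeds newly discovered links back into the queue until the awaiting-task counter returns to zero. A minimal programmatic sketch; the URL is illustrative:

  require 'macaron'

  # The constructor returns once the awaiting-task counter drops back to zero,
  # then closes the Watir browser if one was opened.
  Macaron::Spawner.new('http://example.com', :with_watir => false)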
metadata CHANGED
@@ -1,61 +1,53 @@
  --- !ruby/object:Gem::Specification
  name: macaron
  version: !ruby/object:Gem::Version
- prerelease:
- version: 1.0.2
+ version: 2.0.1
+ prerelease:
  platform: ruby
  authors:
  - Dale Ma
- autorequire:
+ autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-12-06 00:00:00.000000000 Z
+ date: 2012-12-11 00:00:00.000000000 Z
  dependencies: []
- description:
+ description:
  email: dalema22@gmail.com
  executables:
  - macaron
  extensions: []
  extra_rdoc_files: []
  files:
- - !binary |-
- YmluL21hY2Fyb24=
- - !binary |-
- bGliL21hY2Fyb24ucmI=
- - !binary |-
- bGliL21hY2Fyb24vcHJvY2Vzc29yLnJi
- - !binary |-
- bGliL21hY2Fyb24vc2NyYXBlci5yYg==
- - !binary |-
- bGliL21hY2Fyb24vc3Bhd25lci5yYg==
- - !binary |-
- bGliL21hY2Fyb24vdmVyc2lvbi5yYg==
+ - bin/macaron
+ - lib/macaron/crawler.rb
+ - lib/macaron/page.rb
+ - lib/macaron/spawner.rb
+ - lib/macaron/version.rb
+ - lib/macaron.rb
  - LICENSE
  - README.md
  homepage: http://github.com/eguitarz/macaron
  licenses: []
- post_install_message:
+ post_install_message:
  rdoc_options: []
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
- version: !binary |-
- MA==
- none: false
+ version: '0'
  required_rubygems_version: !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
- version: !binary |-
- MA==
- none: false
+ version: '0'
  requirements: []
- rubyforge_project:
+ rubyforge_project:
  rubygems_version: 1.8.24
- signing_key:
+ signing_key:
  specification_version: 3
  summary: Ruby based web scraper
  test_files: []
data/lib/macaron/processor.rb DELETED
@@ -1,62 +0,0 @@
- require 'rubygems'
- require 'threadpool'
-
- module Macaron
-   class Processor < Job
-     @@output_lock = Mutex.new
-
-     def run
-       begin
-         url = @args.shift
-         depth = @args.shift
-         html = @args.shift
-         return if depth <= 0
-         scraper = Scraper.new
-         scraper.analyze(url, html)
-
-         # @@result[url] = {:anchors => scraper.anchors}
-         @@result[url] = true;
-
-         # do some additional analyzes
-         run_sub_tasks(scraper)
-
-         links = nil
-         if @@options[:in_site_crawling]
-           links = scraper.internal_anchors
-         else
-           links = scraper.absolute_anchors
-         end
-         puts "found #{links.size} links on #{url}" if @@options[:debug]
-
-         links.each { |a|
-           next if @@parsed_urls.include?(a)
-           p "put #{a} into tasks" if @@options[:debug]
-           @@task_map = @@task_map.put(a, depth - 1)
-         }
-
-         @@mutex.synchronize {
-           @@success_times += 1
-         }
-       rescue Exception => e
-         @@mutex.synchronize {
-           @@fail_times += 1
-         }
-         print "Error on job: #{url}, msg: #{e.message}\n"
-       end
-     end
-
-     private
-     def run_sub_tasks(scraper)
-       # p scraper.image_urls
-
-       if @@options[:save]
-         dir = @@options[:dir] || '/tmp'
-         filename = scraper.host.gsub('/', '\\')
-         File.open(File.join(dir, filename), "w+") do |f|
-           f.write(scraper.dom)
-         end
-       end
-     end
-
-   end
- end
data/lib/macaron/scraper.rb DELETED
@@ -1,79 +0,0 @@
- require 'rubygems'
- require 'nokogiri'
- require 'open-uri'
- require 'benchmark'
- require 'timeout'
- require 'watir-webdriver'
-
- module Macaron
-   class Scraper
-     attr_reader :dom, :host
-
-     def analyze(host, html)
-       @host = host
-       @html = html
-
-       elapsed_seconds = 0
-       begin
-         timeout(@@options[:nokogiri_timeout_seconds]) do
-           elapsed_seconds = Benchmark.realtime { fetch_dom }
-         end
-       rescue Timeout::Error
-         print "Timeout on #{host}\n"
-         @@mutex.synchronize {
-           @@fail_times += 1
-         }
-       end
-
-       @all_links = absolute_anchors
-
-       print ">> elapsed #{elapsed_seconds} seconds to get '#{host}'\n"
-     end
-
-     def anchors
-       @dom.css('a')
-     end
-
-     def absolute_anchors
-       make_absolute_anchors(anchors)
-     end
-
-     def internal_anchors
-       root = URI.parse(@host).host
-       @all_links.select {|l| URI.parse(l).host == root}
-     end
-
-     def external_anchors
-       root = URI.parse(@host).host
-       @all_links.select {|l| URI.parse(l).host != root}
-     end
-
-     def images
-       @dom.css('img')
-     end
-
-     def image_urls
-       images.map { |img| make_absolute(img['src']) }.compact
-     end
-
-     def fetch_dom
-       unless @@options[:with_waltir]
-         @html = open(@host)
-       end
-
-       @dom = Nokogiri::HTML(@html)
-     end
-
-     def make_absolute_anchors(nodes)
-       nodes.map {|n| make_absolute(n['href']) }.compact
-     end
-
-     def make_absolute(href)
-       begin
-         URI.parse(@host).merge(URI.parse(href)).to_s
-       rescue
-         nil
-       end
-     end
-   end
- end