RubyGems - macaron - Versions diffs - 1.0.2 → 2.0.1 - Mend

macaron 1.0.2 → 2.0.1

Files changed (8)

data/bin/macaron CHANGED

@@ -17,7 +17,7 @@ options_parser = OptionParser.new do |opts|
   end
   opts.on("-j", "--javascript", "Open javascript support mode") do |j|
-    options[:with_waltir] = j
+    options[:with_watir] = j
   end
   opts.on("-s", "--save", "Save html") do |s|
@@ -38,9 +38,6 @@ if ARGV.length != 1
 end
 url = ARGV.first
-puts "Starting at #{url}"
+puts "Started"
-mother = Spawner.new(options)
-mother.dig(url, options[:depth])
-puts "Success times: #{mother.success_times}"
-puts "Fail times: #{mother.fail_times}"
+Spawner.new(url, options)

data/lib/macaron.rb CHANGED

@@ -1,6 +1,6 @@
 $: << File.dirname(__FILE__)
-require 'macaron/processor'
 require 'macaron/spawner'
-require 'macaron/scraper'
+require 'macaron/crawler'
+require 'macaron/page'
 include Macaron

data/lib/macaron/crawler.rb ADDED

@@ -0,0 +1,23 @@
+require 'observer'
+require 'timeout'
+require 'threadpool'
+module Macaron
+  class Crawler < Job
+    include Observable
+    def run
+      url, bot = @args
+      page = Page.new(url, bot)
+      links = []
+      begin
+        links = page.fetch.inner_links
+      rescue
+      end
+      changed
+      notify_observers(links)
+      print "#{url} >> #{page.title}\n"
+      delete_observers
+    end
+  end
+end

data/lib/macaron/page.rb ADDED

@@ -0,0 +1,73 @@
+require 'open-uri'
+require 'nokogiri'
+require 'thread'
+module Macaron
+  class Page
+    def initialize(url, bot=nil)
+      @url = url
+      @bot = bot
+      @@bot_lock = Mutex.new
+    end
+    def fetch
+      document
+      base(@url)
+      self
+    end
+    def inner_links
+      anchors = links.select {|link|
+        URI.parse(link).host == @base.host
+      }.compact
+    end
+    def title
+      @doc.title
+    end
+    private
+    def document
+      @doc ||= Nokogiri::HTML(content)
+    end
+    def base(href)
+      base = @doc.css('base')
+      header_base_url = base.attr('href').text unless base.empty?
+      base_url = header_base_url || @url
+      @base ||= URI.parse(base_url)
+    end
+    def content
+      if @bot
+        # only activate one browser, needs to be thread safe.
+        @@bot_lock.synchronize {
+          @bot.goto(@url)
+          @bot.html
+        }
+      else
+        open(@url)
+      end
+    end
+    def links
+      @doc.css('a').map {|a|
+        href = a['href']
+        if href.start_with? 'http'
+          href
+        else
+          make_absolute(href)
+        end
+      }.compact
+    end
+    def make_absolute(href)
+      begin
+        @base.merge(URI.parse(href)).to_s
+      rescue
+        nil
+      end
+    end
+  end
+end

data/lib/macaron/spawner.rb CHANGED

@@ -1,79 +1,73 @@
-require 'rubygems'
-require 'threadpool'
-require 'hamster'
+require 'timeout'
+require 'observer'
+require 'watir-webdriver'
 module Macaron
-  @@result = {}
-  @@parsed_urls = Hamster.set
-  @@task_map = Hamster.hash
-  @@options = {}
-  @@success_times = 0
-  @@fail_times = 0
-  @@mutex = Mutex.new
   class Spawner
-    DEFALUT_OPTIONS = {
-      :nokogiri_timeout_seconds => 30,
-      :thread_timeout_seconds => 40,
-      :pages => 1000,
-      :initial_workers => 4,
-      :maximum_workers => 4,
-      :in_site_crawling => true,
-      :with_waltir => false,
-      :debug => false
-    }.freeze
+    def initialize(url, options)
+      @options = options
-    def initialize(options = {})
-      @@options = DEFALUT_OPTIONS.merge(options)
-      @threadpool = Threadpool.new(
-        @@options[:initial_workers],
-        @@options[:maximum_workers],
-        @@options[:thread_timeout_seconds]
-      )
-    end
+      # threadpool(init workers, max workers, job timeout)
+      threadpool = Threadpool.new(10, 10, job_timeout)
-    def success_times
-      @@success_times
-    end
+      # tasks saves the on-processing urls
+      @tasks = Queue.new
+      @tasks << url
-    def fail_times
-      @@fail_times
-    end
+      # parsed_urls used to prevent loop crawling
+      @parsed_urls = [url]
+      # awaiting_counter saves the awaiting task number
+      @awaiting_counter = 1
+      # bot is a webdriver
+      bot = Watir::Browser.new if @options[:with_watir]
-    def dig(url, init_depth=3)
-      @@task_map = @@task_map.put(url, init_depth)
       loop do
-        @@task_map.each {|url, depth|
-          @@parsed_urls = @@parsed_urls.add(url)
+        break if @awaiting_counter == 0
+        begin
+          Timeout::timeout(task_timeout) { url = @tasks.shift }
+        rescue
+          next
+        end
-          if @@options[:with_waltir]
-            html = get_html_via_waltir(url)
-            @threadpool.load(Processor.new(url, depth, html))
-          else
-            @threadpool.load(Processor.new(url, depth))
-          end
+        job = Macaron::Crawler.new(url, bot)
+        job.add_observer(self)
-          @@task_map = @@task_map.delete(url)
-        }
+        threadpool.load(job)
+      end
-        break if @threadpool.busy_workers_count == 0 && @@task_map.empty?
+      bot.close unless bot.nil?
+    end
-        if @@success_times > @@options[:pages]
-          print "Fetched pages exceeds the limit #{@@options[:pages]}\n"
-          break
+    def update(links)
+      @awaiting_counter -= 1
+      links.each do |link|
+        unless @parsed_urls.include?(link)
+          @tasks << link
+          @awaiting_counter += 1
         end
+        @parsed_urls << link
       end
+    end
-      @bot.close unless @bot.nil?
-      puts "result: #{@@result.size}, #{@@result.keys}" if @@options[:debug]
+    private
+    def task_timeout
+      # webdriver is slow, it takes more time to wait the result.
+      if @options[:with_watir]
+        10
+      else
+        2
+      end
     end
-    private
-    def get_html_via_waltir(url)
-      @bot ||= Watir::Browser.new
-      @bot.goto(url)
-      @bot.html
+    def job_timeout
+      if @options[:with_watir]
+        20
+      else
+        10
+      end
     end
   end

metadata CHANGED

@@ -1,61 +1,53 @@
 --- !ruby/object:Gem::Specification
 name: macaron
 version: !ruby/object:Gem::Version
-  prerelease:
-  version: 1.0.2
+  version: 2.0.1
+  prerelease:
 platform: ruby
 authors:
 - Dale Ma
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-06 00:00:00.000000000 Z
+date: 2012-12-11 00:00:00.000000000 Z
 dependencies: []
-description:
+description:
 email: dalema22@gmail.com
 executables:
 - macaron
 extensions: []
 extra_rdoc_files: []
 files:
-- !binary |-
-  YmluL21hY2Fyb24=
-- !binary |-
-  bGliL21hY2Fyb24ucmI=
-- !binary |-
-  bGliL21hY2Fyb24vcHJvY2Vzc29yLnJi
-- !binary |-
-  bGliL21hY2Fyb24vc2NyYXBlci5yYg==
-- !binary |-
-  bGliL21hY2Fyb24vc3Bhd25lci5yYg==
-- !binary |-
-  bGliL21hY2Fyb24vdmVyc2lvbi5yYg==
+- bin/macaron
+- lib/macaron/crawler.rb
+- lib/macaron/page.rb
+- lib/macaron/spawner.rb
+- lib/macaron/version.rb
+- lib/macaron.rb
 - LICENSE
 - README.md
 homepage: http://github.com/eguitarz/macaron
 licenses: []
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
-      version: !binary |-
-        MA==
-  none: false
+      version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
-      version: !binary |-
-        MA==
-  none: false
+      version: '0'
 requirements: []
-rubyforge_project:
+rubyforge_project:
 rubygems_version: 1.8.24
-signing_key:
+signing_key:
 specification_version: 3
 summary: Ruby based web scraper
 test_files: []

data/lib/macaron/processor.rb DELETED

@@ -1,62 +0,0 @@
-require 'rubygems'
-require 'threadpool'
-module Macaron
-  class Processor < Job
-    @@output_lock = Mutex.new
-    def run
-      begin
-        url = @args.shift
-        depth = @args.shift
-        html = @args.shift
-        return if depth <= 0
-        scraper = Scraper.new
-        scraper.analyze(url, html)
-        # @@result[url] = {:anchors => scraper.anchors}
-        @@result[url] = true;
-        # do some additional analyzes
-        run_sub_tasks(scraper)
-        links = nil
-        if @@options[:in_site_crawling]
-          links = scraper.internal_anchors
-        else
-          links = scraper.absolute_anchors
-        end
-        puts "found #{links.size} links on #{url}" if @@options[:debug]
-        links.each { |a|
-          next if @@parsed_urls.include?(a)
-          p "put #{a} into tasks" if @@options[:debug]
-          @@task_map = @@task_map.put(a, depth - 1)
-        }
-        @@mutex.synchronize {
-          @@success_times += 1
-        }
-      rescue Exception => e
-        @@mutex.synchronize {
-          @@fail_times += 1
-        }
-        print "Error on job: #{url}, msg: #{e.message}\n"
-      end
-    end
-    private
-    def run_sub_tasks(scraper)
-      # p scraper.image_urls
-      if @@options[:save]
-        dir = @@options[:dir] || '/tmp'
-        filename = scraper.host.gsub('/', '\\')
-        File.open(File.join(dir, filename), "w+") do |f|
-          f.write(scraper.dom)
-        end
-      end
-    end
-  end
-end

data/lib/macaron/scraper.rb DELETED

@@ -1,79 +0,0 @@
-require 'rubygems'
-require 'nokogiri'
-require 'open-uri'
-require 'benchmark'
-require 'timeout'
-require 'watir-webdriver'
-module Macaron
-  class Scraper
-    attr_reader :dom, :host
-    def analyze(host, html)
-      @host = host
-      @html = html
-      elapsed_seconds = 0
-      begin
-        timeout(@@options[:nokogiri_timeout_seconds]) do
-          elapsed_seconds = Benchmark.realtime { fetch_dom }
-        end
-      rescue Timeout::Error
-        print "Timeout on #{host}\n"
-        @@mutex.synchronize {
-          @@fail_times += 1
-        }
-      end
-      @all_links = absolute_anchors
-      print ">> elapsed #{elapsed_seconds} seconds to get '#{host}'\n"
-    end
-    def anchors
-      @dom.css('a')
-    end
-    def absolute_anchors
-      make_absolute_anchors(anchors)
-    end
-    def internal_anchors
-      root = URI.parse(@host).host
-      @all_links.select {|l| URI.parse(l).host == root}
-    end
-    def external_anchors
-      root = URI.parse(@host).host
-      @all_links.select {|l| URI.parse(l).host != root}
-    end
-    def images
-      @dom.css('img')
-    end
-    def image_urls
-      images.map { |img| make_absolute(img['src']) }.compact
-    end
-    def fetch_dom
-      unless @@options[:with_waltir]
-        @html = open(@host)
-      end
-      @dom = Nokogiri::HTML(@html)
-    end
-    def make_absolute_anchors(nodes)
-      nodes.map {|n| make_absolute(n['href']) }.compact
-    end
-    def make_absolute(href)
-      begin
-        URI.parse(@host).merge(URI.parse(href)).to_s
-      rescue
-        nil
-      end
-    end
-  end
-end