shingara-anemone 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +27 -0
- data/LICENSE.txt +19 -0
- data/README.rdoc +24 -0
- data/bin/anemone +4 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/lib/anemone/cli/cron.rb +90 -0
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/core.rb +256 -0
- data/lib/anemone/http.rb +123 -0
- data/lib/anemone/page.rb +155 -0
- data/lib/anemone/page_hash.rb +142 -0
- data/lib/anemone/tentacle.rb +39 -0
- data/spec/anemone_spec.rb +15 -0
- data/spec/core_spec.rb +203 -0
- data/spec/fakeweb_helper.rb +57 -0
- data/spec/page_spec.rb +52 -0
- data/spec/spec_helper.rb +7 -0
- metadata +96 -0
data/lib/anemone/http.rb
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
require 'net/https'
|
2
|
+
require 'anemone/page'
|
3
|
+
|
4
|
+
module Anemone
  #
  # HTTP client for the crawler. Keeps one persistent Net::HTTP connection
  # per host:port and follows redirects up to a configurable limit.
  #
  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECT_LIMIT = 5

    #
    # Create a new HTTP client.
    # Recognized options: :redirect_limit, :user_agent, :verbose
    #
    def initialize(opts = {})
      @connections = {} # host => { port => Net::HTTP }
      @opts = opts
    end

    #
    # Create a new Page from the response of an HTTP request to *url*.
    # *from_page*, when given, supplies the referer and crawl depth.
    # On any error a bare Page (no body, no code) is returned instead.
    #
    def fetch_page(url, from_page = nil)
      url = URI(url) unless url.is_a?(URI)

      if from_page
        referer = from_page.url
        depth = from_page.depth + 1
      end

      response, code, location, response_time = get(url, referer)

      # record the final location as a redirect-alias when it differs
      aka = location unless url.eql?(location)

      Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
    rescue => e
      if verbose?
        puts e.inspect
        puts e.backtrace
      end
      Page.new(url)
    end

    private

    #
    # Retrieve an HTTP response for *url*, following redirects.
    # Returns the final response object, the ORIGINAL response code,
    # the final URI location, and the last request's response time (ms).
    #
    def get(url, referer = nil)
      response, response_time = get_response(url, referer)
      code = Integer(response.code)
      loc = url

      limit = redirect_limit
      while response.is_a?(Net::HTTPRedirection) && limit > 0
        loc = URI(response['location'])
        loc = url.merge(loc) if loc.relative?
        response, response_time = get_response(loc, referer)
        limit -= 1
      end

      return response, code, loc, response_time
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent
    # and Referer headers. Returns the response and elapsed time in ms.
    # Retries up to 3 times on EOFError (stale keep-alive connection);
    # re-raises once retries are exhausted instead of returning nil.
    #
    def get_response(url, referer = nil)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"

      opts = {}
      opts['User-Agent'] = user_agent if user_agent
      opts['Referer'] = referer.to_s if referer

      retries = 0
      begin
        start = Time.now
        req = Net::HTTP::Get.new(full_path, opts)
        req.basic_auth url.user, url.password if url.user
        response = connection(url).request(req)
        finish = Time.now
        response_time = ((finish - start) * 1000).round
        return response, response_time
      rescue EOFError
        # keep-alive connection went stale; rebuild it and retry
        refresh_connection(url)
        retries += 1
        retry unless retries > 3
        # BUGFIX: previously fell through here and returned nil, which made
        # the caller crash with NoMethodError instead of surfacing the error
        raise
      end
    end

    # Return the cached connection for url's host:port, opening one if needed.
    def connection(url)
      @connections[url.host] ||= {}

      if conn = @connections[url.host][url.port]
        return conn
      end

      refresh_connection(url)
    end

    # (Re)open the connection for url's host:port, finishing any existing
    # one first so stale sockets are not leaked.
    def refresh_connection(url)
      pool = (@connections[url.host] ||= {})
      stale = pool[url.port]
      if stale && stale.started?
        begin
          stale.finish
        rescue IOError
          # already closed; nothing to clean up
        end
      end

      http = Net::HTTP.new(url.host, url.port)
      if url.scheme == 'https'
        http.use_ssl = true
        # NOTE(review): certificate verification is disabled, so any TLS
        # certificate is accepted — confirm this is intentional for crawling
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
      pool[url.port] = http.start
    end

    # Maximum redirects to follow (:redirect_limit option, default REDIRECT_LIMIT)
    def redirect_limit
      @opts[:redirect_limit] || REDIRECT_LIMIT
    end

    # User-Agent header value to send, if configured
    def user_agent
      @opts[:user_agent]
    end

    # Whether fetch_page should print exception details
    def verbose?
      @opts[:verbose]
    end

  end
end
|
data/lib/anemone/page.rb
ADDED
@@ -0,0 +1,155 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'ostruct'
|
3
|
+
|
4
|
+
module Anemone
  #
  # A single fetched page: its URL, response metadata, parsed HTML
  # document, and the in-domain links extracted from it.
  #
  class Page

    # The URL of the page
    attr_reader :url
    # Headers of the HTTP response
    attr_reader :headers

    # OpenStruct for user-stored data
    attr_accessor :data
    # Nokogiri document for the HTML body
    attr_accessor :doc
    # Integer response code of the page
    attr_accessor :code
    # Array of redirect-aliases for the page
    attr_accessor :aliases
    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageHash#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time

    #
    # Create a new page
    #
    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
      @url = url
      @code = code
      @headers = headers || {}
      @headers['content-type'] ||= [''] # guarantee content_type never raises
      @aliases = Array(aka)
      @data = OpenStruct.new
      @referer = referer
      @depth = depth || 0
      @response_time = response_time
      # parse errors are swallowed; @doc stays nil for unparseable bodies
      @doc = Nokogiri::HTML(body) if body && html? rescue nil
    end

    # Array of distinct A tag HREFs from the page (in-domain only),
    # memoized after the first call
    def links
      return @links unless @links.nil?
      @links = []
      return @links if !doc

      doc.css('a').each do |a|
        u = a.attributes['href'].content rescue nil
        next if u.nil? or u.empty?
        abs = to_absolute(URI(u)) rescue next
        @links << abs if in_domain?(abs)
      end
      @links.uniq!
      @links
    end

    # Drop the parsed document to free memory, keeping the extracted links
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = nil
    end

    #
    # Return a new page with the same *response* and *url*, but
    # with a 200 response code
    #
    def alias_clone(url)
      p = clone
      # NOTE(review): @aka is never assigned anywhere in this file, so this
      # branch is always skipped — confirm whether the intent was to record
      # *url* (or the constructor's aka argument) as an alias here
      p.add_alias!(@aka) if !@aka.nil?
      p.code = 200
      p
    end

    #
    # Add a redirect-alias String *aka* to the list of the page's aliases
    #
    # Returns *self*
    #
    def add_alias!(aka)
      @aliases << aka if !@aliases.include?(aka)
      self
    end

    #
    # Returns an Array of all links from this page, and all the
    # redirect-aliases of those pages, as String objects.
    #
    # *page_hash* is a PageHash object with the results of the current crawl.
    #
    def links_and_their_aliases(page_hash)
      links.inject([]) do |results, link|
        results.concat([link].concat(page_hash[link].aliases))
      end
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type'].first
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # BUGFIX: the '+' in xhtml+xml must be escaped; unescaped it acted as
      # a quantifier on 'l', so 'application/xhtml+xml' never matched
      !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      # remove anchor, then percent-escape unsafe characters.
      # URI::DEFAULT_PARSER.escape is the same operation URI.encode
      # performed before its removal in Ruby 3.0.
      link = URI::DEFAULT_PARSER.escape(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, ''))

      relative = URI(link)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      return absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end
  end
end
|
@@ -0,0 +1,142 @@
|
|
1
|
+
module Anemone
  #
  # Hash of crawled Pages. Keys may be given as URIs or Strings; they are
  # normalized to Strings internally.
  #
  class PageHash < Hash

    # We typically index the hash with a URI,
    # but convert it to a String for easier retrieval
    def [](index)
      super(index.to_s)
    end

    def []=(index, other)
      super(index.to_s, other)
    end

    def has_key?(key)
      super(key.to_s)
    end

    # Does this PageHash contain the specified URL?
    # HTTP and HTTPS versions of a URL are considered to be the same page.
    def has_page?(url)
      schemes = %w(http https)
      if schemes.include? url.scheme
        u = url.dup
        return schemes.any? { |s| u.scheme = s; has_key?(u) }
      end

      has_key?(url)
    end

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageHash
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root)

      # reset visit markers from any previous run
      each_value { |p| p.visited = false if p }

      q = Queue.new

      q.enq(root)
      self[root].depth = 0
      self[root].visited = true
      while(!q.empty?)
        url = q.deq

        next if !has_key?(url)

        page = self[url]

        page.links.each do |u|
          next if !has_key?(u) or self[u].nil?
          link = self[u]
          aliases = [link].concat(link.aliases.map {|a| self[a] })

          # relax the depth of the page and all of its redirect-aliases
          aliases.each do |node|
            if node.depth.nil? or page.depth + 1 < node.depth
              node.depth = page.depth + 1
            end
          end

          q.enq(self[u].url) if !self[u].visited
          self[u].visited = true
        end
      end

      self
    end

    #
    # Returns a new PageHash by removing redirect-aliases for each
    # non-redirect Page
    #
    def uniq
      results = PageHash.new
      each do |url, page|
        #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
        page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
        if !page.redirect? and !page_added
          results[url] = page.clone
          results[url].aliases = []
        end
      end

      results
    end

    #
    # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
    #
    def pages_linking_to(urls)
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      # BUGFIX: the previous code called urls.compact and discarded the
      # result, so unparseable URL strings left nil entries behind and
      # produced a bogus links[nil] bucket
      urls = urls.map do |url|
        if url.is_a?(String)
          URI(url) rescue nil
        else
          url
        end
      end.compact

      links = {}
      urls.each { |url| links[url] = [] }
      values.each do |page|
        urls.each { |url| links[url] << page if page.links.include?(url) }
      end

      if single and !links.empty?
        return links.first
      else
        return links
      end
    end

    #
    # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
    #
    def urls_linking_to(urls)
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      links = pages_linking_to(urls)
      links.each { |url, pages| links[url] = pages.map{|p| p.url} }

      if single and !links.empty?
        return links.first
      else
        return links
      end
    end

  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'anemone/http'
|
2
|
+
|
3
|
+
module Anemone
  #
  # Worker that pulls links off a shared queue, fetches each one over
  # HTTP, and pushes the resulting Page objects onto another shared queue.
  #
  class Tentacle

    #
    # Create a new Tentacle
    #
    def initialize(link_queue, page_queue, opts = {})
      @link_queue = link_queue
      @page_queue = page_queue
      @http = Anemone::HTTP.new(opts)
      @opts = opts
    end

    #
    # Gets links from @link_queue, and returns the fetched
    # Page objects into @page_queue. Stops when the :END
    # sentinel is dequeued.
    #
    def run
      link, from_page = @link_queue.deq
      until link == :END
        @page_queue.enq(@http.fetch_page(link, from_page))
        delay
        link, from_page = @link_queue.deq
      end
    end

    private

    # Sleep between requests when a :delay option was given
    def delay
      pause = @opts[:delay]
      sleep(pause) if pause
    end

  end
end
|