RubyGems - anemone - Versions diffs - 0.0.1 → 0.0.2 - Mend

anemone 0.0.1 → 0.0.2

Files changed (4)

data/lib/anemone/anemone.rb CHANGED

@@ -1,16 +1,37 @@
+require 'ostruct'
 require 'anemone/core'
 module Anemone
   # Version number
-  VERSION = '0.0.1'
+  VERSION = '0.0.2'
   # User-Agent string used for HTTP requests
   USER_AGENT = "Anemone/#{self::VERSION}"
+  #module-wide options
+  def Anemone.options=(options)
+    @options = options
+  end
+  def Anemone.options
+    @options
+  end
   #
   # Convenience method to start a crawl using Core
   #
   def Anemone.crawl(url, options = {}, &block)
-    Core.crawl(url, options, &block)
+    Anemone.options = OpenStruct.new(options)
+	#by default, run 4 Tentacle threads to fetch pages
+    Anemone.options.threads ||= 4
+	#disable verbose output by default
+    Anemone.options.verbose ||= false
+	#by default, throw away the page response body after scanning it for links, to save memory
+	Anemone.options.discard_page_bodies ||= true
+    Core.crawl(url, &block)
   end
 end

data/lib/anemone/core.rb CHANGED

@@ -11,10 +11,9 @@ module Anemone
     #
     # Initialize the crawl with a starting *url*, *options*, and optional *block*
     #
-    def initialize(url, options={}, &block)
+    def initialize(url, &block)
       url = URI(url) if url.is_a?(String)
       @url = url
-      @options = options
       @tentacles = []
       @pages = PageHash.new
       @on_every_page_blocks = []
@@ -22,17 +21,14 @@ module Anemone
       @skip_link_patterns = []
       @after_crawl_blocks = []
-      @options[:threads] ||= 4
-      @options[:verbose] ||= false
       block.call(self) if block
     end
     #
     # Convenience method to start a new crawl
     #
-    def self.crawl(root, options={}, &block)
-      self.new(root, options) do |core|
+    def self.crawl(root, &block)
+      self.new(root) do |core|
         block.call(core) if block
         core.run
         core.do_after_crawl_blocks
@@ -91,7 +87,7 @@ module Anemone
       link_queue = Queue.new
       page_queue = Queue.new
-      @options[:threads].times do |id|
+      Anemone.options.threads.times do |id|
         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
       end
@@ -104,7 +100,7 @@ module Anemone
         @pages[page.url] = page
-        puts "#{page.url} Queue: #{link_queue.size}" if @options[:verbose]
+        puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
         do_page_blocks(page)

data/lib/anemone/page.rb CHANGED

@@ -7,9 +7,13 @@ module Anemone
     attr_reader :url
     # Array of distinct A tag HREFs from the page
     attr_reader :links
-    # Integer response code of the page
-    attr_reader :code
+	#Body of the HTTP response
+	attr_reader :body
+	#Content-type of the  HTTP response
+	attr_reader :content_type
+    # Integer response code of the page
+    attr_accessor :code
     # Array of redirect-aliases for the page
     attr_accessor :aliases
     # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
@@ -31,7 +35,7 @@ module Anemone
           aka = location
         end
-        return Page.new(url, response, code, aka)
+        return Page.new(url, response.body, code, response['Content-Type'], aka)
       rescue
         return Page.new(url)
       end
@@ -40,18 +44,19 @@ module Anemone
     #
     # Create a new page
     #
-    def initialize(url, response = nil, code = nil, aka = nil)
+    def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
       @url = url
-      @response = response
+	  @body = body unless Anemone.options.discard_page_bodies
       @code = code
+	  @content_type = content_type
       @links = []
       @aliases = []
       @aliases << aka if !aka.nil?
       #get a list of distinct links on the page, in absolute url form
-      if @response and @response.body
-        Hpricot(@response.body).search('a').each do |a|
+      if body
+        Hpricot(body).search('a').each do |a|
           u = a['href']
           next if u.nil?
@@ -75,7 +80,10 @@ module Anemone
     # with a 200 response code
     #
     def alias_clone(url)
-      Page.new(url, @response, 200, @url)
+      p = clone
+	  p.add_alias!(@aka) if !@aka.nil?
+	  p.code = 200
+	  p
     end
     #
@@ -99,27 +107,13 @@ module Anemone
         results.concat([link].concat(page_hash[link].aliases))
       end
     end
-    #
-    # Returns the response body for the page
-    #
-    def body
-      @response.body
-    end
-    #
-    # Returns the +Content-Type+ header for the page
-    #
-    def content_type
-      @response['Content-Type']
-    end
     #
     # Returns +true+ if the page is a HTML document, returns +false+
     # otherwise.
     #
     def html?
-      (content_type =~ /text\/html/) == 0
+      (@content_type =~ /text\/html/) == 0
     end
     #

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: anemone
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-04-14 00:00:00 -05:00
+date: 2009-04-30 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency