RubyGems - anemone - Versions diffs - 0.0.3 → 0.0.4 - Mend

anemone 0.0.3 → 0.0.4

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (7)

data/README.txt CHANGED Viewed

@@ -12,7 +12,8 @@ write your own specialized spider tasks quickly and easily.
 * Allows exclusion of URLs based on regular expressions
 == REQUIREMENTS
-* hpricot
+* nokogiri
+* facets
 == EXAMPLES
-See the +bin+ directory for several examples of useful Anemone tasks.
+See the +bin+ directory for several examples of useful Anemone tasks.

data/bin/anemone_url_list.rb CHANGED Viewed

@@ -43,7 +43,7 @@ opts = OptionParser.new
 opts.on('-r', '--relative') { options.relative = true }
 opts.parse!(ARGV)
-Anemone.crawl(ARGV.last) do |anemone|
+Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
   anemone.on_every_page do |page|
     if options.relative
       puts page.url.path

data/lib/anemone/anemone.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require 'anemone/core'
 module Anemone
   # Version number
-  VERSION = '0.0.2'
+  VERSION = '0.0.4'
   # User-Agent string used for HTTP requests
   USER_AGENT = "Anemone/#{self::VERSION}"
@@ -23,15 +23,15 @@ module Anemone
   def Anemone.crawl(url, options = {}, &block)
     Anemone.options = OpenStruct.new(options)
-	#by default, run 4 Tentacle threads to fetch pages
+    #by default, run 4 Tentacle threads to fetch pages
     Anemone.options.threads ||= 4
-	#disable verbose output by default
+    #disable verbose output by default
     Anemone.options.verbose ||= false
-	#by default, don't throw away the page response body after scanning it for links
-	Anemone.options.discard_page_bodies ||= false
+    #by default, don't throw away the page response body after scanning it for links
+    Anemone.options.discard_page_bodies ||= false
     Core.crawl(url, &block)
   end
-end
+end

data/lib/anemone/core.rb CHANGED Viewed

@@ -103,6 +103,8 @@ module Anemone
         puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
         do_page_blocks(page)
+        page.doc = nil if Anemone.options.discard_page_bodies
         page.links.each do |link|
           if visit_link?(link)
@@ -131,7 +133,7 @@ module Anemone
         end
       end
       @tentacles.each { |t| t.join }
       self

data/lib/anemone/page.rb CHANGED Viewed

@@ -1,25 +1,20 @@
 require 'anemone/http'
-require 'hpricot'
+require 'nokogiri'
+require 'facets/ostructable'
 module Anemone
   class Page
+    include OpenStructable
     # The URL of the page
     attr_reader :url
     # Array of distinct A tag HREFs from the page
     attr_reader :links
-    #Body of the HTTP response
-    attr_reader :body
     #Content-type of the  HTTP response
     attr_reader :content_type
-    #title of the page if it is an HTML document
-    attr_reader :title
-    #first h1 on the page, if present
-    attr_reader :h1
-    #first h2 on the page, if present
-    attr_reader :h2
-    #meta-description of the page, if present
-    attr_reader :description
+    #Nokogiri document for the HTML body
+    attr_accessor :doc
     # Integer response code of the page
     attr_accessor :code
     # Array of redirect-aliases for the page
@@ -54,36 +49,28 @@ module Anemone
     #
     def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
       @url = url
-      @body = body unless Anemone.options.discard_page_bodies
       @code = code
       @content_type = content_type
       @links = []
       @aliases = []
+	  #create empty storage for OpenStructable
+      update({})
       @aliases << aka if !aka.nil?
       if body
-        h = Hpricot(body)
-        #save page title
-        title_elem = h.at('title')
-        @title = title_elem.inner_html if !title_elem.nil?
-        #save page h1
-        h1_elem = h.at('h1')
-        @h1 = h1_elem.inner_html if !h1_elem.nil?
-        #save page h2
-        h2_elem = h.at('h2')
-        @h2 = h2_elem.inner_html if !h2_elem.nil?
+        begin
+          @doc = Nokogiri::HTML(body)
+        rescue
+          return
+        end
-        #save page meta-description
-        description_elem = h.at('meta[@name=description]')
-        @description = description_elem['content'] if !description_elem.nil?
+        return if @doc.nil?
         #get a list of distinct links on the page, in absolute url form
-        h.search('a').each do |a|
-          u = a['href']
+        @doc.css('a').each do |a|
+          u = a.attribute('href')
           next if u.nil?
           begin
@@ -106,9 +93,9 @@ module Anemone
     #
     def alias_clone(url)
       p = clone
-	    p.add_alias!(@aka) if !@aka.nil?
-	    p.code = 200
-	    p
+	  p.add_alias!(@aka) if !@aka.nil?
+	  p.code = 200
+	  p
     end
     #

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: anemone
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 platform: ruby
 authors:
 - Chris Kite
@@ -9,18 +9,28 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-05-31 00:00:00 -05:00
+date: 2009-06-12 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: hpricot
+  name: nokogiri
   type: :runtime
   version_requirement:
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.7.0
+        version: 1.3.0
+    version:
+- !ruby/object:Gem::Dependency
+  name: facets
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 2.5.0
     version:
 description:
 email:
@@ -35,23 +45,23 @@ extensions: []
 extra_rdoc_files:
 - README.txt
 files:
-- bin/anemone_url_list.rb~
-- bin/anemone_url_list.rb
-- bin/anemone_serialize.rb
 - bin/anemone_pagedepth.rb
-- bin/anemone_count.rb
+- bin/anemone_url_list.rb
 - bin/anemone_cron.rb
-- lib/anemone.rb
-- lib/anemone
+- bin/anemone_count.rb
+- bin/anemone_serialize.rb
+- lib/anemone/tentacle.rb
 - lib/anemone/page.rb
+- lib/anemone/page_hash.rb
 - lib/anemone/core.rb
-- lib/anemone/anemone.rb
 - lib/anemone/http.rb
-- lib/anemone/tentacle.rb
-- lib/anemone/page_hash.rb
+- lib/anemone/anemone.rb
+- lib/anemone.rb
 - README.txt
 has_rdoc: true
 homepage: http://anemone.rubyforge.org
+licenses: []
 post_install_message:
 rdoc_options:
 - -m
@@ -75,9 +85,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: anemone
-rubygems_version: 1.3.1
+rubygems_version: 1.3.4
 signing_key:
-specification_version: 2
+specification_version: 3
 summary: Anemone web-spider framework
 test_files: []

data/bin/anemone_url_list.rb~ DELETED Viewed

@@ -1,58 +0,0 @@
-#! /usr/bin/env ruby
-# == Synopsis
-#   Crawls a site starting at the given URL, and outputs the URL of each page
-#   in the domain as they are encountered.
-#
-# == Usage
-#   anemone_url_list.rb [options] url
-#
-# == Options
-#   -r, --relative          Output relative URLs (rather than absolute)
-#
-# == Author
-#   Chris Kite
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-require 'anemone'
-require 'optparse'
-require 'ostruct'
-def usage
-  puts <<END
-Usage: anemone_url_list.rb [options] url
-Options:
-  -r, --relative      Output relative URLs (rather than absolute)
-END
-end
-options = OpenStruct.new
-options.relative = false
-# make sure that the last option is a URL we can crawl
-begin
-  URI(ARGV.last)
-rescue
-  usage
-  Process.exit
-end
-# parse command-line options
-opts = OptionParser.new
-opts.on('-r', '--relative') { options.relative = true }
-opts.parse!(ARGV)
-Anemone.crawl(ARGV.last) do |anemone|
-  anemone.on_pages_like(/\/about\//, /\/experience\//) do |page|
-    puts "WOOZLE #{page.url}"
-  end
-  anemone.on_every_page do |page|
-    if options.relative
-      puts page.url.path
-    else
-      puts page.url
-    end
-  end
-end