graboid 0.3.3 → 0.3.4

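Summary: this release introduces Graboid::Scraper and deprecates Graboid::Entity (including Entity now prints a deprecation warning). The new DSL uses selector in place of root (root remains an alias), set in place of field, and page_with in place of pager; scraped records come back as OpenStructs, and every remote fetch now sends a configurable User-Agent header. A minimal usage sketch based on the diff below (the Post class, the .post/.entry/.meta selectors, and the example.com URL are illustrative, not taken from the gem):

  require 'rubygems'
  require 'graboid'

  class Post
    include Graboid::Scraper          # Graboid::Entity still works, but warns

    selector '.post'                  # root selector for each record fragment
    set :title                        # selector defaults to '.title'
    set :body, :selector => '.entry'
    set :date, :selector => '.meta' do |elm|
      elm.text.strip                  # optional block post-processes the matched node
    end

    page_with do |doc|                # return the next page's URL, or nil to stop
      doc.css('a.next').first['href'] rescue nil
    end
  end

  Graboid.user_agent = 'MyBot/1.0'    # optional; defaults to 'Graboid'
  posts = Post.new(:source => 'http://example.com/posts').all(:max_pages => 2)
  posts.each { |post| puts post.title }   # each result is an OpenStruct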
data/VERSION CHANGED
@@ -1 +1 @@
-0.3.3
+0.3.4
examples/active_rain_post.rb CHANGED
@@ -4,11 +4,21 @@ require File.join(dir, 'graboid')
 class ActiveRainPost
   include Graboid::Entity
 
-  root '.blog_entry'
-
-  field :title, :selector => 'h2'
-
-  field :body, :selector => 'div' do |elm|
+  selector '.blog_entry_wrapper'
+
+  set :title, :selector => 'h2 a'
+  set :pub_date, :selector => '.blog_entry' do |elm|
+    # awesome, the pub date is not contained within
+    # the .blog_entry_wrapper fragment.
+    begin
+      entry_id = elm['id'].gsub('blog_entry_','')
+      date_text = self.doc.css("#divbei#{entry_id} td").select{|td| td.text =~ /posted by/i }.first.text
+      date_text.match(/(\d{2}\/\d{2}\/\d{4})/).captures.first
+    rescue
+      ""
+    end
+  end
+  set :body, :selector => 'div' do |elm|
     elm.css('p').collect(&:to_html)
   end
 
@@ -26,10 +36,10 @@ class ActiveRainPost
 end
 
 ActiveRainPost.source = 'http://activerain.com/blogs/elizabethweintraub'
-@posts = ActiveRainPost.all
+@posts = ActiveRainPost.all(:max_pages => 1)
 
 @posts.each do |post|
-  puts "#{post.title}"
+  puts "#{post.pub_date}"
   puts "*"*100
 end
 
examples/live_journal_post.rb CHANGED
@@ -6,9 +6,14 @@ class LiveJournalPost
 
   root '.entrybox'
 
-  field :title, :selector => '.caption a'
-  field :body, :selector => 'td[@colspan="2"]'
-  field :comment_link, :selector => '.caption a' do |elm|
+  field :title, :selector => '.caption a'
+  field :body, :selector => 'td[@colspan="2"]'
+
+  field :pub_date, :selector => 'td.index' do |elm|
+    elm.text.match(/\[(.*)\|/)[1]
+  end
+
+  field :comment_link, :selector => '.caption a' do |elm|
     elm['href']
   end
 
@@ -25,11 +30,11 @@ class LiveJournalPost
 
 end
 
-LiveJournalPost.source = 'http://zeroplate.livejournal.com/'
-@posts = LiveJournalPost.all(:max_pages => 3)
+LiveJournalPost.source = 'http://zeroplate.livejournal.com/'
+@posts = LiveJournalPost.all(:max_pages => 3)
 
 @posts.each do |post|
-  puts "#{post.title}"
+  puts "#{post.pub_date} - #{post.title}"
   puts "#{post.comment_link}"
   puts "#{post.body}"
   puts "*"*100
examples/ning_post.rb CHANGED
@@ -1,7 +1,8 @@
-%w{rubygems graboid}.each {|f| require f }
+dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
+require File.join(dir, 'graboid')
 
 class NingPost
-  include Graboid::Entity
+  include Graboid::Scraper
 
   selector 'div.xg_blog .xg_module_body'
 
@@ -25,10 +26,10 @@ class NingPost
     # ning's list page only has an excerpt of the body. No biggie,
     # we'll just go grab it.
     show_url = elm.css('a').last["href"]
-    Nokogiri::HTML(open(show_url)).css('.postbody').to_html
+    Nokogiri::HTML(open(show_url,"User-Agent" => Graboid.user_agent)).css('.postbody').to_html
   end
 
-  pager do |doc|
+  page_with do |doc|
    doc.css('.pagination a').select{|a| a.text =~ /previous/i }.first['href'] rescue nil
   end
 
@@ -45,8 +46,8 @@ class NingPost
 
 end
 
-NingPost.source = 'http://cuwebd.ning.com/profiles/blog/list'
-@posts = NingPost.all(:max_pages => 1)
+NING_URL = 'http://www.friendsorenemies.com/profiles/blog/list?user=3vx1daeuxrt14'
+@posts = NingPost.new( :source => NING_URL ).all(:max_pages => 2)
 
 @posts.each do |post|
   puts "#{post.pub_date} -- #{post.title}"
graboid.gemspec CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{graboid}
-  s.version = "0.3.3"
+  s.version = "0.3.4"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Christopher Burnett"]
-  s.date = %q{2010-06-15}
+  s.date = %q{2010-06-16}
   s.description = %q{web scraping made easier}
   s.email = %q{signalstatic@gmail.com}
   s.extra_rdoc_files = [
@@ -29,11 +29,13 @@ Gem::Specification.new do |s|
     "graboid.gemspec",
     "lib/graboid.rb",
     "lib/graboid/entity.rb",
+    "lib/graboid/scraper.rb",
     "spec/fixtures/graboid.jpg",
     "spec/fixtures/posts.html",
     "spec/fixtures/server.rb",
     "spec/fixtures/views/posts.erb",
     "spec/graboid/entity_spec.rb",
+    "spec/graboid/scraper_spec.rb",
     "spec/graboid_spec.rb",
     "spec/spec.opts",
     "spec/spec_helper.rb"
@@ -46,6 +48,7 @@ Gem::Specification.new do |s|
   s.test_files = [
     "spec/fixtures/server.rb",
     "spec/graboid/entity_spec.rb",
+    "spec/graboid/scraper_spec.rb",
     "spec/graboid_spec.rb",
     "spec/spec_helper.rb",
     "examples/active_rain_post.rb",
lib/graboid.rb CHANGED
@@ -1,9 +1,18 @@
-%w{rubygems nokogiri open-uri active_support}.each { |f| require f }
+%w{rubygems nokogiri open-uri active_support ostruct}.each { |f| require f }
 
 dir = Pathname(__FILE__).dirname.expand_path
 
 require dir + 'graboid/entity'
-
+require dir + 'graboid/scraper'
 
 module Graboid
+  extend self
+
+  def user_agent
+    @user_agent ||= 'Graboid'
+  end
+
+  def user_agent=(agent)
+    @user_agent = agent
+  end
 end
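Both Entity and Scraper now pass Graboid.user_agent to open-uri on every remote fetch. A two-line sketch of overriding the default (the UA string is illustrative):

  Graboid.user_agent                # => "Graboid" (the default)
  Graboid.user_agent = 'Mozilla/5.0 (compatible; MyBot/0.1)'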
lib/graboid/entity.rb CHANGED
@@ -5,7 +5,7 @@ module Graboid
     klass.class_eval do
       extend ClassMethods
       include InstanceMethods
-
+      warn "Deprecation Warning! Graboid::Entity - This module has been deprecated. See Graboid::Scraper."
       write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
     end
   end
@@ -67,7 +67,7 @@ module Graboid
       attribute_map.inject({}) do |extracted_hash, at|
         selector, processor = at.last[:selector], at.last[:processor]
         node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
-        extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) rescue ""
+        extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) #rescue ""
 
         extracted_hash
       end
@@ -119,7 +119,7 @@ module Graboid
     def read_source
       case self.source
       when /^http[s]?:\/\//
-        open self.source
+        open(self.source, "User-Agent" => Graboid.user_agent)
       when String
         self.source
       end
lib/graboid/scraper.rb ADDED
@@ -0,0 +1,204 @@
+module Graboid
+  module Scraper
+    def self.included klass
+      klass.class_eval do
+        extend ClassMethods
+        include InstanceMethods
+
+        write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
+        write_inheritable_attribute(:callbacks, {}) if callbacks.nil?
+      end
+    end
+
+    module ClassMethods
+
+      def attribute_map
+        read_inheritable_attribute :attribute_map
+      end
+
+      def callbacks
+        read_inheritable_attribute :callbacks
+      end
+
+      def inferred_selector
+        @inferred_selector ||= ".#{self.to_s.underscore}"
+      end
+
+      def page_with &block
+        @pager = block
+      end
+
+      def pager
+        @pager
+      end
+
+      def root_selector
+        @root_selector || inferred_selector
+      end
+
+      def selector selector
+        @root_selector = selector
+      end
+
+      alias_method :root, :selector
+
+      def set name, opts={}, &block
+        opts.merge!(:selector => ".#{name}") unless opts[:selector].present?
+        opts.merge!(:processor => block) if block_given?
+
+        attribute_map[name] = opts
+      end
+
+      [:before, :after].each do |prefix|
+        [:paginate, :extract].each do |suffix|
+          method_name = "#{prefix}_#{suffix}"
+          define_method method_name.to_sym do |&block|
+            self.callbacks["#{method_name}".to_sym] = block
+          end
+        end
+      end
+
+    end
+
+    module InstanceMethods
+      def initialize opts={}, &block
+        raise ArgumentError unless opts[:source].present?
+        self.source = opts[:source]
+      end
+
+      def all opts={}, reload=false
+        return self.collection if reload and !self.collection.empty?
+        reset_context
+        self.max_pages = opts[:max_pages] if opts[:max_pages].present?
+        all_fragments.collect{ |frag| extract_instance(frag) }
+      end
+
+      alias_method :scrape, :all
+
+      def all_fragments
+        return page_fragments if self.class.pager.nil?
+        old_source = self.source
+
+        while next_page?
+          self.collection += page_fragments
+          run_before_paginate_callbacks
+          paginate
+          run_after_paginate_callbacks
+        end
+
+        self.source = old_source
+        self.collection
+      end
+
+      def attribute_map
+        self.class.attribute_map
+      end
+
+      def callbacks
+        self.class.callbacks
+      end
+
+      def collection
+        @collection ||= []
+      end
+
+      def collection=(col)
+        @collection = col
+      end
+
+      def current_page
+        @current_page ||= 0
+      end
+
+      def current_page=num
+        @current_page = num
+      end
+
+      def doc
+        eval "Nokogiri::#{self.mode.to_s.upcase}(read_source)"
+      end
+
+      def extract_instance fragment
+        OpenStruct.new(hash_map fragment)
+      end
+
+      def hash_map fragment
+        attribute_map.inject({}) do |extracted_hash, at|
+          selector, processor = at.last[:selector], at.last[:processor]
+          node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
+          extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) #rescue ""
+
+          extracted_hash
+        end
+      end
+
+      def max_pages
+        @max_pages ||= 0
+      end
+
+      def max_pages=num
+        @max_pages = num
+      end
+
+      def mode
+        @mode ||= :html
+      end
+
+      def mode=(m)
+        raise ArgumentError unless [:html, :xml].include?(m)
+        @mode = m
+      end
+
+      def next_page?
+        if max_pages.zero?
+          return true unless self.class.pager.call(doc).nil?
+        else
+          current_page <= max_pages-1
+        end
+      end
+
+      def page_fragments
+        doc.css(self.class.root_selector)
+      end
+
+      def paginate
+        next_page_url = self.class.pager.call(doc) rescue nil
+        self.source = next_page_url
+        self.current_page += 1
+      end
+
+      def read_source
+        case self.source
+        when /^http[s]?:\/\//
+          open(self.source ,"User-Agent" => Graboid.user_agent)
+        when String
+          self.source
+        end
+      end
+
+      def reset_context
+        self.collection = []
+        self.current_page = 0
+        self.max_pages = 0
+      end
+
+      def source
+        @source
+      end
+
+      def source=(src)
+        @source = src
+      end
+
+      [:before, :after].each do |prefix|
+        [:paginate, :extract].each do |suffix|
+          method_name = "#{prefix}_#{suffix}"
+          define_method "run_#{method_name}_callbacks" do
+            self.instance_eval &callbacks[method_name.to_sym] if callbacks[method_name.to_sym].present?
+          end
+        end
+      end
+
+    end
+  end
+end
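In addition to page_with, the Scraper DSL above generates before_paginate, after_paginate, before_extract and after_extract class methods for registering hooks; note that only the paginate hooks are invoked by all_fragments in this version. The blocks are instance_eval'd against the scraper instance, so methods like source and current_page are in scope. A sketch under those assumptions (ItemScraper and the '.item' selector are illustrative):

  class ItemScraper
    include Graboid::Scraper

    selector '.item'
    set :name

    page_with do |doc|
      doc.css('a.next').first['href'] rescue nil   # nil ends pagination
    end

    before_paginate do
      # runs before each page flip, in instance scope
      puts "finished page #{current_page}: #{source}"
    end
  end

When no :max_pages is given, max_pages stays 0 and next_page? keeps returning true until the pager block yields nil, which is what the "without a limit" spec below exercises.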
spec/graboid/scraper_spec.rb ADDED
@@ -0,0 +1,195 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+class MockScraper
+  include Graboid::Scraper
+
+  set :title
+  set :body
+  set :author
+  set :date, :selector => '.author' do |elm|
+    elm.text.match(/\((.*)\)/)[1]
+  end
+end
+
+class WorkingScraper
+  include Graboid::Scraper
+
+  selector '.post'
+
+  set :title
+  set :body
+  set :author
+  set :date, :selector => '.author' do |elm|
+    elm.text.match(/\((.*)\)/)[1]
+  end
+end
+
+class ScraperWithPager
+  include Graboid::Scraper
+
+  selector '.post'
+
+  set :title
+  set :body
+  set :author
+  set :date, :selector => '.author' do |elm|
+    elm.text.match(/\((.*)\)/)[1]
+  end
+
+  page_with do |doc|
+    'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
+  end
+
+  before_paginate do
+    puts "page: #{source}"
+  end
+
+end
+
+
+
+describe Graboid::Scraper do
+  describe "#root_selector" do
+    it "should be set" do
+      MockScraper.root_selector.should == '.mock_scraper'
+    end
+
+    describe "when inferred from class" do
+
+      before(:each) do
+        class Phony; include Graboid::Scraper; end
+      end
+
+      it "should infer .phony" do
+        Phony.root_selector.should == '.phony'
+      end
+    end
+  end
+
+  describe "#set" do
+    describe "simple syntax" do
+
+      before(:each) do
+        MockScraper.set :body
+      end
+
+      it "should be set in the attr map" do
+        MockScraper.attribute_map[:body].should be_a Hash
+      end
+
+      it "should set the selector" do
+        MockScraper.attribute_map[:body][:selector].should == '.body'
+      end
+    end
+
+    describe "custom selector syntax" do
+      before(:each) do
+        MockScraper.set :body, :selector => '.custom'
+      end
+
+      it "should set the selector" do
+        MockScraper.attribute_map[:body][:selector].should == '.custom'
+      end
+    end
+
+    describe "custom selector syntax with a lambda" do
+
+      before(:each) do
+        MockScraper.set :body, :selector => '.custom' do |item|
+          "from lambda"
+        end
+      end
+
+      it "should set the selector" do
+        MockScraper.attribute_map[:body][:selector].should == '.custom'
+      end
+
+      it "should set the processor" do
+        MockScraper.attribute_map[:body][:processor].should be_a Proc
+      end
+
+    end
+  end
+
+  describe "#new" do
+    describe "when supplied a source" do
+      before(:each) do
+        @scraper = WorkingScraper.new( :source => TEST_SERVER_URL )
+      end
+
+      it "should have the correct attribute_map" do
+        @scraper.attribute_map[:body][:selector].should == '.body'
+      end
+
+      it "should set the instance source" do
+        @scraper.source.should == TEST_SERVER_URL
+      end
+
+      it "should set the doc source" do
+        @scraper.doc.should be_a Nokogiri::HTML::Document
+      end
+    end
+
+    describe "#all_fragments" do
+      before(:each) do
+        @scraper = WorkingScraper.new( :source => POSTS_HTML_STR )
+        @fragments = @scraper.all_fragments
+      end
+
+      it "should return the NodeSet" do
+        @fragments.should be_a Nokogiri::XML::NodeSet
+      end
+
+      it "should have 2 results" do
+        @fragments.count.should == 2
+      end
+    end
+
+    describe "#all" do
+      before(:each) do
+        @scraper = WorkingScraper.new( :source => POSTS_HTML_STR )
+      end
+
+      it "should return 2 WorkingPosts" do
+        @scraper.all(:max_pages => 3).length.should == 2
+      end
+
+      [:current_page, :max_pages].each do |m|
+        describe "##{m}" do
+          it "should be 0 by default" do
+            @scraper.send(m).should == 0
+          end
+          it "should be 3" do
+            @scraper.send("#{m}=",3)
+            @scraper.send(m).should == 3
+          end
+        end
+      end
+
+    end
+
+    describe "#page_with" do
+      describe "with a limit" do
+        before(:each) do
+          @scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
+          @posts = @scraper.all(:max_pages => 3)
+        end
+        it "should get 6 posts" do
+          @posts.length.should == 6
+        end
+      end
+
+      describe "without a limit" do
+        before(:each) do
+          @scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
+          @posts = @scraper.all
+        end
+        it "should get 16 posts" do
+          @posts.length.should == 16
+        end
+      end
+
+    end
+
+  end
+end
spec/spec_helper.rb CHANGED
@@ -53,3 +53,5 @@ class PostWithPager
   end
 
 end
+
+TEST_SERVER_URL = 'http://localhost:9393/posts'
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: graboid
 version: !ruby/object:Gem::Version
-  hash: 21
+  hash: 27
   prerelease: false
   segments:
   - 0
   - 3
-  - 3
-  version: 0.3.3
+  - 4
+  version: 0.3.4
 platform: ruby
 authors:
 - Christopher Burnett
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-06-15 00:00:00 -07:00
+date: 2010-06-16 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -84,11 +84,13 @@ files:
 - graboid.gemspec
 - lib/graboid.rb
 - lib/graboid/entity.rb
+- lib/graboid/scraper.rb
 - spec/fixtures/graboid.jpg
 - spec/fixtures/posts.html
 - spec/fixtures/server.rb
 - spec/fixtures/views/posts.erb
 - spec/graboid/entity_spec.rb
+- spec/graboid/scraper_spec.rb
 - spec/graboid_spec.rb
 - spec/spec.opts
 - spec/spec_helper.rb
@@ -129,6 +131,7 @@ summary: web scraping made easy
 test_files:
 - spec/fixtures/server.rb
 - spec/graboid/entity_spec.rb
+- spec/graboid/scraper_spec.rb
 - spec/graboid_spec.rb
 - spec/spec_helper.rb
 - examples/active_rain_post.rb