RubyGems - graboid - Versions diffs - 0.3.4 → 0.3.5 - Mend

graboid 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

data/README.mdown CHANGED Viewed

@@ -4,6 +4,10 @@
   Simply awesome web scraping. Better docs later. See specs.
+### 0.3.4 Update ###
+[http://twoism.posterous.com/new-graboid-dsl](http://twoism.posterous.com/new-graboid-dsl, "New DSL")
 ### Installation ###
@@ -15,7 +19,7 @@
     %w{rubygems graboid}.each { |f| require f }
     class RedditEntry
-      include Graboid::Entity
+      include Graboid::Scraper
       selector '.entry'
@@ -26,7 +30,7 @@
         entry.css('a').first['href']
       end
-      pager do |doc|
+      page_with do |doc|
         doc.css('p.nextprev a').select{|a| a.text =~ /next/i  }.first['href']
       end
@@ -38,9 +42,9 @@
     end
-    RedditEntry.source = 'http://reddit.com'
+    @posts = RedditEntry.new( :source => 'http://reddit.com' ).all( :max_pages => 2 )
-    RedditEntry.all(:max_pages => 5).each do |p|
+    @posts.each do |p|
       puts "title: #{p.title}"
       puts "domain: #{p.domain}"
       puts "link: #{p.link}"

data/VERSION CHANGED Viewed

@@ -1 +1 @@
-0.3.4
+0.3.5

data/examples/active_rain_post.rb CHANGED Viewed

@@ -35,8 +35,8 @@ class ActiveRainPost
 end
-ActiveRainPost.source = 'http://activerain.com/blogs/elizabethweintraub'
-@posts  = ActiveRainPost.all(:max_pages => 1)
+ActiveRainPost.source = 'http://activerain.com/blogs/danawilkinson'
+@posts  = ActiveRainPost.all(:max_pages => 100)
 @posts.each do |post|
   puts "#{post.pub_date}"

data/examples/ning_post.rb CHANGED Viewed

@@ -46,8 +46,8 @@ class NingPost
 end
-NING_URL = 'http://www.friendsorenemies.com/profiles/blog/list?user=3vx1daeuxrt14'
-@posts = NingPost.new( :source => NING_URL ).all(:max_pages => 2)
+NING_URL = 'http://vstar650.ning.com/profiles/blog/list'
+@posts = NingPost.new( :source => NING_URL ).all(:max_pages => 10)
 @posts.each do |post|
   puts "#{post.pub_date} -- #{post.title}"

data/examples/reddit_post.rb ADDED Viewed

@@ -0,0 +1,35 @@
+dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
+require File.join(dir, 'graboid')
+class RedditEntry
+  include Graboid::Scraper
+  selector '.entry'
+  set :title
+  set :domain, :selector => '.domain a'
+  set :link,   :selector => '.title' do |entry|
+    entry.css('a').first['href']
+  end
+  page_with do |doc|
+    self.doc.css('p.nextprev a').select{|a| a.text =~ /next/i  }.first['href']
+  end
+  before_paginate do
+    puts "opening page: #{self.source}"
+    puts "collection size: #{self.collection.length}"
+    puts "#{"*"*100}"
+  end
+end
+@posts = RedditEntry.new( :source => 'http://reddit.com' ).all( :max_pages => 3 )
+@posts.each do |p|
+  puts "title: #{p.title}"
+  puts "domain: #{p.domain}"
+  puts "link: #{p.link}"
+  puts "#{"*"*100}"
+end

data/examples/tumblr_post.rb ADDED Viewed

@@ -0,0 +1,33 @@
+dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
+require File.join(dir, 'graboid')
+class TumblrEntry
+  include Graboid::Scraper
+  TUMBLR_CHUNK_SIZE = 20
+  selector 'post'
+  set :title, :selector => 'regular-title'
+  page_with do |doc|
+    next_tumblr_page
+  end
+  def next_tumblr_page
+    return nil if self.doc.css('post').empty?
+    "#{self.original_source}?start=#{self.current_page*TUMBLR_CHUNK_SIZE}"
+  end
+  before_paginate do
+    puts "opening page: #{self.source}"
+    puts "collection size: #{self.collection.length}"
+    puts "#{"*"*100}"
+  end
+end
+@posts = TumblrEntry.new( :source => 'http://chrisburnett.tumblr.com/api/read' ).all
+@posts.each do |p|
+  puts "title: #{p.title}"
+end

data/graboid.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{graboid}
-  s.version = "0.3.4"
+  s.version = "0.3.5"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Christopher Burnett"]
-  s.date = %q{2010-06-16}
+  s.date = %q{2010-07-09}
   s.description = %q{web scraping made easier}
   s.email = %q{signalstatic@gmail.com}
   s.extra_rdoc_files = [
@@ -53,7 +53,9 @@ Gem::Specification.new do |s|
      "spec/spec_helper.rb",
      "examples/active_rain_post.rb",
      "examples/live_journal_post.rb",
-     "examples/ning_post.rb"
+     "examples/ning_post.rb",
+     "examples/reddit_post.rb",
+     "examples/tumblr_post.rb"
   ]
   if s.respond_to? :specification_version then

data/lib/graboid.rb CHANGED Viewed

@@ -9,7 +9,7 @@ module Graboid
   extend self
   def user_agent
-    @user_agent ||= 'Graboid'
+    @user_agent ||= 'Foo'
   end
   def user_agent=(agent)

data/lib/graboid/scraper.rb CHANGED Viewed

@@ -25,11 +25,9 @@ module Graboid
       end
       def page_with &block
-        @pager = block
-      end
-      def pager
-        @pager
+        define_method :pager do
+          instance_eval &block
+        end
       end
       def root_selector
@@ -76,7 +74,8 @@ module Graboid
       alias_method :scrape, :all
       def all_fragments
-        return page_fragments if self.class.pager.nil?
+        return page_fragments unless self.respond_to?(:pager)
+        return page_fragments if self.pager(self.doc).nil?
         old_source = self.source
         while next_page?
@@ -151,18 +150,22 @@ module Graboid
       def next_page?
         if max_pages.zero?
-          return true unless self.class.pager.call(doc).nil?
+          return true unless self.pager(doc).nil?
         else
           current_page <= max_pages-1
         end
       end
+      def original_source
+        @original_source
+      end
       def page_fragments
         doc.css(self.class.root_selector)
       end
       def paginate
-        next_page_url = self.class.pager.call(doc) rescue nil
+        next_page_url = self.pager(doc)
         self.source   = next_page_url
         self.current_page += 1
       end
@@ -182,11 +185,16 @@ module Graboid
         self.max_pages    = 0
       end
+      def host
+        self.source.scan(/http[s]?:\/\/.*\//).first
+      end
       def source
         @source
       end
       def source=(src)
+        @original_source = src if @original_source.nil?
         @source = src
       end

data/spec/graboid/scraper_spec.rb CHANGED Viewed

@@ -37,7 +37,7 @@ class ScraperWithPager
   end
   page_with do |doc|
-    'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
+    'http://localhost:9393'+self.doc.css('a.next').first['href'] rescue nil
   end
   before_paginate do
@@ -172,16 +172,24 @@ describe Graboid::Scraper do
       describe "with a limit" do
         before(:each) do
           @scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
+          #@scraper.expects(:run_before_paginate_callbacks).times(3)
           @posts = @scraper.all(:max_pages => 3)
         end
+        it "should set the callback" do
+          @scraper.callbacks[:before_paginate].should be_a Proc
+        end
         it "should get 6 posts" do
           @posts.length.should == 6
         end
       end
       describe "without a limit" do
         before(:each) do
           @scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
+          @scraper.expects(:run_before_paginate_callbacks).times(8)
           @posts = @scraper.all
         end
         it "should get 16 posts" do

data/spec/spec.opts CHANGED Viewed

@@ -1,3 +1,4 @@
 --colour
 --format nested
 --loadby mtime
+--backtrace

data/spec/spec_helper.rb CHANGED Viewed

@@ -5,7 +5,7 @@ require 'spec'
 require 'spec/autorun'
 Spec::Runner.configure do |config|
+  config.mock_with :mocha
 end
 FIXTURE_PATH    = File.expand_path(File.dirname(__FILE__)+'/fixtures/posts.html')

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: graboid
 version: !ruby/object:Gem::Version
-  hash: 27
+  hash: 25
   prerelease: false
   segments:
   - 0
   - 3
-  - 4
-  version: 0.3.4
+  - 5
+  version: 0.3.5
 platform: ruby
 authors:
 - Christopher Burnett
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-06-16 00:00:00 -07:00
+date: 2010-07-09 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -94,6 +94,8 @@ files:
 - spec/graboid_spec.rb
 - spec/spec.opts
 - spec/spec_helper.rb
+- examples/reddit_post.rb
+- examples/tumblr_post.rb
 has_rdoc: true
 homepage: http://github.com/twoism/graboid
 licenses: []
@@ -137,3 +139,5 @@ test_files:
 - examples/active_rain_post.rb
 - examples/live_journal_post.rb
 - examples/ning_post.rb
+- examples/reddit_post.rb
+- examples/tumblr_post.rb