RubyGems - klepto - Versions diffs - 0.2.1 → 0.2.2 - Mend

klepto 0.2.1 → 0.2.2

Files changed (14) hide show

data/README.md +23 -17
data/lib/klepto/bot.rb +66 -0
data/lib/klepto/browser.rb +8 -0
data/lib/klepto/config.rb +29 -0
data/lib/klepto/structure.rb +3 -24
data/lib/klepto/version.rb +1 -1
data/lib/klepto.rb +2 -1
data/samples/concept.rb +30 -24
data/spec/lib/klepto/bot_spec.rb +126 -0
data/spec/lib/klepto/config_spec.rb +23 -8
data/spec/lib/klepto/structure_spec.rb +0 -61
data/spec/orm/active_record.rb +10 -0
data/spec/spec_helper.rb +1 -0
metadata +15 -12

data/README.md CHANGED Viewed

@@ -17,7 +17,7 @@ Say you want a bunch of Bieb tweets! How is there not profit in that?
 ```ruby
 # Crawl a web site or multiple. Structure#crawl takes a *splat!
-@structures = Klepto::Structure.crawl("https://twitter.com/justinbieber"){
+@structures = Klepto::Bot.new("https://twitter.com/justinbieber"){
   # By default, it uses CSS selectors
   name      'h1.fullname'
@@ -53,9 +53,29 @@ Say you want a bunch of Bieb tweets! How is there not profit in that?
     permalink '.time a', :css, :attr => :href
   end
+  # Set some headers, why not.
+  config.headers({
+    'Referer'     => 'http://www.twitter.com'
+  })
+  # on_http_status can take a splat of exact statuses (200) or status classes ('4xx', '5xx')
+  #   you can also have multiple handlers on a status
+  #   Note: Capybara automatically follows redirects, so 3xx statuses
+  #   are never present. If you want to watch for a redirect, pass :redirect (see below)
+  config.on_http_status(:redirect){
+    puts "Something redirected..."
+  }
+  config.on_http_status(200){
+    puts "Expected this, NBD."
+  }
+  config.on_http_status('5xx','4xx'){
+    puts "HOLY CRAP!"
+  }
   # If you want to do something with each resource, like stick it in AR
   #   go for it here...
-  after_crawl do |resource|
+  config.after do |resource|
     @user = User.new
     @user.name = resource[:name]
     @user.username = resource[:username]
@@ -79,7 +99,7 @@ end
 ```ruby
 @html = Capybara::Node::Simple.new(@html_string)
 @structure = Klepto::Structure.build(@html){
-  # inside the build method, everything works the same as Structure.crawl
+  # inside the build method, everything works the same as Bot.new
   name      'h1.fullname'
   username  'span.screen-name'
@@ -98,20 +118,6 @@ end
 }
 ```
-## Extra Configuration
-```ruby
-config = {
-  :headers => {
-    'Referer'     => 'http://www.twitter.com',
-    'X-Sup-Dawg'  => "Yo, What's up?"
-  }
-}
-@structures = Klepto::Structure.crawl("https://twitter.com/justinbieber",config){
-  #... yada, yada
-}
-```
 ## Stuff I'm going to add.
 Cookie Stuffing

data/lib/klepto/bot.rb ADDED Viewed

@@ -0,0 +1,66 @@
+module Klepto
+  class Bot
+    attr_reader :config
+    def initialize(*urls, &block)
+      @config = Klepto::Config.new
+      @config.urls urls
+      @queue  = []
+      instance_eval &block
+      instance_eval <<-EOS
+def queue; @queue; end;
+def resources; @resources; end;
+EOS
+      __process!
+    end
+    def __dispatch_handlers_for(status_code)
+      if status_code.is_a?(Fixnum)
+      elsif status_code.is_a?(Symbol)
+      elsif status_code.is_a?(String)
+      end
+    end
+    def __process!
+      @resources = []
+      config.urls.each do |url|
+        browser   = Klepto::Browser.new
+        browser.set_headers config.headers
+        browser.fetch! url
+        statuses = [browser.status, browser.statusx]
+        statuses.push :redirect if url != browser.page.current_url
+        statuses.each do |status|
+          config.dispatch_status_handlers(status, browser.page)
+        end
+        structure = Structure.new(browser.page)
+        queue.each do |instruction|
+          if instruction[2]
+            structure.send instruction[0], *instruction[1], &instruction[2]
+          else
+            structure.send instruction[0], *instruction[1]
+          end
+        end
+        config.after_handlers[:each].each do |ah|
+          ah.call(structure._hash)
+        end
+        resources << structure._hash
+      end
+      @resources
+    end
+    def method_missing(meth, *args, &block)
+      @queue.push([meth, args, block])
+    end
+  end
+end

data/lib/klepto/browser.rb CHANGED Viewed

@@ -9,6 +9,14 @@ module Klepto
     def set_headers(headers)
       page.driver.headers = headers
     end
+    def status
+      page.status_code
+    end
+    def statusx
+      page.status_code.to_s[0..-3] + "xx"
+    end
     def fetch!(url)
       visit url

data/lib/klepto/config.rb CHANGED Viewed

@@ -1,8 +1,13 @@
 module Klepto
   class Config
+    attr_reader :after_handlers
     def initialize
       @headers = {}
       @urls    = []
+      @after_handlers   = {:each => []}
+      @before_handlers  = {:each => []}
+      @status_handlers  = {}
     end
     def headers(_headers=nil)
@@ -10,8 +15,32 @@ module Klepto
       @headers
     end
+    def on_http_status(*statuses,&block)
+      statuses.each do |status|
+        @status_handlers[status] ||= []
+        @status_handlers[status].push block
+      end
+    end
+    def dispatch_status_handlers(status, page)
+      handlers = @status_handlers[status]
+      if handlers.present?
+        @status_handlers[status].each do |handler|
+          handler.call(page)
+        end
+      end
+    end
+    def after(which = :each, &block)
+      @after_handlers[which] ||= []
+      @after_handlers[which].push block
+    end
     def url(*args)
       @urls += args
+      @urls.flatten!
+      @urls.uniq!
+      @urls
     end
     alias :urls :url
   end

data/lib/klepto/structure.rb CHANGED Viewed

@@ -1,31 +1,14 @@
 module Klepto
   class Structure
    def self.build(_context=nil, _parent=nil, &block)
-      hb = Structure.new(_context, _parent)
-      hb.instance_eval &block
-      if hb._after_handler
-        hb._after_handler.call(hb._hash)
-      end
-      hb._hash
-    end
-    def self.crawl(*urls, &block)
-      config = urls.last.is_a?(Hash) ? urls.pop : {}
-      resources = []
-      urls.each do |url|
-        browser   = Klepto::Browser.new
-        browser.set_headers config[:headers]
-        browser.fetch! url
-        resources << Structure.build(browser.page, &block)
-      end
-      resources
+      structure = Structure.new(_context, _parent)
+      structure.instance_eval &block
+      structure._hash
     end
     attr_reader :_parent
     attr_reader :_hash
     attr_reader :_context
-    attr_reader :_after_handler
     def initialize(_context=nil, _parent=nil)
       @_context  = _context
@@ -34,10 +17,6 @@ module Klepto
       @_after_handler = nil
     end
-    def after_crawl(&block)
-      @_after_handler = block
-    end
     #options[:as]     :collection, :resource
     #options[:match]  :first, :all
     #options[:syntax] :xpath, :css

data/lib/klepto/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Klepto
-  VERSION = "0.2.1"
+  VERSION = "0.2.2"
 end

data/lib/klepto.rb CHANGED Viewed

@@ -21,4 +21,5 @@ end
 require 'klepto/version'
 require 'klepto/config'
 require 'klepto/browser'
-require 'klepto/structure'
+require 'klepto/structure'
+require 'klepto/bot'

data/samples/concept.rb CHANGED Viewed

@@ -2,30 +2,41 @@
 require 'bundler/setup'
 require 'klepto'
-@structure = Klepto::Structure.crawl('https://twitter.com/justinbieber')
+Klepto::Bot.new do
   config.headers 'Referer' => 'http://www.twitter.com'
+  config.on_http_status('5xx','4xx'){
+    puts "HOLY CRAP!"
+  }
+  # If you want to do something with each resource, like stick it in AR
+  #   go for it here...
+  config.after do |resource|
+    @user = User.new
+    @user.name = resource[:name]
+    @user.username = resource[:username]
+    @user.save
+    resource[:tweets].each do |tweet|
+      Tweet.create(tweet)
+    end
+  end
-  config.steps [
-    [:GET, 'https://twitter.com/login'],
-    [:POST,'https://twitter.com/sessions',
-      {
-        session: {
-          username_or_email: 'example',
-          password:'123456'
-        }
-      }
-    ]
-  ]
   config.urls 'https://twitter.com/justinbieber',
               'https://twitter.com/ladygaga'
-  # config.cookies 'jsession' => 'abcdefg1234567890'
-  # config.on_http_status(500,404){}
-  # assertions do
-  # end
-  # config.on_failed_assertion(){}
+  # config.steps [
+  #   [:GET, 'https://twitter.com/login'],
+  #   [:POST,'https://twitter.com/sessions',
+  #     {
+  #       session: {
+  #         username_or_email: 'example',
+  #         password:'123456'
+  #       }
+  #     }
+  #   ]
+  # ]
-  # Structur the content
+  # Structure the content
   name      'h1.fullname'
   username  '.username span.screen-name'
   links     'span.url a', :list, :attr => 'href'
@@ -42,9 +53,4 @@ require 'klepto'
     permalink '.time a', :css, :attr => :href
   end
-end
-# @resources = @structure.parse! #=> Array[Hash]
-# @resources.each do |resource|
-#   User.create(resource)
-# end
+end

data/spec/lib/klepto/bot_spec.rb ADDED Viewed

@@ -0,0 +1,126 @@
+require 'spec_helper'
+describe Klepto::Bot do
+  describe 'Klepto::Bot.new' do
+    describe 'create a bot with a redirect' do
+      before(:each) do
+        @bot = Klepto::Bot.new("https://www.twitter.com/justinbieber"){
+          name      'h1.fullname'
+          config.on_http_status(:redirect){
+            StatusLog.create message: 'redirect'
+          }
+          config.on_http_status(200){
+            StatusLog.create message: '200'
+          }
+        }
+        @structure = @bot.resources
+      end
+      it 'should structure the data' do
+        @structure.first[:name].should match(/Justin/i)
+      end
+      it 'should have dispatched status handlers' do
+        statuses = StatusLog.all.map(&:message)
+        statuses.should include 'redirect'
+        statuses.should include '200'
+      end
+    end
+    describe 'crawling multiple pages' do
+      before(:each) do
+        @bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
+          config.urls "https://twitter.com/ladygaga"
+          name 'h1.fullname'
+        }
+        @structure = @bot.resources
+      end
+      it 'should have both pages data' do
+        @structure.first[:name].should match(/Justin/i)
+        @structure.last[:name].should match(/Lady/i)
+      end
+    end
+    describe 'creating a bot' do
+      before(:each) do
+        @bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
+          config.headers({
+            'Referer'     => 'http://www.twitter.com',
+            'X-Sup-Dawg'  => "Yo, What's up?"
+          })
+          # Structure that stuff
+          name      'h1.fullname'
+          username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
+          tweet_ids 'li.stream-item', :match => :all, :attr => 'data-item-id'
+          links 'span.url a', :match => :all do |node|
+            node[:href]
+          end
+          last_tweet 'li.stream-item', :as => :resource do
+            twitter_id do |node|
+              node['data-item-id']
+            end
+            content '.content p'
+            timestamp '._timestamp', :attr => 'data-time'
+            permalink '.time a', :attr => :href
+          end
+          tweets    'li.stream-item', :as => :collection do
+            twitter_id do |node|
+              node['data-item-id']
+            end
+            tweet '.content p', :css
+            timestamp '._timestamp', :attr => 'data-time'
+            permalink '.time a', :css, :attr => :href
+          end
+          config.on_http_status('2xx'){
+            StatusLog.create message: '2xx'
+          }
+          config.on_http_status(:redirect){
+            StatusLog.create message: 'redirect'
+          }
+          config.on_http_status(200){
+            StatusLog.create message: '200'
+          }
+          config.after(:each) do |resource|
+            @user = User.new
+            @user.name = resource[:name]
+            @user.username = resource[:username]
+            @user.save
+            resource[:tweets].each do |tweet|
+              Tweet.create(tweet)
+            end
+          end
+        }
+        @structure = @bot.resources
+      end
+      it 'should structure the data' do
+        @structure.first[:name].should match(/Justin/i)
+        @structure.first[:links].should == ["http://www.youtube.com/justinbieber"]
+        @structure.first[:username].should eq '@justinbieber'
+        @structure.first[:last_tweet][:twitter_id].should == @structure.first[:tweets].first[:twitter_id]
+      end
+      it 'should store the data' do
+        User.count.should be(1)
+        Tweet.count.should_not be(0)
+      end
+      it 'should have dispatched status handlers' do
+        statuses = StatusLog.all.map(&:message)
+        statuses.should_not include 'redirect'
+        statuses.should include '200'
+        statuses.should include '2xx'
+      end
+    end
+  end
+end

data/spec/lib/klepto/config_spec.rb CHANGED Viewed

@@ -5,26 +5,41 @@ describe Klepto::Config do
     @config = Klepto::Config.new
     @config.headers({'Referer' => 'http://example.com'})
     @config.urls 'http://example.com', 'http://www.iana.org'
+    @config.on_http_status(200){
+      "Its 200"
+    }
+    @config.on_http_status('2xx'){
+      "Its 2xx"
+    }
+    @config.on_http_status('5xx','4xx'){
+      "Its crazy."
+    }
   end
   it 'should be able to set headers' do
     @config.headers['Referer'].should eq('http://example.com')
   end
+  it 'should have a 2xx status handler' do
+    @config.instance_variable_get("@status_handlers")['2xx'].first.call.should eq ('Its 2xx')
+  end
+  it 'should have a 200 status handler' do
+    @config.instance_variable_get("@status_handlers")[200].first.call.should eq ('Its 200')
+  end
+  it 'should have a 4xx and 5xx status handler' do
+    @config.instance_variable_get("@status_handlers")['5xx'].first.call.should eq ('Its crazy.')
+    @config.instance_variable_get("@status_handlers")['4xx'].first.call.should eq ('Its crazy.')
+  end
   it 'should be able to set URLs' do
     @config.urls.should == ['http://example.com', 'http://www.iana.org']
   end
+  pending 'should be able to set before handlers'
   pending 'should be able to set cookies'
   pending 'should be able to set steps'
   pending 'should be able to set assertions'
-  pending 'should be able to set on_http_status handler'
   pending 'should be able to set on_failed_assertion handler'
-  pending 'should be a sexier config' do
-    # Klepto::Structure.crawl("https://twitter.com/justinbieber"){
-    #   config.headers({
-    #     "Referer" => "http://example.com"
-    #   })
-    # }
-  end
 end

data/spec/lib/klepto/structure_spec.rb CHANGED Viewed

@@ -41,65 +41,4 @@ describe Klepto::Structure do
       @structure[:last_tweet][:twitter_id].should == @structure[:tweets].first[:twitter_id]
     end
   end
-  describe 'Klepto::Structure.crawl' do
-    before(:each) do
-      config = {
-        :headers => {
-          'Referer'     => 'http://www.twitter.com',
-          'X-Sup-Dawg'  => "Yo, What's up?"
-        }
-      }
-      @structure = Klepto::Structure.crawl("https://twitter.com/justinbieber", config){
-        # Structure that stuff
-        name      'h1.fullname'
-        username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
-        tweet_ids 'li.stream-item', :match => :all, :attr => 'data-item-id'
-        links 'span.url a', :match => :all do |node|
-          node[:href]
-        end
-        last_tweet 'li.stream-item', :as => :resource do
-          twitter_id do |node|
-            node['data-item-id']
-          end
-          content '.content p'
-          timestamp '._timestamp', :attr => 'data-time'
-          permalink '.time a', :attr => :href
-        end
-        tweets    'li.stream-item', :as => :collection do
-          twitter_id do |node|
-            node['data-item-id']
-          end
-          tweet '.content p', :css
-          timestamp '._timestamp', :attr => 'data-time'
-          permalink '.time a', :css, :attr => :href
-        end
-        after_crawl do |resource|
-          @user = User.new
-          @user.name = resource[:name]
-          @user.username = resource[:username]
-          @user.save
-          resource[:tweets].each do |tweet|
-            Tweet.create(tweet)
-          end
-        end
-      }
-    end
-    it 'should structure the data' do
-      @structure.first[:name].should match(/Justin/i)
-      @structure.first[:links].should == ["http://www.youtube.com/justinbieber"]
-      @structure.first[:username].should eq '@justinbieber'
-      @structure.first[:last_tweet][:twitter_id].should == @structure.first[:tweets].first[:twitter_id]
-    end
-    it 'should store the data' do
-      User.count.should be(1)
-      Tweet.count.should_not be(0)
-    end
-  end
 end

data/spec/orm/active_record.rb CHANGED Viewed

@@ -19,14 +19,24 @@ class TestMigration < ActiveRecord::Migration
       t.string :name
       t.string :username
     end
+    create_table :status_logs, :force => true do |t|
+      t.string :message
+    end
   end
   def self.down
     drop_table :tweets
+    drop_table :status_logs
     drop_table :users
   end
 end
+class StatusLog < ActiveRecord::Base
+end
 class Tweet < ActiveRecord::Base
   validates_presence_of :timestamp, :twitter_id, :permalink, :tweet
 end

data/spec/spec_helper.rb CHANGED Viewed

@@ -25,6 +25,7 @@ RSpec.configure do |config|
   config.after(:each){
     User.delete_all
     Tweet.delete_all
+    StatusLog.delete_all
   }
   config.after(:all) { TestMigration.down }
   config.treat_symbols_as_metadata_keys_with_true_values = true

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: klepto
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.2
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-04-18 00:00:00.000000000 Z
+date: 2013-04-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: poltergeist
-  requirement: &70139987462180 !ruby/object:Gem::Requirement
+  requirement: &70235243869100 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - =
@@ -21,10 +21,10 @@ dependencies:
         version: 1.1.0
   type: :runtime
   prerelease: false
-  version_requirements: *70139987462180
+  version_requirements: *70235243869100
 - !ruby/object:Gem::Dependency
   name: capybara
-  requirement: &70139987460880 !ruby/object:Gem::Requirement
+  requirement: &70235243868100 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - =
@@ -32,10 +32,10 @@ dependencies:
         version: 2.0.2
   type: :runtime
   prerelease: false
-  version_requirements: *70139987460880
+  version_requirements: *70235243868100
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &70139987459780 !ruby/object:Gem::Requirement
+  requirement: &70235243867320 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -43,10 +43,10 @@ dependencies:
         version: 1.5.6
   type: :runtime
   prerelease: false
-  version_requirements: *70139987459780
+  version_requirements: *70235243867320
 - !ruby/object:Gem::Dependency
   name: activesupport
-  requirement: &70139987458980 !ruby/object:Gem::Requirement
+  requirement: &70235243866680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70139987458980
+  version_requirements: *70235243866680
 - !ruby/object:Gem::Dependency
   name: multi_json
-  requirement: &70139987456360 !ruby/object:Gem::Requirement
+  requirement: &70235243865240 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -65,7 +65,7 @@ dependencies:
         version: '1.0'
   type: :runtime
   prerelease: false
-  version_requirements: *70139987456360
+  version_requirements: *70235243865240
 description: Tearing up web pages into ActiveRecord resources
 email:
 - github@coryodaniel.com
@@ -83,6 +83,7 @@ files:
 - Rakefile
 - klepto.gemspec
 - lib/klepto.rb
+- lib/klepto/bot.rb
 - lib/klepto/browser.rb
 - lib/klepto/config.rb
 - lib/klepto/structure.rb
@@ -97,6 +98,7 @@ files:
 - spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml
 - spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml
 - spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml
+- spec/lib/klepto/bot_spec.rb
 - spec/lib/klepto/browser_spec.rb
 - spec/lib/klepto/config_spec.rb
 - spec/lib/klepto/structure_spec.rb
@@ -136,6 +138,7 @@ test_files:
 - spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml
 - spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml
 - spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml
+- spec/lib/klepto/bot_spec.rb
 - spec/lib/klepto/browser_spec.rb
 - spec/lib/klepto/config_spec.rb
 - spec/lib/klepto/structure_spec.rb