klepto 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +120 -89
- data/lib/klepto.rb +2 -2
- data/lib/klepto/config.rb +18 -0
- data/lib/klepto/structure.rb +88 -0
- data/lib/klepto/version.rb +1 -1
- data/samples/bieber.html +4859 -0
- data/samples/concept.rb +50 -0
- data/spec/lib/klepto/config_spec.rb +23 -0
- data/spec/lib/klepto/structure_spec.rb +105 -0
- data/spec/lib/klepto_spec.rb +1 -58
- data/spec/orm/active_record.rb +2 -2
- data/spec/spec_helper.rb +1 -1
- metadata +22 -23
- data/lib/klepto/bot.rb +0 -59
- data/lib/klepto/crawler.rb +0 -72
- data/samples/example.rb +0 -49
- data/spec/lib/klepto/bot_spec.rb +0 -40
- data/spec/lib/klepto/crawler_spec.rb +0 -88
- data/spec/lib/klepto/dsl_spec.rb +0 -6
data/lib/klepto/crawler.rb
DELETED
@@ -1,72 +0,0 @@
-require 'docile'
-module Klepto
-  class Crawler
-
-    def initialize(scope,options={},&block)
-      @resources = []
-      @limit = options[:limit]
-      @skip = options[:skip]
-      @syntax = options[:syntax]
-      @scope = scope
-      @designations = []
-
-      Docile.dsl_eval(self, &block) if block_given?
-    end
-    attr_accessor :resources
-    attr_reader :scope, :syntax
-
-    def scrape(selector=nil, assignee=nil, &block)
-      raise Exception if assignee.nil? && !block_given?
-      raise Exception if !assignee.nil? && block_given?
-      designate(:first, selector, assignee, &block)
-    end
-
-    def scrape_all(selector, assignee=nil, &block)
-      raise Exception if assignee.nil? && !block_given?
-      raise Exception if !assignee.nil? && block_given?
-      designate(:all, selector, assignee, &block)
-    end
-
-    def save(&block)
-      @resource_handler = block
-    end
-
-    def crawl(page)
-      page.all(syntax, scope).each do |selection|
-        params = {}
-        @designations.each do |first_or_all, selector, assignee, handler|
-          if selector.nil?
-            attribs = handler.call selection
-            params.merge!( attribs )
-          elsif first_or_all == :first
-            node = selection.first(syntax, selector)
-            if assignee
-              params[assignee] = node.try(:text)
-            else
-              attribs = handler.call node
-              params.merge!( attribs )
-            end
-          else
-            nodes = selection.all(syntax, selector)
-            attribs = handler.call nodes
-            params.merge!( attribs )
-          end
-        end
-        @resources << params
-      end
-
-      @resources
-    end
-
-    def persist!
-      if @resource_handler
-        @resources.each {|resource| @resource_handler.call(resource)}
-      end
-    end
-
-    protected
-    def designate(count, selector, assignee, &block)
-      @designations << [count, selector, assignee, block]
-    end
-  end
-end
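For context on the removed API: Crawler#crawl built one params hash per node matching the crawler's scope, and persist! then fed each hash to the save block. A minimal sketch of how it was driven, assuming a Capybara-style page object that answers the #all/#first calls above (the selectors are placeholders; the spec files below show the same pattern against live pages):

    require 'klepto'

    # Sketch only: scope the crawler to each matching list item.
    crawler = Klepto::Crawler.new 'li.item', :syntax => :css do
      scrape '.content p', :content       # first match, assigned via symbol
      scrape '.time a' do |node|          # first match, handled by a block
        { permalink: node[:href] }
      end
      save { |params| puts params.inspect }
    end

    crawler.crawl(page)   # => [{:content => '...', :permalink => '...'}, ...]
    crawler.persist!      # invokes the save block once per collected hash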
data/samples/example.rb
DELETED
@@ -1,49 +0,0 @@
-#! /usr/bin/env ruby
-require 'bundler/setup'
-require 'klepto'
-
-@bot = Klepto::Bot.new do
-  syntax :css
-  dry_run!
-
-  headers({
-    'Referer' => 'http://www.twitter.com',
-    'User-Agent' => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
-
-  })
-
-  # Lootin' them bieb tweets
-  urls 'https://twitter.com/justinbieber'
-
-  crawl 'body' do
-    scrape "h1.fullname", :name
-    scrape '.username span.screen-name', :username
-    save do |params|
-      user = User.find_by_name(params[:username]) || User.new
-      user.update_attributes params
-    end
-  end
-
-  crawl 'li.stream-item' do
-    scrape do |node|
-      {:twitter_id => node['data-item-id']}
-    end
-
-    scrape '.content p', :content
-
-    scrape '._timestamp' do |node|
-      {timestamp: node['data-time']}
-    end
-
-    scrape '.time a' do |node|
-      {permalink: node[:href]}
-    end
-
-    save do |params|
-      tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
-      tweet.update_attributes params
-    end
-  end
-end
-
-@bot.start!
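The sample script assumes two ActiveRecord-style models that the gem never defines; roughly this shape (a hypothetical sketch — the dynamic finders and update_attributes calls above imply Rails 3-era ActiveRecord):

    require 'active_record'

    # Hypothetical models implied by the save blocks above; not part of klepto.
    class User < ActiveRecord::Base
      # expects name and username columns
    end

    class Tweet < ActiveRecord::Base
      # expects twitter_id, content, timestamp and permalink columns
    end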
data/spec/lib/klepto/bot_spec.rb
DELETED
@@ -1,40 +0,0 @@
-require 'spec_helper'
-
-describe Klepto::Bot, :vcr => {:record => :new_episodes} do
-  before(:each) do
-    @bot = Klepto::Bot.new
-  end
-
-  it 'should know if it is a dry run' do
-    @bot.dry_run?.should be false
-    @bot.dry_run!
-    @bot.dry_run?.should be true
-  end
-
-  it 'should be able to set the selection syntax' do
-    @bot.syntax(:xpath)
-    @bot.syntax.should be(:xpath)
-  end
-
-  it 'should be able to read the selection syntax' do
-    @bot.syntax.should be(:css)
-  end
-
-  it 'should be able to set request headers' do
-    @bot.should respond_to(:headers)
-  end
-
-  it 'should be able to set a list of URLs to crawl' do
-    @bot.url 'http://www.google.com'
-    @bot.urls.should include('http://www.google.com')
-    @bot.urls 'http://twitter.com', 'http://facebook.com'
-    @bot.urls.should include('http://twitter.com')
-    @bot.urls.should include('http://facebook.com')
-  end
-
-  it 'should be able to add crawlers' do
-    @bot.crawl('div'){}
-    @bot.instance_variable_get("@crawlers").should have(1).crawler
-    @bot.instance_variable_get("@crawlers").first.should be_kind_of(Klepto::Crawler)
-  end
-end
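The deleted lib/klepto/bot.rb is listed in the summary above (+0 -59) but its body is not shown here; the spec above does pin down its surface, though. A minimal sketch of that interface — an illustration of what the examples exercise, not the removed file's actual contents:

    require 'docile'
    require 'klepto/crawler'

    module Klepto
      class Bot
        def initialize(&block)
          @crawlers = []
          @urls     = []
          @syntax   = :css     # spec: default syntax is :css
          @dry_run  = false    # spec: not a dry run until dry_run! is called
          Docile.dsl_eval(self, &block) if block_given?
        end

        def dry_run!; @dry_run = true; end
        def dry_run?; @dry_run;        end

        def syntax(value = nil)
          @syntax = value if value
          @syntax
        end

        def headers(hash = {})
          @headers = hash
        end

        def urls(*list)
          @urls.concat(list) unless list.empty?
          @urls
        end
        alias_method :url, :urls

        def crawl(scope, &block)
          @crawlers << Klepto::Crawler.new(scope, {:syntax => @syntax}, &block)
        end
      end
    end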
data/spec/lib/klepto/crawler_spec.rb
DELETED
@@ -1,88 +0,0 @@
-require 'spec_helper'
-require 'open-uri'
-
-describe Klepto::Crawler, :vcr => {:record => :new_episodes} do
-  describe 'dsl interaction' do
-    before(:each) do
-      @page = page("http://www.iana.org")
-      @crawler = Klepto::Crawler.new('body',{:syntax => :css}) do
-        scrape 'h1', :title
-
-        scrape '#intro p' do |node|
-          {description: node.text}
-        end
-
-        scrape_all '.home-panel h2' do |nodes|
-          { sections: nodes.map{|n| n.text} }
-        end
-      end
-      @resources = @crawler.crawl @page
-    end #end before
-
-    it 'should crawl the resource' do
-      @resources.should have(1).resource
-      @resources.first[:title].should match('Internet Assigned Numbers Authority')
-      @resources.first[:description].should match(/^The Internet Assigned Numbers Authority/i)
-      @resources.first[:sections].should have(3).sections
-    end
-  end
-
-  describe 'standard interaction' do
-    before(:each) do
-      @page = page()
-      @crawler = Klepto::Crawler.new 'body', {:syntax => :css}
-    end
-    it 'should have a CSS scope' do
-      @crawler.scope.should eq 'body'
-    end
-    it 'should have a desired syntax' do
-      @crawler.syntax.should == :css
-    end
-
-    it 'should be able to scrape the node that the crawler is scoped to' do
-      @crawler.scrape do |node|
-        {:name => node.native.name}
-      end
-      resources = @crawler.crawl( @page )
-      resources.should have(1).resource
-      resources.first[:name].should eq('body')
-    end
-
-    it 'should be able to designate scraping of a single node with a symbol' do
-      @crawler.scrape 'h1', :title
-      resources = @crawler.crawl( @page )
-      resources.should have(1).resource
-      resources.first[:title].should eq('Example Domain')
-    end
-
-    it 'should be able to designate scraping of a single node with a block' do
-      @crawler.scrape 'h1' do |node|
-        {title: node.text}
-      end
-
-      resources = @crawler.crawl( @page )
-      resources.should have(1).resource
-      resources.first[:title].should eq('Example Domain')
-    end
-
-    it 'should be able to designate scraping of a set of nodes' do
-      @crawler.scrape_all 'p' do |nodes|
-        {
-          paragraphs: [
-            nodes.first.text,
-            nodes.last.text
-          ]
-        }
-      end
-      resources = @crawler.crawl( @page )
-      resources.should have(1).resource
-      resources.first[:paragraphs].should be_kind_of(Array)
-      resources.first[:paragraphs].last.should eq("More information...")
-    end
-
-    pending 'should be able to save a set of resources'
-    pending 'should be able to specify a limit'
-    pending 'should be able to specify a skip'
-  end
-
-end
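Both spec files call a page helper that lives in spec/spec_helper.rb (touched but not shown in this diff). Something along these lines would satisfy the calls above — an assumption, not the actual helper: Capybara.string wraps a fetched document in a static node that answers the #all/#first calls Crawler#crawl makes, and the http://example.com default lines up with the 'Example Domain' expectations:

    require 'capybara'
    require 'open-uri'

    # Assumed helper, not the real spec_helper.rb: returns a static
    # Capybara node so specs can crawl without driving a browser.
    def page(url = 'http://example.com')
      Capybara.string open(url).read
    end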