RubyGems - wombat - Versions diffs - 0.3.1 → 0.4.0 - Mend

wombat 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

data/Gemfile.lock +30 -22
data/VERSION +1 -1
data/lib/wombat.rb +5 -0
data/lib/wombat/crawler.rb +18 -4
data/lib/wombat/metadata.rb +3 -3
data/lib/wombat/node_selector.rb +1 -1
data/lib/wombat/parser.rb +2 -2
data/lib/wombat/property_container.rb +2 -2
data/lib/wombat/property_locator.rb +2 -2
data/spec/crawler_spec.rb +135 -101
data/spec/integration/integration_spec.rb +59 -0
data/spec/wombat_spec.rb +7 -0
data/wombat.gemspec +3 -2
metadata +3 -2

data/Gemfile.lock CHANGED Viewed

@@ -1,46 +1,54 @@
 GEM
   remote: http://rubygems.org/
   specs:
-    activesupport (3.0.11)
+    activesupport (3.2.3)
+      i18n (~> 0.6)
+      multi_json (~> 1.0)
     diff-lcs (1.1.3)
-    domain_name (0.5.1)
+    domain_name (0.5.3)
       unf (~> 0.0.3)
     fakeweb (1.3.0)
     git (1.2.5)
-    jeweler (1.6.4)
+    i18n (0.6.0)
+    jeweler (1.8.3)
       bundler (~> 1.0)
       git (>= 1.2.5)
       rake
-    mechanize (2.1)
+      rdoc
+    json (1.7.3)
+    mechanize (2.5.1)
       domain_name (~> 0.5, >= 0.5.1)
+      mime-types (~> 1.17, >= 1.17.2)
       net-http-digest_auth (~> 1.1, >= 1.1.1)
-      net-http-persistent (~> 2.3, >= 2.3.2)
+      net-http-persistent (~> 2.5, >= 2.5.2)
       nokogiri (~> 1.4)
       ntlm-http (~> 0.1, >= 0.1.1)
       webrobots (~> 0.0, >= 0.0.9)
-    mime-types (1.17.2)
-    net-http-digest_auth (1.2)
-    net-http-persistent (2.3.3)
-    nokogiri (1.5.0)
+    mime-types (1.18)
+    multi_json (1.3.5)
+    net-http-digest_auth (1.2.1)
+    net-http-persistent (2.6)
+    nokogiri (1.5.2)
     ntlm-http (0.1.1)
     rake (0.9.2.2)
+    rdoc (3.12)
+      json (~> 1.4)
     rest-client (1.6.7)
       mime-types (>= 1.16)
-    rspec (2.7.0)
-      rspec-core (~> 2.7.0)
-      rspec-expectations (~> 2.7.0)
-      rspec-mocks (~> 2.7.0)
-    rspec-core (2.7.1)
-    rspec-expectations (2.7.0)
-      diff-lcs (~> 1.1.2)
-    rspec-mocks (2.7.0)
-    unf (0.0.4)
+    rspec (2.10.0)
+      rspec-core (~> 2.10.0)
+      rspec-expectations (~> 2.10.0)
+      rspec-mocks (~> 2.10.0)
+    rspec-core (2.10.1)
+    rspec-expectations (2.10.0)
+      diff-lcs (~> 1.1.3)
+    rspec-mocks (2.10.1)
+    unf (0.0.5)
       unf_ext
     unf_ext (0.0.4)
-    vcr (2.0.0)
-    webrobots (0.0.12)
-      nokogiri (>= 1.4.4)
-    yard (0.7.4)
+    vcr (2.1.1)
+    webrobots (0.0.13)
+    yard (0.8.1)
 PLATFORMS
   ruby

data/VERSION CHANGED Viewed

@@ -1 +1 @@
-0.3.1
+0.4.0

data/lib/wombat.rb CHANGED Viewed

@@ -3,4 +3,9 @@
 require 'wombat/crawler'
 module Wombat
+	def self.crawl(&block)
+		klass = Class.new
+		klass.send(:include, Wombat::Crawler)
+		klass.new.crawl(&block)
+	end
 end

data/lib/wombat/crawler.rb CHANGED Viewed

@@ -14,27 +14,41 @@ module Wombat
       if block
         @metadata_dup = self.class.send(:metadata).clone
         instance_eval do
+          alias :old_method_missing :method_missing
           def method_missing method, *args, &block
             @metadata_dup.send method, *args, &block
           end
         end
         self.instance_eval &block
-        parse @metadata_dup
+        parsed = parse @metadata_dup
+        instance_eval do
+          alias :method_missing :old_method_missing
+          remove_instance_variable :@metadata_dup
+        end
+        parsed
       else
         parse self.class.send(:metadata)
       end
     end
+    def method_missing(method, *args, &block)
+      self.class.send method, *args, &block
+    end
+    def for_each(selector, &block)
+      self.class.for_each selector, &block
+    end
     module ClassMethods
-      def method_missing method, *args, &block
+      def method_missing(method, *args, &block)
         metadata.send method, *args, &block
       end
-      def for_each selector, &block
+      def for_each(selector, &block)
         metadata.for_each(selector).instance_eval(&block) if block
       end
-      def follow_links selector
+      def follow_links(selector)
       end

data/lib/wombat/metadata.rb CHANGED Viewed

@@ -9,15 +9,15 @@ module Wombat
       super
     end
-    def base_url url
+    def base_url(url)
       self[:base_url] = url
     end
-    def list_page url
+    def list_page(url)
       self[:list_page] = url
     end
-    def format format
+    def format(format)
       self[:format] = format
     end
   end

data/lib/wombat/node_selector.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Wombat
   module NodeSelector
-    def select_nodes selector, namespaces = nil
+    def select_nodes(selector, namespaces = nil)
       return [selector.to_s] if selector.is_a? Symbol
       return context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
       return context.css selector[4..-1] if selector.start_with? "css="

data/lib/wombat/parser.rb CHANGED Viewed

@@ -12,7 +12,7 @@ module Wombat
       @mechanize = Mechanize.new
     end
-    def parse metadata
+    def parse(metadata)
       self.context = parser_for metadata
       original_context = self.context
@@ -31,7 +31,7 @@ module Wombat
     end
     private
-    def parser_for metadata
+    def parser_for(metadata)
       url = "#{metadata[:base_url]}#{metadata[:list_page]}"
       if metadata[:format] == :html

data/lib/wombat/property_container.rb CHANGED Viewed

@@ -8,7 +8,7 @@ module Wombat
       @iterators = []
     end
-    def method_missing method, *args, &block
+    def method_missing(method, *args, &block)
       if args.empty? && block
         self["#{method.to_s}"] = PropertyContainer.new unless self["#{method.to_s}"]
         block.call(self["#{method.to_s}"])
@@ -61,7 +61,7 @@ module Wombat
       properties.merge iters
     end
-    def for_each selector
+    def for_each(selector)
       Iterator.new(selector).tap do |i|
         iterators << i
       end

data/lib/wombat/property_locator.rb CHANGED Viewed

@@ -5,13 +5,13 @@ module Wombat
   module PropertyLocator
     include NodeSelector
-    def locate property
+    def locate(property)
       props = _locate property
       property.format != :list ? props.first : props
     end
     private
-    def _locate property
+    def _locate(property)
       result = select_nodes(property.selector, property.namespaces).to_a
       result.map! {|r| r.inner_html.strip } if property.format == :html
       result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)

data/spec/crawler_spec.rb CHANGED Viewed

@@ -1,145 +1,179 @@
 require 'spec_helper'
 describe Wombat::Crawler do
-  before(:each) do
-    @crawler = Class.new
-    @crawler.send(:include, Wombat::Crawler)
-    @crawler_instance = @crawler.new
-  end
+  describe '#crawl' do
+    before(:each) do
+      @crawler = Class.new
+      @crawler.send(:include, Wombat::Crawler)
+      @crawler_instance = @crawler.new
+    end
-  it 'should call the provided block' do
-    event_called = false
+    it 'should call the provided block' do
+      event_called = false
-    @crawler.event { event_called = true }
+      @crawler.event { event_called = true }
-    event_called.should be_true
-  end
+      event_called.should be_true
+    end
-  it 'should provide metadata to yielded block' do
-    @crawler.event do |e|
-      e.should_not be_nil
+    it 'should provide metadata to yielded block' do
+      @crawler.event do |e|
+        e.should_not be_nil
+      end
     end
-  end
-  it 'should store assigned metadata information' do
-    time = Time.now
+    it 'should store assigned metadata information' do
+      time = Time.now
-    @crawler.event do |e|
-      e.title 'Fulltronic Dezembro'
-      e.time Time.now
-    end
+      @crawler.event do |e|
+        e.title 'Fulltronic Dezembro'
+        e.time Time.now
+      end
-    @crawler.venue { |v| v.name "Scooba" }
-    @crawler.location { |v| v.latitude -50.2323 }
+      @crawler.venue { |v| v.name "Scooba" }
+      @crawler.location { |v| v.latitude -50.2323 }
-    @crawler_instance.should_receive(:parse) do |arg|
-      arg["event"]["title"].selector.should == "Fulltronic Dezembro"
-      arg["event"]["time"].selector.to_s.should == time.to_s
-      arg["venue"]["name"].selector.should == "Scooba"
-      arg["location"]["latitude"].selector.should == -50.2323
+      @crawler_instance.should_receive(:parse) do |arg|
+        arg["event"]["title"].selector.should == "Fulltronic Dezembro"
+        arg["event"]["time"].selector.to_s.should == time.to_s
+        arg["venue"]["name"].selector.should == "Scooba"
+        arg["location"]["latitude"].selector.should == -50.2323
+      end
+      @crawler_instance.crawl
     end
-    @crawler_instance.crawl
-  end
+    it 'should isolate metadata between different instances' do
+      another_crawler = Class.new
+      another_crawler.send(:include, Wombat::Crawler)
+      another_crawler_instance = another_crawler.new
-  it 'should isolate metadata between different instances' do
-    another_crawler = Class.new
-    another_crawler.send(:include, Wombat::Crawler)
-    another_crawler_instance = another_crawler.new
+      another_crawler.event { |e| e.title 'Ibiza' }
+      another_crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Ibiza" }
+      another_crawler_instance.crawl
-    another_crawler.event { |e| e.title 'Ibiza' }
-    another_crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Ibiza" }
-    another_crawler_instance.crawl
+      @crawler.event { |e| e.title 'Fulltronic Dezembro' }
+      @crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Fulltronic Dezembro" }
+      @crawler_instance.crawl
+    end
-    @crawler.event { |e| e.title 'Fulltronic Dezembro' }
-    @crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Fulltronic Dezembro" }
-    @crawler_instance.crawl
-  end
+    it 'should be able to assign arbitrary plain text metadata' do
+      @crawler.some_data("/event/list", :html, "geo") { |p| true }
-  it 'should be able to assign arbitrary plain text metadata' do
-    @crawler.some_data("/event/list", :html, "geo") { |p| true }
+      @crawler_instance.should_receive(:parse) do |arg|
+        prop = arg['some_data']
+        prop.name.should == "some_data"
+        prop.selector.should == "/event/list"
+        prop.format.should == :html
+        prop.namespaces.should == "geo"
+        prop.callback.should_not be_nil
+      end
-    @crawler_instance.should_receive(:parse) do |arg|
-      prop = arg['some_data']
-      prop.name.should == "some_data"
-      prop.selector.should == "/event/list"
-      prop.format.should == :html
-      prop.namespaces.should == "geo"
-      prop.callback.should_not be_nil
+      @crawler_instance.crawl
     end
-    @crawler_instance.crawl
-  end
+    it 'should be able to specify arbitrary block structure more than once' do
+      @crawler.structure do |s|
+        s.data "xpath=/xyz"
+      end
-  it 'should be able to specify arbitrary block structure more than once' do
-    @crawler.structure do |s|
-      s.data "xpath=/xyz"
-    end
+      @crawler.structure do |s|
+        s.another "css=.information"
+      end
-    @crawler.structure do |s|
-      s.another "css=.information"
-    end
+      @crawler_instance.should_receive(:parse) do |arg|
+        arg["structure"]["data"].selector.should == "xpath=/xyz"
+        arg["structure"]["another"].selector.should == "css=.information"
+      end
-    @crawler_instance.should_receive(:parse) do |arg|
-      arg["structure"]["data"].selector.should == "xpath=/xyz"
-      arg["structure"]["another"].selector.should == "css=.information"
+      @crawler_instance.crawl
     end
-    @crawler_instance.crawl
-  end
+    it 'should not explode if no block given' do
+      @crawler.event
+    end
-  it 'should not explode if no block given' do
-    @crawler.event
-  end
+    it 'should iterate on elements inside for_each block' do
+      @crawler.for_each "css=.element" do
+        title "css=.title"
+        body "css=.body"
+        event do |e|
+          e.all "yeah"
+        end
+      end
-  it 'should iterate on elements inside for_each block' do
-    @crawler.for_each "css=.element" do
-      title "css=.title"
-      body "css=.body"
-      event do |e|
-        e.all "yeah"
+      @crawler_instance.should_receive(:parse) do |arg|
+        it = arg.iterators.first
+        it.selector.should == "css=.element"
+        it["title"].selector.should == "css=.title"
+        it["body"].selector.should == "css=.body"
+        it["event"]["all"].selector.should == "yeah"
       end
+      @crawler_instance.crawl
     end
-    @crawler_instance.should_receive(:parse) do |arg|
-      it = arg.iterators.first
-      it.selector.should == "css=.element"
-      it["title"].selector.should == "css=.title"
-      it["body"].selector.should == "css=.body"
-      it["event"]["all"].selector.should == "yeah"
+    it 'should assign metadata format' do
+      @crawler_instance.should_receive(:parse) do |arg|
+        arg[:format].should == :xml
+      end
+      @crawler.format :xml
+      @crawler_instance.crawl
     end
-    @crawler_instance.crawl
-  end
+    it 'should crawl with block' do
+      @crawler.base_url "danielnc.com"
+      @crawler.list_page "/itens"
-  it 'should assign metadata format' do
-    @crawler_instance.should_receive(:parse) do |arg|
-      arg[:format].should == :xml
-    end
-    @crawler.format :xml
-    @crawler_instance.crawl
-  end
+      @crawler_instance.should_receive(:parse) do |arg|
+        arg[:base_url].should == "danielnc.com"
+        arg[:list_page].should == "/itens/1"
+      end
-  it 'should crawl with block' do
-    @crawler.base_url "danielnc.com"
-    @crawler.list_page "/itens"
+      @crawler_instance.crawl do
+        list_page "/itens/1"
+      end
-    @crawler_instance.should_receive(:parse) do |arg|
-      arg[:base_url].should == "danielnc.com"
-      arg[:list_page].should == "/itens/1"
-    end
+      another_instance = @crawler.new
-    @crawler_instance.crawl do
-      list_page "/itens/1"
+      another_instance.should_receive(:parse) do |arg|
+        arg[:base_url].should == "danielnc.com"
+        arg[:list_page].should == "/itens"
+      end
+      another_instance.crawl
     end
+    it 'should remove created method missing' do
+      @crawler.base_url "danielnc.com"
+      @crawler.list_page "/itens"
+      @crawler_instance.should_receive(:parse) do |arg|
+        arg[:base_url].should == "danielnc.com"
+        arg[:list_page].should == "/itens/1"
+      end
-    another_instance = @crawler.new
+      @crawler_instance.crawl do
+        list_page "/itens/1"
+      end
-    another_instance.should_receive(:parse) do |arg|
-      arg[:base_url].should == "danielnc.com"
-      arg[:list_page].should == "/itens"
+      lambda { @craler_intance.undefined_method }.should raise_error(NoMethodError)
     end
-    another_instance.crawl
+    it 'should remove created instance variable' do
+      @crawler.base_url "danielnc.com"
+      @crawler.list_page "/itens"
+      @crawler_instance.should_receive(:parse) do |arg|
+        arg[:base_url].should == "danielnc.com"
+        arg[:list_page].should == "/itens/1"
+      end
+      @crawler_instance.crawl do
+        list_page "/itens/1"
+      end
+      @crawler_instance.instance_variables.index(:@metadata_dup).should be_nil
+    end
   end
 end

data/spec/integration/integration_spec.rb CHANGED Viewed

@@ -34,6 +34,65 @@ describe 'basic crawler setup' do
     end
   end
+  it 'should crawl page through block to class instance crawl method' do
+    VCR.use_cassette('basic_crawler_page') do
+      crawler = Class.new
+      crawler.send(:include, Wombat::Crawler)
+      crawler_instance = crawler.new
+      results = crawler_instance.crawl do
+        base_url "http://www.terra.com.br"
+        list_page '/portal'
+        search "css=.btn-search"
+        social do |s|
+          s.twitter "css=.ctn-bar li.last"
+        end
+        for_each "css=.ctn-links" do
+          menu "css=a"
+        end
+        subheader "css=h2.ttl-dynamic" do |h|
+          h.gsub("London", "Londres")
+        end
+      end
+      results["search"].should == "Buscar"
+      results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
+      results["subheader"].should == "Londres 2012"
+      results["social"]["twitter"].should == "Verão"
+    end
+  end
+  it 'should crawl page through static crawl method' do
+    VCR.use_cassette('basic_crawler_page') do
+      results = Wombat.crawl do
+        base_url "http://www.terra.com.br"
+        list_page '/portal'
+        search "css=.btn-search"
+        social do |s|
+          s.twitter "css=.ctn-bar li.last"
+        end
+        for_each "css=.ctn-links" do
+          menu "css=a"
+        end
+        subheader "css=h2.ttl-dynamic" do |h|
+          h.gsub("London", "Londres")
+        end
+      end
+      results["search"].should == "Buscar"
+      results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
+      results["subheader"].should == "Londres 2012"
+      results["social"]["twitter"].should == "Verão"
+    end
+  end
   it 'should iterate elements' do
     VCR.use_cassette('for_each_page') do
       crawler = Class.new

data/spec/wombat_spec.rb ADDED Viewed

@@ -0,0 +1,7 @@
+require 'spec_helper'
+describe Wombat do
+	it 'should provide syntactic sugar method Wombat.crawl' do
+		Wombat.should respond_to(:crawl)
+	end
+end

data/wombat.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = "wombat"
-  s.version = "0.3.1"
+  s.version = "0.4.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Felipe Lima"]
-  s.date = "2012-04-12"
+  s.date = "2012-05-25"
   s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
   s.email = "felipe.lima@gmail.com"
   s.extra_rdoc_files = [
@@ -50,6 +50,7 @@ Gem::Specification.new do |s|
     "spec/property_spec.rb",
     "spec/sample_crawler_spec.rb",
     "spec/spec_helper.rb",
+    "spec/wombat_spec.rb",
     "wombat.gemspec"
   ]
   s.homepage = "http://github.com/felipecsl/wombat"

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wombat
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.4.0
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-12 00:00:00.000000000 Z
+date: 2012-05-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -212,6 +212,7 @@ files:
 - spec/property_spec.rb
 - spec/sample_crawler_spec.rb
 - spec/spec_helper.rb
+- spec/wombat_spec.rb
 - wombat.gemspec
 homepage: http://github.com/felipecsl/wombat
 licenses: