RubyGems - wombat - Versions diffs - 2.0.0 → 2.0.1 - Mend

wombat 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

data/README.md +5 -9
data/Rakefile +1 -1
data/VERSION +1 -1
data/examples/iterator.rb +46 -0
data/examples/list.rb +44 -0
data/examples/no_class.rb +41 -0
data/examples/xml.rb +38 -0
data/lib/wombat/processing/parser.rb +4 -0
data/lib/wombat/property/locators/html.rb +15 -9
data/spec/integration/integration_spec.rb +7 -8
data/spec/property/locators/html_spec.rb +11 -0
data/wombat.gemspec +7 -3
metadata +7 -3

data/README.md CHANGED

@@ -19,10 +19,6 @@ Obs: Requires ruby 1.9
 The simplest way to use Wombat is by calling ``Wombat.crawl`` and passing it a block:
 ```ruby
-# => github_scraper.rb
-#coding: utf-8
 require 'wombat'
 Wombat.crawl do
@@ -30,8 +26,8 @@ Wombat.crawl do
   path "/"
   headline "xpath=//h1"
   what_is "css=.column.secondary p", :html
+  repositories "css=a.repo", :list
   explore "xpath=//ul/li[2]/a" do |e|
     e.gsub(/Explore/, "LOVE")
@@ -39,7 +35,7 @@ Wombat.crawl do
   benefits do
     first_benefit "css=.column.leftmost h3"
-    second_benefir "css=.column.leftmid h3"
+    second_benefit "css=.column.leftmid h3"
     third_benefit "css=.column.rightmid h3"
     fourth_benefit "css=.column.rightmost h3"
   end
@@ -53,6 +49,7 @@ end
   "headline" => "1,316,633 people hosting over 3,951,378 git repositories",
   "what_is" => "GitHub is the best way to collaborate with others.  Fork, send pull requests and manage all your <strong>public</strong> and <strong>private</strong> git repositories.",
   "explore" => "LOVE GitHub",
+  "repositories" => ["jQuery", "reddit", "Sparkle", "curl", "Ruby on Rails", "node.js", "ClickToFlash", "Erlang/OTP", "CakePHP", "Redis"]
   "benefits" => {
     "first_benefit"  => "Team management",
     "second_benefit" => "Code review",
@@ -63,7 +60,7 @@ end
 ```
 ### This is just a sneak peek of what Wombat can do. For the complete documentation, please check the [project Wiki](http://github.com/felipecsl/wombat/wiki).
-### [API Documentation](http://rubydoc.info/gems/wombat/1.0.0/frames)
+### [API Documentation](http://rubydoc.info/gems/wombat/2.0.0/frames)
 ### [Changelog](https://github.com/felipecsl/wombat/wiki/Changelog)
@@ -80,8 +77,7 @@ end
 ## Contributors
  * Felipe Lima ([@felipecsl](https://github.com/felipecsl))
- * Daniel Naves de Carvalho ([@danielnc](https://github.com/danielnc))
- * [@sigi](https://github.com/sigi)
+ * [List of all contributors](https://github.com/felipecsl/wombat/wiki/Contributors)
 ## Copyright

data/Rakefile CHANGED

@@ -10,7 +10,7 @@ require 'yard'
 Jeweler::Tasks.new do |gem|
   # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
   gem.name = "wombat"
-  gem.homepage = "http://github.com/felipecsl/wombat"
+  gem.homepage = "http://felipecsl.github.com/wombat"
   gem.license = "MIT"
   gem.summary = %Q{Ruby DSL to scrape web pages}
   gem.description = %Q{Generic Web crawler with a DSL that parses structured data from web pages}

data/VERSION CHANGED

@@ -1 +1 @@
-2.0.0
+2.0.1

data/examples/iterator.rb ADDED

@@ -0,0 +1,46 @@
+#coding: utf-8
+require 'wombat'
+class IteratorCrawler
+  include Wombat::Crawler
+  base_url "https://www.github.com"
+  path "/explore"
+  repos "css=ol.ranked-repositories>li", :iterator do
+    repo 'css=h3'
+    description 'css=p.description'
+  end
+end
+=begin
+p IteratorCrawler.new.crawl
+{"repos"=>
+  [
+  	{
+  		"repo"=>"bernii / gauge.js",
+    	"description"=>"100% native and cool looking JavaScript gauge"
+  	},
+   	{
+   		"repo"=>"ZeitOnline / briefkasten",
+    	"description"=>"a reasonably secure web application for submitting content anonymously"
+  	},
+   	{
+   		"repo"=>"nothingmagical / cheddar-ios",
+   		"description"=>"Cheddar for iOS"
+ 		},
+   	{
+   		"repo"=>"nathanmarz / storm-mesos",
+    	"description"=>"Run Storm on top of the Mesos cluster resource manager"
+  	},
+   	{
+   		"repo"=>"Netflix / SimianArmy",
+    	"description"=>"Tools for keeping your cloud operating in top form. Chaos Monkey is a resiliency tool that helps ..."
+    },
+   	{
+   		"repo"=>nil,
+   		"description"=>nil
+   	}
+	]
+}
+=end

data/examples/list.rb ADDED

@@ -0,0 +1,44 @@
+#coding: utf-8
+require 'wombat'
+class ListCrawler
+  include Wombat::Crawler
+  base_url "http://www.rubygems.org"
+  path "/"
+  gems do
+    new "css=#new_gems li", :list
+    most_downloaded "css=#most_downloaded li", :list
+    just_updated "css=#just_updated li", :list
+  end
+end
+=begin
+pp ListCrawler.new.crawl
+{
+	"gems"=>{
+		"new"=>[
+			"buffer (0.0.1)",
+     	"resque-telework (0.2.0)",
+			"my_string_extend_lyk (0.0.1)",
+			"specr (0.0.1)",
+			"array-frequency (1.0.0)"
+		],
+   	"most_downloaded"=> [
+   		"rake-0.9.2.2 (7,128)",
+     	"mime-types-1.19 (5,331)",
+     	"tilt-1.3.3 (5,146)",
+     	"rack-1.4.1 (5,124)",
+     	"multi_json-1.3.6 (5,093)"
+   	],
+   	"just_updated"=>[
+   		"wombat (2.0.0)",
+     	"pdf-reader-turtletext (0.2.1)",
+     	"minitest-reporters (0.10.0)",
+     	"cloudprint (0.1.3)",
+     	"greenletters (0.2.0)"
+    ]
+  }
+}
+=end

data/examples/no_class.rb ADDED

@@ -0,0 +1,41 @@
+#coding: utf-8
+require 'wombat'
+data = Wombat.crawl do
+  base_url "http://www.github.com"
+  path "/"
+  headline "xpath=//h1"
+  what_is "css=.column.secondary p", :html
+  explore "xpath=//ul/li[2]/a" do |e|
+    e.gsub(/Explore/, "LOVE")
+  end
+  benefits do
+    team_mgmt "css=.column.leftmost h3"
+    code_review "css=.column.leftmid h3"
+    hosting "css=.column.rightmid h3"
+    collaboration "css=.column.rightmost h3"
+    links do
+      team_mgmt "xpath=//div[@class='column leftmost']//a/@href"
+    end
+  end
+end
+=begin
+pp data
+{
+  "headline"=>"1,900,094\n        people hosting over\n        3,371,168\n        repositories",
+  "what_is"=>"GitHub is the best way to collaborate with others.  Fork, send pull requests and manage all your <strong>public</strong> and <strong>private</strong> git repositories.",
+  "explore"=>"LOVE GitHub",
+  "benefits"=> {
+    "team_mgmt"=>"Team management",
+    "code_review"=>"Code review",
+    "hosting"=>"Reliable code hosting",
+    "collaboration"=>"Open source collaboration",
+    "links"=>{"team_mgmt"=>"/features/projects/collaboration"}
+  }
+}
+=end

data/examples/xml.rb ADDED

@@ -0,0 +1,38 @@
+#coding: utf-8
+require 'wombat'
+class XmlCrawler
+  include Wombat::Crawler
+  base_url "http://ws.audioscrobbler.com"
+  path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=ENV['API_KEY']"
+  document_format :xml
+  title "xpath=//event/title"
+  locations 'xpath=//event', :iterator do
+    latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
+    longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
+  end
+end
+=begin
+pp XmlCrawler.new.crawl
+{
+	"title"=>"Sinéad O'Connor",
+ 	"locations"=>[
+ 		{"latitude"=>"37.807717", "longitude"=>"-122.270059"},
+   	{"latitude"=>"37.76213", "longitude"=>"-122.419032"},
+   	{"latitude"=>"37.771491", "longitude"=>"-122.413241"},
+   	{"latitude"=>"37.776227", "longitude"=>"-122.42044"},
+   	{"latitude"=>"37.766588", "longitude"=>"-122.430391"},
+   	{"latitude"=>"37.788978", "longitude"=>"-122.40664"},
+   	{"latitude"=>"37.769715", "longitude"=>"-122.420427"},
+   	{"latitude"=>"37.78832", "longitude"=>"-122.446692"},
+   	{"latitude"=>"37.787583", "longitude"=>"-122.421665"},
+   	{"latitude"=>"37.776227", "longitude"=>"-122.42044"}
+	]
+}
+=end

data/lib/wombat/processing/parser.rb CHANGED

@@ -13,6 +13,10 @@ module Wombat
         @mechanize = Mechanize.new
       end
+      def set_proxy(host, port)
+      	@mechanize.set_proxy host, port
+      end
       def parse(metadata)
         @context = parser_for metadata

data/lib/wombat/property/locators/html.rb CHANGED

@@ -1,14 +1,20 @@
 #coding: utf-8
 module Wombat
-	module Property
-	  module Locators
-	    class Html < Base
-	    	def locate(context, page = nil)
-	    		node = locate_nodes(context).first
-	    		super { node.inner_html.strip }
-	    	end
-	    end
-	  end
+  module Property
+    module Locators
+      class Html < Base
+        def locate(context, page = nil)
+          node = locate_nodes(context).first
+          value =
+            unless node
+              nil
+            else
+              node.inner_html.strip
+            end
+          super { value }
+        end
+      end
+    end
   end
 end

data/spec/integration/integration_spec.rb CHANGED

@@ -189,7 +189,6 @@ describe 'basic crawler setup' do
       crawler = Class.new
       crawler.send(:include, Wombat::Crawler)
-      crawler.document_format :html
       crawler.base_url "https://www.github.com"
       crawler.path "/"
@@ -202,13 +201,13 @@ describe 'basic crawler setup' do
       results.should == {
         "github" => [
-          { "heading"=>"GitHub helps people build software together."},
-          { "heading"=>nil},
-          { "heading"=>"Features"},
-          { "heading"=>"Contact GitHub"},
-          { "heading"=>"GitHub Training — Git Training from the Experts"},
-          { "heading"=>"GitHub on Your Servers"},
-          { "heading"=>"Loading..."}
+          { "heading"=>"GitHub helps people build software together." },
+          { "heading"=>nil },
+          { "heading"=>"Features" },
+          { "heading"=>"Contact GitHub" },
+          { "heading"=>"GitHub Training — Git Training from the Experts" },
+          { "heading"=>"GitHub on Your Servers" },
+          { "heading"=>"Loading..." }
         ]
       }
     end

data/spec/property/locators/html_spec.rb CHANGED

@@ -12,4 +12,15 @@ describe Wombat::Property::Locators::Html do
 		locator.locate(context).should == { "data1" => "Something cool" }
 	end
+	it 'should return null if the property cannot be found' do
+		fake_elem = double :element
+		context   = double :context
+		context.stub(:xpath).with("/abc", nil).and_return []
+		property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :html)
+		locator = Wombat::Property::Locators::Html.new(property)
+		locator.locate(context).should == { "data1" => nil }
+	end
 end

data/wombat.gemspec CHANGED

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = "wombat"
-  s.version = "2.0.0"
+  s.version = "2.0.1"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Felipe Lima"]
-  s.date = "2012-07-31"
+  s.date = "2012-09-06"
   s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
   s.email = "felipe.lima@gmail.com"
   s.extra_rdoc_files = [
@@ -27,6 +27,10 @@ Gem::Specification.new do |s|
     "README.md",
     "Rakefile",
     "VERSION",
+    "examples/iterator.rb",
+    "examples/list.rb",
+    "examples/no_class.rb",
+    "examples/xml.rb",
     "fixtures/vcr_cassettes/basic_crawler_page.yml",
     "fixtures/vcr_cassettes/broken_selector.yml",
     "fixtures/vcr_cassettes/error_page.yml",
@@ -66,7 +70,7 @@ Gem::Specification.new do |s|
     "spec/wombat_spec.rb",
     "wombat.gemspec"
   ]
-  s.homepage = "http://github.com/felipecsl/wombat"
+  s.homepage = "http://felipecsl.github.com/wombat"
   s.licenses = ["MIT"]
   s.require_paths = ["lib"]
   s.required_ruby_version = Gem::Requirement.new(">= 1.9")

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wombat
 version: !ruby/object:Gem::Version
-  version: 2.0.0
+  version: 2.0.1
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-31 00:00:00.000000000 Z
+date: 2012-09-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -189,6 +189,10 @@ files:
 - README.md
 - Rakefile
 - VERSION
+- examples/iterator.rb
+- examples/list.rb
+- examples/no_class.rb
+- examples/xml.rb
 - fixtures/vcr_cassettes/basic_crawler_page.yml
 - fixtures/vcr_cassettes/broken_selector.yml
 - fixtures/vcr_cassettes/error_page.yml
@@ -227,7 +231,7 @@ files:
 - spec/spec_helper.rb
 - spec/wombat_spec.rb
 - wombat.gemspec
-homepage: http://github.com/felipecsl/wombat
+homepage: http://felipecsl.github.com/wombat
 licenses:
 - MIT
 post_install_message: