wombat 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -19,10 +19,6 @@ Obs: Requires ruby 1.9
19
19
  The simplest way to use Wombat is by calling ``Wombat.crawl`` and passing it a block:
20
20
 
21
21
  ```ruby
22
-
23
- # => github_scraper.rb
24
-
25
- #coding: utf-8
26
22
  require 'wombat'
27
23
 
28
24
  Wombat.crawl do
@@ -30,8 +26,8 @@ Wombat.crawl do
30
26
  path "/"
31
27
 
32
28
  headline "xpath=//h1"
33
-
34
29
  what_is "css=.column.secondary p", :html
30
+ repositories "css=a.repo", :list
35
31
 
36
32
  explore "xpath=//ul/li[2]/a" do |e|
37
33
  e.gsub(/Explore/, "LOVE")
@@ -39,7 +35,7 @@ Wombat.crawl do
39
35
 
40
36
  benefits do
41
37
  first_benefit "css=.column.leftmost h3"
42
- second_benefir "css=.column.leftmid h3"
38
+ second_benefit "css=.column.leftmid h3"
43
39
  third_benefit "css=.column.rightmid h3"
44
40
  fourth_benefit "css=.column.rightmost h3"
45
41
  end
@@ -53,6 +49,7 @@ end
53
49
  "headline" => "1,316,633 people hosting over 3,951,378 git repositories",
54
50
  "what_is" => "GitHub is the best way to collaborate with others. Fork, send pull requests and manage all your <strong>public</strong> and <strong>private</strong> git repositories.",
55
51
  "explore" => "LOVE GitHub",
52
+ "repositories" => ["jQuery", "reddit", "Sparkle", "curl", "Ruby on Rails", "node.js", "ClickToFlash", "Erlang/OTP", "CakePHP", "Redis"],
56
53
  "benefits" => {
57
54
  "first_benefit" => "Team management",
58
55
  "second_benefit" => "Code review",
@@ -63,7 +60,7 @@ end
63
60
  ```
64
61
 
65
62
  ### This is just a sneak peek of what Wombat can do. For the complete documentation, please check the [project Wiki](http://github.com/felipecsl/wombat/wiki).
66
- ### [API Documentation](http://rubydoc.info/gems/wombat/1.0.0/frames)
63
+ ### [API Documentation](http://rubydoc.info/gems/wombat/2.0.0/frames)
67
64
  ### [Changelog](https://github.com/felipecsl/wombat/wiki/Changelog)
68
65
 
69
66
 
@@ -80,8 +77,7 @@ end
80
77
  ## Contributors
81
78
 
82
79
  * Felipe Lima ([@felipecsl](https://github.com/felipecsl))
83
- * Daniel Naves de Carvalho ([@danielnc](https://github.com/danielnc))
84
- * [@sigi](https://github.com/sigi)
80
+ * [List of all contributors](https://github.com/felipecsl/wombat/wiki/Contributors)
85
81
 
86
82
  ## Copyright
87
83
 
data/Rakefile CHANGED
@@ -10,7 +10,7 @@ require 'yard'
10
10
  Jeweler::Tasks.new do |gem|
11
11
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
12
12
  gem.name = "wombat"
13
- gem.homepage = "http://github.com/felipecsl/wombat"
13
+ gem.homepage = "http://felipecsl.github.com/wombat"
14
14
  gem.license = "MIT"
15
15
  gem.summary = %Q{Ruby DSL to scrape web pages}
16
16
  gem.description = %Q{Generic Web crawler with a DSL that parses structured data from web pages}
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.0.0
1
+ 2.0.1
@@ -0,0 +1,46 @@
1
+ #coding: utf-8
2
+ require 'wombat'
3
+
4
+ class IteratorCrawler
5
+ include Wombat::Crawler
6
+
7
+ base_url "https://www.github.com"
8
+ path "/explore"
9
+
10
+ repos "css=ol.ranked-repositories>li", :iterator do
11
+ repo 'css=h3'
12
+ description 'css=p.description'
13
+ end
14
+ end
15
+
16
+ =begin
17
+ p IteratorCrawler.new.crawl
18
+ {"repos"=>
19
+ [
20
+ {
21
+ "repo"=>"bernii / gauge.js",
22
+ "description"=>"100% native and cool looking JavaScript gauge"
23
+ },
24
+ {
25
+ "repo"=>"ZeitOnline / briefkasten",
26
+ "description"=>"a reasonably secure web application for submitting content anonymously"
27
+ },
28
+ {
29
+ "repo"=>"nothingmagical / cheddar-ios",
30
+ "description"=>"Cheddar for iOS"
31
+ },
32
+ {
33
+ "repo"=>"nathanmarz / storm-mesos",
34
+ "description"=>"Run Storm on top of the Mesos cluster resource manager"
35
+ },
36
+ {
37
+ "repo"=>"Netflix / SimianArmy",
38
+ "description"=>"Tools for keeping your cloud operating in top form. Chaos Monkey is a resiliency tool that helps ..."
39
+ },
40
+ {
41
+ "repo"=>nil,
42
+ "description"=>nil
43
+ }
44
+ ]
45
+ }
46
+ =end
@@ -0,0 +1,44 @@
1
+ #coding: utf-8
2
+ require 'wombat'
3
+
4
+ class ListCrawler
5
+ include Wombat::Crawler
6
+
7
+ base_url "http://www.rubygems.org"
8
+ path "/"
9
+
10
+ gems do
11
+ new "css=#new_gems li", :list
12
+ most_downloaded "css=#most_downloaded li", :list
13
+ just_updated "css=#just_updated li", :list
14
+ end
15
+ end
16
+
17
+ =begin
18
+ pp ListCrawler.new.crawl
19
+ {
20
+ "gems"=>{
21
+ "new"=>[
22
+ "buffer (0.0.1)",
23
+ "resque-telework (0.2.0)",
24
+ "my_string_extend_lyk (0.0.1)",
25
+ "specr (0.0.1)",
26
+ "array-frequency (1.0.0)"
27
+ ],
28
+ "most_downloaded"=> [
29
+ "rake-0.9.2.2 (7,128)",
30
+ "mime-types-1.19 (5,331)",
31
+ "tilt-1.3.3 (5,146)",
32
+ "rack-1.4.1 (5,124)",
33
+ "multi_json-1.3.6 (5,093)"
34
+ ],
35
+ "just_updated"=>[
36
+ "wombat (2.0.0)",
37
+ "pdf-reader-turtletext (0.2.1)",
38
+ "minitest-reporters (0.10.0)",
39
+ "cloudprint (0.1.3)",
40
+ "greenletters (0.2.0)"
41
+ ]
42
+ }
43
+ }
44
+ =end
@@ -0,0 +1,41 @@
1
+ #coding: utf-8
2
+ require 'wombat'
3
+
4
+ data = Wombat.crawl do
5
+ base_url "http://www.github.com"
6
+ path "/"
7
+
8
+ headline "xpath=//h1"
9
+ what_is "css=.column.secondary p", :html
10
+
11
+ explore "xpath=//ul/li[2]/a" do |e|
12
+ e.gsub(/Explore/, "LOVE")
13
+ end
14
+
15
+ benefits do
16
+ team_mgmt "css=.column.leftmost h3"
17
+ code_review "css=.column.leftmid h3"
18
+ hosting "css=.column.rightmid h3"
19
+ collaboration "css=.column.rightmost h3"
20
+
21
+ links do
22
+ team_mgmt "xpath=//div[@class='column leftmost']//a/@href"
23
+ end
24
+ end
25
+ end
26
+
27
+ =begin
28
+ pp data
29
+ {
30
+ "headline"=>"1,900,094\n people hosting over\n 3,371,168\n repositories",
31
+ "what_is"=>"GitHub is the best way to collaborate with others. Fork, send pull requests and manage all your <strong>public</strong> and <strong>private</strong> git repositories.",
32
+ "explore"=>"LOVE GitHub",
33
+ "benefits"=> {
34
+ "team_mgmt"=>"Team management",
35
+ "code_review"=>"Code review",
36
+ "hosting"=>"Reliable code hosting",
37
+ "collaboration"=>"Open source collaboration",
38
+ "links"=>{"team_mgmt"=>"/features/projects/collaboration"}
39
+ }
40
+ }
41
+ =end
@@ -0,0 +1,38 @@
1
+ #coding: utf-8
2
+ require 'wombat'
3
+
4
+ class XmlCrawler
5
+ include Wombat::Crawler
6
+
7
+ base_url "http://ws.audioscrobbler.com"
8
+ path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=#{ENV['API_KEY']}"
9
+
10
+ document_format :xml
11
+
12
+ title "xpath=//event/title"
13
+
14
+ locations 'xpath=//event', :iterator do
15
+ latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
16
+ longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
17
+ end
18
+ end
19
+
20
+ =begin
21
+ pp XmlCrawler.new.crawl
22
+
23
+ {
24
+ "title"=>"Sinéad O'Connor",
25
+ "locations"=>[
26
+ {"latitude"=>"37.807717", "longitude"=>"-122.270059"},
27
+ {"latitude"=>"37.76213", "longitude"=>"-122.419032"},
28
+ {"latitude"=>"37.771491", "longitude"=>"-122.413241"},
29
+ {"latitude"=>"37.776227", "longitude"=>"-122.42044"},
30
+ {"latitude"=>"37.766588", "longitude"=>"-122.430391"},
31
+ {"latitude"=>"37.788978", "longitude"=>"-122.40664"},
32
+ {"latitude"=>"37.769715", "longitude"=>"-122.420427"},
33
+ {"latitude"=>"37.78832", "longitude"=>"-122.446692"},
34
+ {"latitude"=>"37.787583", "longitude"=>"-122.421665"},
35
+ {"latitude"=>"37.776227", "longitude"=>"-122.42044"}
36
+ ]
37
+ }
38
+ =end
@@ -13,6 +13,10 @@ module Wombat
13
13
  @mechanize = Mechanize.new
14
14
  end
15
15
 
16
+ def set_proxy(host, port)
17
+ @mechanize.set_proxy host, port
18
+ end
19
+
16
20
  def parse(metadata)
17
21
  @context = parser_for metadata
18
22
 
@@ -1,14 +1,20 @@
1
1
  #coding: utf-8
2
2
 
3
3
  module Wombat
4
- module Property
5
- module Locators
6
- class Html < Base
7
- def locate(context, page = nil)
8
- node = locate_nodes(context).first
9
- super { node.inner_html.strip }
10
- end
11
- end
12
- end
4
+ module Property
5
+ module Locators
6
+ class Html < Base
7
+ def locate(context, page = nil)
8
+ node = locate_nodes(context).first
9
+ value =
10
+ unless node
11
+ nil
12
+ else
13
+ node.inner_html.strip
14
+ end
15
+ super { value }
16
+ end
17
+ end
18
+ end
13
19
  end
14
20
  end
@@ -189,7 +189,6 @@ describe 'basic crawler setup' do
189
189
  crawler = Class.new
190
190
  crawler.send(:include, Wombat::Crawler)
191
191
 
192
- crawler.document_format :html
193
192
  crawler.base_url "https://www.github.com"
194
193
  crawler.path "/"
195
194
 
@@ -202,13 +201,13 @@ describe 'basic crawler setup' do
202
201
 
203
202
  results.should == {
204
203
  "github" => [
205
- { "heading"=>"GitHub helps people build software together."},
206
- { "heading"=>nil},
207
- { "heading"=>"Features"},
208
- { "heading"=>"Contact GitHub"},
209
- { "heading"=>"GitHub Training — Git Training from the Experts"},
210
- { "heading"=>"GitHub on Your Servers"},
211
- { "heading"=>"Loading..."}
204
+ { "heading"=>"GitHub helps people build software together." },
205
+ { "heading"=>nil },
206
+ { "heading"=>"Features" },
207
+ { "heading"=>"Contact GitHub" },
208
+ { "heading"=>"GitHub Training — Git Training from the Experts" },
209
+ { "heading"=>"GitHub on Your Servers" },
210
+ { "heading"=>"Loading..." }
212
211
  ]
213
212
  }
214
213
  end
@@ -12,4 +12,15 @@ describe Wombat::Property::Locators::Html do
12
12
 
13
13
  locator.locate(context).should == { "data1" => "Something cool" }
14
14
  end
15
+
16
+ it 'should return nil if the property cannot be found' do
17
+ fake_elem = double :element
18
+ context = double :context
19
+ context.stub(:xpath).with("/abc", nil).and_return []
20
+ property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :html)
21
+
22
+ locator = Wombat::Property::Locators::Html.new(property)
23
+
24
+ locator.locate(context).should == { "data1" => nil }
25
+ end
15
26
  end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "wombat"
8
- s.version = "2.0.0"
8
+ s.version = "2.0.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Felipe Lima"]
12
- s.date = "2012-07-31"
12
+ s.date = "2012-09-06"
13
13
  s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
14
14
  s.email = "felipe.lima@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -27,6 +27,10 @@ Gem::Specification.new do |s|
27
27
  "README.md",
28
28
  "Rakefile",
29
29
  "VERSION",
30
+ "examples/iterator.rb",
31
+ "examples/list.rb",
32
+ "examples/no_class.rb",
33
+ "examples/xml.rb",
30
34
  "fixtures/vcr_cassettes/basic_crawler_page.yml",
31
35
  "fixtures/vcr_cassettes/broken_selector.yml",
32
36
  "fixtures/vcr_cassettes/error_page.yml",
@@ -66,7 +70,7 @@ Gem::Specification.new do |s|
66
70
  "spec/wombat_spec.rb",
67
71
  "wombat.gemspec"
68
72
  ]
69
- s.homepage = "http://github.com/felipecsl/wombat"
73
+ s.homepage = "http://felipecsl.github.com/wombat"
70
74
  s.licenses = ["MIT"]
71
75
  s.require_paths = ["lib"]
72
76
  s.required_ruby_version = Gem::Requirement.new(">= 1.9")
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wombat
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0
4
+ version: 2.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-31 00:00:00.000000000 Z
12
+ date: 2012-09-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
@@ -189,6 +189,10 @@ files:
189
189
  - README.md
190
190
  - Rakefile
191
191
  - VERSION
192
+ - examples/iterator.rb
193
+ - examples/list.rb
194
+ - examples/no_class.rb
195
+ - examples/xml.rb
192
196
  - fixtures/vcr_cassettes/basic_crawler_page.yml
193
197
  - fixtures/vcr_cassettes/broken_selector.yml
194
198
  - fixtures/vcr_cassettes/error_page.yml
@@ -227,7 +231,7 @@ files:
227
231
  - spec/spec_helper.rb
228
232
  - spec/wombat_spec.rb
229
233
  - wombat.gemspec
230
- homepage: http://github.com/felipecsl/wombat
234
+ homepage: http://felipecsl.github.com/wombat
231
235
  licenses:
232
236
  - MIT
233
237
  post_install_message: