wombat 2.1.1 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,7 +1,8 @@
1
1
  # Wombat
2
2
 
3
- [![CI Build Status](https://secure.travis-ci.org/felipecsl/wombat.png?branch=master)][travis] [![Dependency Status](https://gemnasium.com/felipecsl/wombat.png?travis)][gemnasium] [![Code Climate](https://codeclimate.com/badge.png)][codeclimate]
3
+ [![Gem Version](https://badge.fury.io/rb/wombat.png)][rubygems] [![CI Build Status](https://secure.travis-ci.org/felipecsl/wombat.png?branch=master)][travis] [![Dependency Status](https://gemnasium.com/felipecsl/wombat.png?travis)][gemnasium] [![Code Climate](https://codeclimate.com/github/felipecsl/wombat.png)][codeclimate]
4
4
 
5
+ [rubygems]: http://rubygems.org/gems/wombat
5
6
  [travis]: http://travis-ci.org/felipecsl/wombat
6
7
  [gemnasium]: https://gemnasium.com/felipecsl/wombat
7
8
  [codeclimate]: https://codeclimate.com/github/felipecsl/wombat
@@ -25,19 +26,19 @@ Wombat.crawl do
25
26
  base_url "http://www.github.com"
26
27
  path "/"
27
28
 
28
- headline "xpath=//h1"
29
- subheading "css=p.subheading"
29
+ headline xpath: "//h1"
30
+ subheading css: "p.subheading"
30
31
 
31
- what_is "css=.teaser h3", :list
32
+ what_is({ css: ".teaser h3" }, :list)
32
33
 
33
34
  links do
34
- explore 'xpath=//*[@id="wrapper"]/div[1]/div/ul/li[1]/a' do |e|
35
+ explore xpath: '//*[@id="wrapper"]/div[1]/div/ul/li[1]/a' do |e|
35
36
  e.gsub(/Explore/, "Love")
36
37
  end
37
38
 
38
- search 'css=.search'
39
- features 'css=.features'
40
- blog 'css=.blog'
39
+ search css: '.search'
40
+ features css: '.features'
41
+ blog css: '.blog'
41
42
  end
42
43
  end
43
44
  ```
@@ -62,10 +63,11 @@ end
62
63
  }
63
64
  ```
64
65
 
65
- ### This is just a sneak peek of what Wombat can do. For the complete documentation, please check the [project Wiki](http://github.com/felipecsl/wombat/wiki).
66
- ### [API Documentation](http://rubydoc.info/gems/wombat/2.0.0/frames)
67
- ### [Changelog](https://github.com/felipecsl/wombat/wiki/Changelog)
66
+ ### This is just a sneak peek of what Wombat can do. For the complete documentation, please check the links below:
68
67
 
68
+ ### [Wiki](http://github.com/felipecsl/wombat/wiki)
69
+ ### [API Documentation](http://rubydoc.info/gems/wombat/2.1.2/frames)
70
+ ### [Changelog](https://github.com/felipecsl/wombat/wiki/Changelog)
69
71
 
70
72
  ## Contributing to Wombat
71
73
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.1.1
1
+ 2.1.2
@@ -0,0 +1,43 @@
1
+ #coding: utf-8
2
+ require 'wombat'
3
+
4
+ data = Wombat.crawl do
5
+ base_url "http://www.github.com"
6
+ path "/"
7
+
8
+ headline xpath: "//h1"
9
+ subheading css: "p.subheading"
10
+
11
+ what_is({ css: ".teaser h3" }, :list)
12
+
13
+ links do
14
+ explore xpath: '//*[@id="wrapper"]/div[1]/div/ul/li[1]/a' do |e|
15
+ e.gsub(/Explore/, "Love")
16
+ end
17
+
18
+ search css: '.search'
19
+ features css: '.features'
20
+ blog css: '.blog'
21
+ end
22
+ end
23
+
24
+ pp data
25
+
26
+ =begin
27
+ pp data
28
+ {
29
+ "headline"=>"Build software better, together.",
30
+ "subheading"=>
31
+ "Powerful collaboration, review, and code management for open source and private development projects.",
32
+ "what_is"=>
33
+ ["Great collaboration starts with communication.",
34
+ "Manage and contribute from all your devices.",
35
+ "The world’s largest open source community."],
36
+ "links"=>
37
+ {"explore"=>"Love GitHub",
38
+ "search"=>"Search",
39
+ "features"=>"Features",
40
+ "blog"=>"Blog"
41
+ }
42
+ }
43
+ =end
data/lib/wombat.rb CHANGED
@@ -4,12 +4,23 @@ require 'wombat/crawler'
4
4
 
5
5
  module Wombat
6
6
  class << self
7
+
8
+ attr_reader :proxy_args
9
+
7
10
  def crawl(&block)
8
11
  klass = Class.new
9
12
  klass.send(:include, Wombat::Crawler)
10
13
  klass.new.crawl(&block)
11
14
  end
12
-
15
+
16
+ def configure
17
+ yield self
18
+ end
19
+
20
+ def set_proxy(*args)
21
+ @proxy_args = args
22
+ end
23
+
13
24
  alias_method :scrape, :crawl
14
25
  end
15
26
  end
@@ -5,12 +5,12 @@ module Wombat
5
5
 
6
6
  def initialize(name, selector)
7
7
  @wombat_property_selector = selector
8
-
8
+
9
9
  super(name)
10
10
  end
11
11
 
12
12
  # So that Property::Locators::Iterator can identify this class
13
- # as an iterator property.
13
+ # as a follow property.
14
14
  def wombat_property_format
15
15
  :follow
16
16
  end
@@ -1,12 +1,23 @@
1
1
  module Wombat
2
- module Processing
3
- module NodeSelector
4
- def select_nodes(selector, namespaces = nil)
5
- return [selector.to_s] if selector.is_a? Symbol
6
- return @context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
7
- return @context.css selector[4..-1] if selector.start_with? "css="
8
- [selector]
9
- end
10
- end
2
+ module Processing
3
+ module NodeSelector
4
+ def select_nodes(selector, namespaces = nil)
5
+ return [selector.to_s] if selector.is_a? Symbol
6
+
7
+ if selector.is_a? String
8
+ return @context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
9
+ return @context.css selector[4..-1] if selector.start_with? "css="
10
+ end
11
+
12
+ if selector.is_a? Hash
13
+ method = selector.flatten.first
14
+ selector = selector.flatten.last
15
+ return @context.xpath selector, namespaces if method == :xpath
16
+ return @context.css selector if method == :css
17
+ end
18
+
19
+ [selector]
20
+ end
21
+ end
11
22
  end
12
23
  end
@@ -19,10 +19,7 @@ module Wombat
19
19
 
20
20
  def initialize
21
21
  @mechanize = Mechanize.new
22
- end
23
-
24
- def set_proxy(host, port)
25
- @mechanize.set_proxy host, port
22
+ @mechanize.set_proxy(*Wombat.proxy_args) if Wombat.proxy_args
26
23
  end
27
24
 
28
25
  def parse(metadata)
@@ -6,7 +6,7 @@ describe 'basic crawler setup' do
6
6
  VCR.use_cassette('basic_crawler_page') do
7
7
  crawler = Class.new
8
8
  crawler.send(:include, Wombat::Crawler)
9
-
9
+
10
10
  crawler.base_url "http://www.terra.com.br"
11
11
  crawler.path '/portal'
12
12
 
@@ -32,6 +32,36 @@ describe 'basic crawler setup' do
32
32
  end
33
33
  end
34
34
 
35
+ it 'should support hash based selectors' do
36
+ VCR.use_cassette('basic_crawler_page') do
37
+ crawler = Class.new
38
+ crawler.send(:include, Wombat::Crawler)
39
+
40
+ crawler.base_url "http://www.terra.com.br"
41
+ crawler.path '/portal'
42
+
43
+ crawler.search css: ".btn-search"
44
+ crawler.social do
45
+ twitter css: ".ctn-bar li.last"
46
+ end
47
+ crawler.links({css: ".ctn-links"}, :iterator) do
48
+ menu css: "a"
49
+ end
50
+ crawler.subheader css: "h2.ttl-dynamic" do |h|
51
+ h.gsub("London", "Londres")
52
+ end
53
+
54
+ crawler_instance = crawler.new
55
+
56
+ results = crawler_instance.crawl
57
+
58
+ results["search"].should == "Buscar"
59
+ results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
60
+ results["subheader"].should == "Londres 2012"
61
+ results["social"]["twitter"].should == "Verão"
62
+ end
63
+ end
64
+
35
65
  it 'should clear iterators between multiple runs' do
36
66
  crawler = Class.new
37
67
  crawler.send(:include, Wombat::Crawler)
@@ -56,7 +86,7 @@ describe 'basic crawler setup' do
56
86
  VCR.use_cassette('basic_crawler_page') do
57
87
  results = crawler_instance.crawl
58
88
  end
59
-
89
+
60
90
  results["links"].should == result_hash
61
91
  end
62
92
 
@@ -123,7 +153,7 @@ describe 'basic crawler setup' do
123
153
  VCR.use_cassette('for_each_page') do
124
154
  crawler = Class.new
125
155
  crawler.send(:include, Wombat::Crawler)
126
-
156
+
127
157
  crawler.base_url "https://www.github.com"
128
158
  crawler.path "/explore"
129
159
 
@@ -151,13 +181,13 @@ describe 'basic crawler setup' do
151
181
  VCR.use_cassette('xml_with_namespace') do
152
182
  crawler = Class.new
153
183
  crawler.send(:include, Wombat::Crawler)
154
-
184
+
155
185
  crawler.document_format :xml
156
186
  crawler.base_url "http://ws.audioscrobbler.com"
157
187
  crawler.path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"
158
188
 
159
189
  crawler.artist "xpath=//title", :list
160
-
190
+
161
191
  crawler.location 'xpath=//event', :iterator do
162
192
  latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
163
193
  longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
@@ -168,18 +198,18 @@ describe 'basic crawler setup' do
168
198
  iterator = results['location']
169
199
 
170
200
  iterator.should == [
171
- {"latitude"=>"37.807775", "longitude"=>"-122.272736"},
172
- {"latitude"=>"37.807717", "longitude"=>"-122.270059"},
173
- {"latitude"=>"37.869784", "longitude"=>"-122.267701"},
174
- {"latitude"=>"37.870873", "longitude"=>"-122.269313"},
175
- {"latitude"=>"37.782348", "longitude"=>"-122.408059"},
176
- {"latitude"=>"37.775529", "longitude"=>"-122.437757"},
177
- {"latitude"=>"37.771079", "longitude"=>"-122.412604"},
178
- {"latitude"=>"37.771079", "longitude"=>"-122.412604"},
179
- {"latitude"=>"37.784963", "longitude"=>"-122.418871"},
201
+ {"latitude"=>"37.807775", "longitude"=>"-122.272736"},
202
+ {"latitude"=>"37.807717", "longitude"=>"-122.270059"},
203
+ {"latitude"=>"37.869784", "longitude"=>"-122.267701"},
204
+ {"latitude"=>"37.870873", "longitude"=>"-122.269313"},
205
+ {"latitude"=>"37.782348", "longitude"=>"-122.408059"},
206
+ {"latitude"=>"37.775529", "longitude"=>"-122.437757"},
207
+ {"latitude"=>"37.771079", "longitude"=>"-122.412604"},
208
+ {"latitude"=>"37.771079", "longitude"=>"-122.412604"},
209
+ {"latitude"=>"37.784963", "longitude"=>"-122.418871"},
180
210
  {"latitude"=>"37.788978", "longitude"=>"-122.40664"}
181
211
  ]
182
-
212
+
183
213
  results["artist"].should =~ ["Davka", "Digitalism (DJ Set)", "Gary Clark Jr.", "Lenny Kravitz", "Little Muddy", "Michael Schenker Group", "The Asteroids Galaxy Tour", "When Indie Attacks", "When Indie Attacks", "YOB"]
184
214
  end
185
215
  end
@@ -188,7 +218,7 @@ describe 'basic crawler setup' do
188
218
  VCR.use_cassette('follow_links') do
189
219
  crawler = Class.new
190
220
  crawler.send(:include, Wombat::Crawler)
191
-
221
+
192
222
  crawler.base_url "https://www.github.com"
193
223
  crawler.path "/"
194
224
 
@@ -199,7 +229,7 @@ describe 'basic crawler setup' do
199
229
  crawler_instance = crawler.new
200
230
  results = crawler_instance.crawl
201
231
 
202
- results.should == {
232
+ results.should == {
203
233
  "github" => [
204
234
  { "heading"=>"GitHub helps people build software together." },
205
235
  { "heading"=>nil },
data/spec/wombat_spec.rb CHANGED
@@ -17,9 +17,16 @@ describe Wombat do
17
17
  Wombat.scrape
18
18
  end
19
19
 
20
+ it 'should provide configuration method with block' do
21
+ Wombat.configure do |config|
22
+ config.set_proxy "10.0.0.1", 8080
23
+ end
24
+ Wombat.proxy_args.should == ["10.0.0.1", 8080]
25
+ end
26
+
20
27
  it 'should accept regular properties (non-selectors)' do
21
28
  VCR.use_cassette('broken_selector') do
22
- lambda {
29
+ lambda {
23
30
  Wombat.crawl do
24
31
  base_url "http://www.github.com"
25
32
  path "/"
data/wombat.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "wombat"
8
- s.version = "2.1.1"
8
+ s.version = "2.1.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Felipe Lima"]
@@ -27,6 +27,7 @@ Gem::Specification.new do |s|
27
27
  "README.md",
28
28
  "Rakefile",
29
29
  "VERSION",
30
+ "examples/hashes.rb",
30
31
  "examples/iterator.rb",
31
32
  "examples/list.rb",
32
33
  "examples/no_class.rb",
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wombat
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.1.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -189,6 +189,7 @@ files:
189
189
  - README.md
190
190
  - Rakefile
191
191
  - VERSION
192
+ - examples/hashes.rb
192
193
  - examples/iterator.rb
193
194
  - examples/list.rb
194
195
  - examples/no_class.rb