wombat 2.1.1 → 2.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,7 +1,8 @@
1
1
  # Wombat
2
2
 
3
- [![CI Build Status](https://secure.travis-ci.org/felipecsl/wombat.png?branch=master)][travis] [![Dependency Status](https://gemnasium.com/felipecsl/wombat.png?travis)][gemnasium] [![Code Climate](https://codeclimate.com/badge.png)][codeclimate]
3
+ [![Gem Version](https://badge.fury.io/rb/wombat.png)][rubygems] [![CI Build Status](https://secure.travis-ci.org/felipecsl/wombat.png?branch=master)][travis] [![Dependency Status](https://gemnasium.com/felipecsl/wombat.png?travis)][gemnasium] [![Code Climate](https://codeclimate.com/github/felipecsl/wombat.png)][codeclimate]
4
4
 
5
+ [rubygems]: http://rubygems.org/gems/wombat
5
6
  [travis]: http://travis-ci.org/felipecsl/wombat
6
7
  [gemnasium]: https://gemnasium.com/felipecsl/wombat
7
8
  [codeclimate]: https://codeclimate.com/github/felipecsl/wombat
@@ -25,19 +26,19 @@ Wombat.crawl do
25
26
  base_url "http://www.github.com"
26
27
  path "/"
27
28
 
28
- headline "xpath=//h1"
29
- subheading "css=p.subheading"
29
+ headline xpath: "//h1"
30
+ subheading css: "p.subheading"
30
31
 
31
- what_is "css=.teaser h3", :list
32
+ what_is({ css: ".teaser h3" }, :list)
32
33
 
33
34
  links do
34
- explore 'xpath=//*[@id="wrapper"]/div[1]/div/ul/li[1]/a' do |e|
35
+ explore xpath: '//*[@id="wrapper"]/div[1]/div/ul/li[1]/a' do |e|
35
36
  e.gsub(/Explore/, "Love")
36
37
  end
37
38
 
38
- search 'css=.search'
39
- features 'css=.features'
40
- blog 'css=.blog'
39
+ search css: '.search'
40
+ features css: '.features'
41
+ blog css: '.blog'
41
42
  end
42
43
  end
43
44
  ```
@@ -62,10 +63,11 @@ end
62
63
  }
63
64
  ```
64
65
 
65
- ### This is just a sneak peek of what Wombat can do. For the complete documentation, please check the [project Wiki](http://github.com/felipecsl/wombat/wiki).
66
- ### [API Documentation](http://rubydoc.info/gems/wombat/2.0.0/frames)
67
- ### [Changelog](https://github.com/felipecsl/wombat/wiki/Changelog)
66
+ ### This is just a sneak peek of what Wombat can do. For the complete documentation, please check the links below:
68
67
 
68
+ ### [Wiki](http://github.com/felipecsl/wombat/wiki)
69
+ ### [API Documentation](http://rubydoc.info/gems/wombat/2.1.1/frames)
70
+ ### [Changelog](https://github.com/felipecsl/wombat/wiki/Changelog)
69
71
 
70
72
  ## Contributing to Wombat
71
73
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.1.1
1
+ 2.1.2
@@ -0,0 +1,43 @@
1
+ #coding: utf-8
2
+ require 'wombat'
3
+
4
+ data = Wombat.crawl do
5
+ base_url "http://www.github.com"
6
+ path "/"
7
+
8
+ headline xpath: "//h1"
9
+ subheading css: "p.subheading"
10
+
11
+ what_is({ css: ".teaser h3" }, :list)
12
+
13
+ links do
14
+ explore xpath: '//*[@id="wrapper"]/div[1]/div/ul/li[1]/a' do |e|
15
+ e.gsub(/Explore/, "Love")
16
+ end
17
+
18
+ search css: '.search'
19
+ features css: '.features'
20
+ blog css: '.blog'
21
+ end
22
+ end
23
+
24
+ pp data
25
+
26
+ =begin
27
+ pp data
28
+ {
29
+ "headline"=>"Build software better, together.",
30
+ "subheading"=>
31
+ "Powerful collaboration, review, and code management for open source and private development projects.",
32
+ "what_is"=>
33
+ ["Great collaboration starts with communication.",
34
+ "Manage and contribute from all your devices.",
35
+ "The world’s largest open source community."],
36
+ "links"=>
37
+ {"explore"=>"Love GitHub",
38
+ "search"=>"Search",
39
+ "features"=>"Features",
40
+ "blog"=>"Blog"
41
+ }
42
+ }
43
+ =end
data/lib/wombat.rb CHANGED
@@ -4,12 +4,23 @@ require 'wombat/crawler'
4
4
 
5
5
  module Wombat
6
6
  class << self
7
+
8
+ attr_reader :proxy_args
9
+
7
10
  def crawl(&block)
8
11
  klass = Class.new
9
12
  klass.send(:include, Wombat::Crawler)
10
13
  klass.new.crawl(&block)
11
14
  end
12
-
15
+
16
+ def configure
17
+ yield self
18
+ end
19
+
20
+ def set_proxy(*args)
21
+ @proxy_args = args
22
+ end
23
+
13
24
  alias_method :scrape, :crawl
14
25
  end
15
26
  end
@@ -5,12 +5,12 @@ module Wombat
5
5
 
6
6
  def initialize(name, selector)
7
7
  @wombat_property_selector = selector
8
-
8
+
9
9
  super(name)
10
10
  end
11
11
 
12
12
  # So that Property::Locators::Iterator can identify this class
13
- # as an iterator property.
13
+ # as a follow property.
14
14
  def wombat_property_format
15
15
  :follow
16
16
  end
@@ -1,12 +1,23 @@
1
1
  module Wombat
2
- module Processing
3
- module NodeSelector
4
- def select_nodes(selector, namespaces = nil)
5
- return [selector.to_s] if selector.is_a? Symbol
6
- return @context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
7
- return @context.css selector[4..-1] if selector.start_with? "css="
8
- [selector]
9
- end
10
- end
2
+ module Processing
3
+ module NodeSelector
4
+ def select_nodes(selector, namespaces = nil)
5
+ return [selector.to_s] if selector.is_a? Symbol
6
+
7
+ if selector.is_a? String
8
+ return @context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
9
+ return @context.css selector[4..-1] if selector.start_with? "css="
10
+ end
11
+
12
+ if selector.is_a? Hash
13
+ method = selector.flatten.first
14
+ selector = selector.flatten.last
15
+ return @context.xpath selector, namespaces if method == :xpath
16
+ return @context.css selector if method == :css
17
+ end
18
+
19
+ [selector]
20
+ end
21
+ end
11
22
  end
12
23
  end
@@ -19,10 +19,7 @@ module Wombat
19
19
 
20
20
  def initialize
21
21
  @mechanize = Mechanize.new
22
- end
23
-
24
- def set_proxy(host, port)
25
- @mechanize.set_proxy host, port
22
+ @mechanize.set_proxy(*Wombat.proxy_args) if Wombat.proxy_args
26
23
  end
27
24
 
28
25
  def parse(metadata)
@@ -6,7 +6,7 @@ describe 'basic crawler setup' do
6
6
  VCR.use_cassette('basic_crawler_page') do
7
7
  crawler = Class.new
8
8
  crawler.send(:include, Wombat::Crawler)
9
-
9
+
10
10
  crawler.base_url "http://www.terra.com.br"
11
11
  crawler.path '/portal'
12
12
 
@@ -32,6 +32,36 @@ describe 'basic crawler setup' do
32
32
  end
33
33
  end
34
34
 
35
+ it 'should support hash based selectors' do
36
+ VCR.use_cassette('basic_crawler_page') do
37
+ crawler = Class.new
38
+ crawler.send(:include, Wombat::Crawler)
39
+
40
+ crawler.base_url "http://www.terra.com.br"
41
+ crawler.path '/portal'
42
+
43
+ crawler.search css: ".btn-search"
44
+ crawler.social do
45
+ twitter css: ".ctn-bar li.last"
46
+ end
47
+ crawler.links({css: ".ctn-links"}, :iterator) do
48
+ menu css: "a"
49
+ end
50
+ crawler.subheader css: "h2.ttl-dynamic" do |h|
51
+ h.gsub("London", "Londres")
52
+ end
53
+
54
+ crawler_instance = crawler.new
55
+
56
+ results = crawler_instance.crawl
57
+
58
+ results["search"].should == "Buscar"
59
+ results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
60
+ results["subheader"].should == "Londres 2012"
61
+ results["social"]["twitter"].should == "Verão"
62
+ end
63
+ end
64
+
35
65
  it 'should clear iterators between multiple runs' do
36
66
  crawler = Class.new
37
67
  crawler.send(:include, Wombat::Crawler)
@@ -56,7 +86,7 @@ describe 'basic crawler setup' do
56
86
  VCR.use_cassette('basic_crawler_page') do
57
87
  results = crawler_instance.crawl
58
88
  end
59
-
89
+
60
90
  results["links"].should == result_hash
61
91
  end
62
92
 
@@ -123,7 +153,7 @@ describe 'basic crawler setup' do
123
153
  VCR.use_cassette('for_each_page') do
124
154
  crawler = Class.new
125
155
  crawler.send(:include, Wombat::Crawler)
126
-
156
+
127
157
  crawler.base_url "https://www.github.com"
128
158
  crawler.path "/explore"
129
159
 
@@ -151,13 +181,13 @@ describe 'basic crawler setup' do
151
181
  VCR.use_cassette('xml_with_namespace') do
152
182
  crawler = Class.new
153
183
  crawler.send(:include, Wombat::Crawler)
154
-
184
+
155
185
  crawler.document_format :xml
156
186
  crawler.base_url "http://ws.audioscrobbler.com"
157
187
  crawler.path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"
158
188
 
159
189
  crawler.artist "xpath=//title", :list
160
-
190
+
161
191
  crawler.location 'xpath=//event', :iterator do
162
192
  latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
163
193
  longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
@@ -168,18 +198,18 @@ describe 'basic crawler setup' do
168
198
  iterator = results['location']
169
199
 
170
200
  iterator.should == [
171
- {"latitude"=>"37.807775", "longitude"=>"-122.272736"},
172
- {"latitude"=>"37.807717", "longitude"=>"-122.270059"},
173
- {"latitude"=>"37.869784", "longitude"=>"-122.267701"},
174
- {"latitude"=>"37.870873", "longitude"=>"-122.269313"},
175
- {"latitude"=>"37.782348", "longitude"=>"-122.408059"},
176
- {"latitude"=>"37.775529", "longitude"=>"-122.437757"},
177
- {"latitude"=>"37.771079", "longitude"=>"-122.412604"},
178
- {"latitude"=>"37.771079", "longitude"=>"-122.412604"},
179
- {"latitude"=>"37.784963", "longitude"=>"-122.418871"},
201
+ {"latitude"=>"37.807775", "longitude"=>"-122.272736"},
202
+ {"latitude"=>"37.807717", "longitude"=>"-122.270059"},
203
+ {"latitude"=>"37.869784", "longitude"=>"-122.267701"},
204
+ {"latitude"=>"37.870873", "longitude"=>"-122.269313"},
205
+ {"latitude"=>"37.782348", "longitude"=>"-122.408059"},
206
+ {"latitude"=>"37.775529", "longitude"=>"-122.437757"},
207
+ {"latitude"=>"37.771079", "longitude"=>"-122.412604"},
208
+ {"latitude"=>"37.771079", "longitude"=>"-122.412604"},
209
+ {"latitude"=>"37.784963", "longitude"=>"-122.418871"},
180
210
  {"latitude"=>"37.788978", "longitude"=>"-122.40664"}
181
211
  ]
182
-
212
+
183
213
  results["artist"].should =~ ["Davka", "Digitalism (DJ Set)", "Gary Clark Jr.", "Lenny Kravitz", "Little Muddy", "Michael Schenker Group", "The Asteroids Galaxy Tour", "When Indie Attacks", "When Indie Attacks", "YOB"]
184
214
  end
185
215
  end
@@ -188,7 +218,7 @@ describe 'basic crawler setup' do
188
218
  VCR.use_cassette('follow_links') do
189
219
  crawler = Class.new
190
220
  crawler.send(:include, Wombat::Crawler)
191
-
221
+
192
222
  crawler.base_url "https://www.github.com"
193
223
  crawler.path "/"
194
224
 
@@ -199,7 +229,7 @@ describe 'basic crawler setup' do
199
229
  crawler_instance = crawler.new
200
230
  results = crawler_instance.crawl
201
231
 
202
- results.should == {
232
+ results.should == {
203
233
  "github" => [
204
234
  { "heading"=>"GitHub helps people build software together." },
205
235
  { "heading"=>nil },
data/spec/wombat_spec.rb CHANGED
@@ -17,9 +17,16 @@ describe Wombat do
17
17
  Wombat.scrape
18
18
  end
19
19
 
20
+ it 'should provide configuration method with block' do
21
+ Wombat.configure do |config|
22
+ config.set_proxy "10.0.0.1", 8080
23
+ end
24
+ Wombat.proxy_args.should == ["10.0.0.1", 8080]
25
+ end
26
+
20
27
  it 'should accept regular properties (non-selectors)' do
21
28
  VCR.use_cassette('broken_selector') do
22
- lambda {
29
+ lambda {
23
30
  Wombat.crawl do
24
31
  base_url "http://www.github.com"
25
32
  path "/"
data/wombat.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "wombat"
8
- s.version = "2.1.1"
8
+ s.version = "2.1.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Felipe Lima"]
@@ -27,6 +27,7 @@ Gem::Specification.new do |s|
27
27
  "README.md",
28
28
  "Rakefile",
29
29
  "VERSION",
30
+ "examples/hashes.rb",
30
31
  "examples/iterator.rb",
31
32
  "examples/list.rb",
32
33
  "examples/no_class.rb",
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wombat
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.1.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -189,6 +189,7 @@ files:
189
189
  - README.md
190
190
  - Rakefile
191
191
  - VERSION
192
+ - examples/hashes.rb
192
193
  - examples/iterator.rb
193
194
  - examples/list.rb
194
195
  - examples/no_class.rb