wombat 2.1.3 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.1.3
1
+ 2.2.0
@@ -23,6 +23,10 @@ module Wombat
23
23
  def document_format(format)
24
24
  self[:document_format] = format
25
25
  end
26
+
27
+ def page(page)
28
+ self[:page] = page
29
+ end
26
30
  end
27
31
  end
28
- end
32
+ end
@@ -34,12 +34,14 @@ module Wombat
34
34
  page = nil
35
35
  parser = nil
36
36
  begin
37
+ @page = metadata[:page]
38
+
37
39
  if metadata[:document_format] == :html
38
- @page = @mechanize.get(url)
40
+ @page = @mechanize.get(url) unless @page
39
41
  parser = @page.parser
40
42
  parser.headers = @page.header
41
43
  else
42
- @page = RestClient.get(url)
44
+ @page = RestClient.get(url) unless @page
43
45
  parser = Nokogiri::XML @page
44
46
  parser.headers = @page.headers
45
47
  end
@@ -56,4 +58,4 @@ module Wombat
56
58
  end
57
59
  end
58
60
  end
59
- end
61
+ end
@@ -32,6 +32,37 @@ describe 'basic crawler setup' do
32
32
  end
33
33
  end
34
34
 
35
+ it 'should crawl a Mechanize::Page' do
36
+ VCR.use_cassette('basic_crawler_page') do
37
+ crawler = Class.new
38
+ crawler.send(:include, Wombat::Crawler)
39
+
40
+ m = Mechanize.new
41
+ mp = m.get "http://www.terra.com.br/portal"
42
+ crawler.page mp
43
+
44
+ crawler.search "css=.btn-search"
45
+ crawler.social do
46
+ twitter "css=.ctn-bar li.last"
47
+ end
48
+ crawler.links "css=.ctn-links", :iterator do
49
+ menu "css=a"
50
+ end
51
+ crawler.subheader "css=h2.ttl-dynamic" do |h|
52
+ h.gsub("London", "Londres")
53
+ end
54
+
55
+ crawler_instance = crawler.new
56
+
57
+ results = crawler_instance.crawl
58
+
59
+ results["search"].should == "Buscar"
60
+ results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
61
+ results["subheader"].should == "Londres 2012"
62
+ results["social"]["twitter"].should == "Verão"
63
+ end
64
+ end
65
+
35
66
  it 'should support hash based selectors' do
36
67
  VCR.use_cassette('basic_crawler_page') do
37
68
  crawler = Class.new
@@ -242,4 +273,4 @@ describe 'basic crawler setup' do
242
273
  }
243
274
  end
244
275
  end
245
- end
276
+ end
@@ -18,7 +18,7 @@ describe Wombat::Processing::Parser do
18
18
  fake_document.should_receive(:header).and_return(fake_header)
19
19
  fake_parser.should_receive(:headers=)
20
20
  @parser.mechanize.should_receive(:get).with("http://www.google.com/search").and_return fake_document
21
-
21
+
22
22
  @parser.parse @metadata
23
23
  end
24
24
 
@@ -32,7 +32,20 @@ describe Wombat::Processing::Parser do
32
32
  Nokogiri.should_receive(:XML).with(fake_document).and_return fake_parser
33
33
  fake_document.should_receive(:headers).and_return(fake_headers)
34
34
  fake_parser.should_receive(:headers=)
35
-
35
+
36
36
  @parser.parse @metadata
37
37
  end
38
- end
38
+
39
+ it 'should accept a Mechanize::Page' do
40
+ VCR.use_cassette('basic_crawler_page') do
41
+ m = Mechanize.new
42
+ page = m.get('http://www.terra.com.br/portal')
43
+ @metadata.page page
44
+
45
+ @parser.mechanize.should_not_receive(:get)
46
+
47
+ @parser.parse @metadata
48
+ end
49
+ end
50
+
51
+ end
data/wombat.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "wombat"
8
- s.version = "2.1.3"
8
+ s.version = "2.2.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Felipe Lima"]
12
- s.date = "2013-04-22"
12
+ s.date = "2013-06-06"
13
13
  s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
14
14
  s.email = "felipe.lima@gmail.com"
15
15
  s.extra_rdoc_files = [
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wombat
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.3
4
+ version: 2.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-22 00:00:00.000000000 Z
12
+ date: 2013-06-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize