wombat 2.1.3 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.1.3
1
+ 2.2.0
@@ -23,6 +23,10 @@ module Wombat
23
23
  def document_format(format)
24
24
  self[:document_format] = format
25
25
  end
26
+
27
+ def page(page)
28
+ self[:page] = page
29
+ end
26
30
  end
27
31
  end
28
- end
32
+ end
@@ -34,12 +34,14 @@ module Wombat
34
34
  page = nil
35
35
  parser = nil
36
36
  begin
37
+ @page = metadata[:page]
38
+
37
39
  if metadata[:document_format] == :html
38
- @page = @mechanize.get(url)
40
+ @page = @mechanize.get(url) unless @page
39
41
  parser = @page.parser
40
42
  parser.headers = @page.header
41
43
  else
42
- @page = RestClient.get(url)
44
+ @page = RestClient.get(url) unless @page
43
45
  parser = Nokogiri::XML @page
44
46
  parser.headers = @page.headers
45
47
  end
@@ -56,4 +58,4 @@ module Wombat
56
58
  end
57
59
  end
58
60
  end
59
- end
61
+ end
@@ -32,6 +32,37 @@ describe 'basic crawler setup' do
32
32
  end
33
33
  end
34
34
 
35
+ it 'should crawl a Mechanize::Page' do
36
+ VCR.use_cassette('basic_crawler_page') do
37
+ crawler = Class.new
38
+ crawler.send(:include, Wombat::Crawler)
39
+
40
+ m = Mechanize.new
41
+ mp = m.get "http://www.terra.com.br/portal"
42
+ crawler.page mp
43
+
44
+ crawler.search "css=.btn-search"
45
+ crawler.social do
46
+ twitter "css=.ctn-bar li.last"
47
+ end
48
+ crawler.links "css=.ctn-links", :iterator do
49
+ menu "css=a"
50
+ end
51
+ crawler.subheader "css=h2.ttl-dynamic" do |h|
52
+ h.gsub("London", "Londres")
53
+ end
54
+
55
+ crawler_instance = crawler.new
56
+
57
+ results = crawler_instance.crawl
58
+
59
+ results["search"].should == "Buscar"
60
+ results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
61
+ results["subheader"].should == "Londres 2012"
62
+ results["social"]["twitter"].should == "Verão"
63
+ end
64
+ end
65
+
35
66
  it 'should support hash based selectors' do
36
67
  VCR.use_cassette('basic_crawler_page') do
37
68
  crawler = Class.new
@@ -242,4 +273,4 @@ describe 'basic crawler setup' do
242
273
  }
243
274
  end
244
275
  end
245
- end
276
+ end
@@ -18,7 +18,7 @@ describe Wombat::Processing::Parser do
18
18
  fake_document.should_receive(:header).and_return(fake_header)
19
19
  fake_parser.should_receive(:headers=)
20
20
  @parser.mechanize.should_receive(:get).with("http://www.google.com/search").and_return fake_document
21
-
21
+
22
22
  @parser.parse @metadata
23
23
  end
24
24
 
@@ -32,7 +32,20 @@ describe Wombat::Processing::Parser do
32
32
  Nokogiri.should_receive(:XML).with(fake_document).and_return fake_parser
33
33
  fake_document.should_receive(:headers).and_return(fake_headers)
34
34
  fake_parser.should_receive(:headers=)
35
-
35
+
36
36
  @parser.parse @metadata
37
37
  end
38
- end
38
+
39
+ it 'should accept a Mechanize::Page' do
40
+ VCR.use_cassette('basic_crawler_page') do
41
+ m = Mechanize.new
42
+ page = m.get('http://www.terra.com.br/portal')
43
+ @metadata.page page
44
+
45
+ @parser.mechanize.should_not_receive(:get)
46
+
47
+ @parser.parse @metadata
48
+ end
49
+ end
50
+
51
+ end
data/wombat.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "wombat"
8
- s.version = "2.1.3"
8
+ s.version = "2.2.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Felipe Lima"]
12
- s.date = "2013-04-22"
12
+ s.date = "2013-06-06"
13
13
  s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
14
14
  s.email = "felipe.lima@gmail.com"
15
15
  s.extra_rdoc_files = [
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wombat
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.3
4
+ version: 2.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-22 00:00:00.000000000 Z
12
+ date: 2013-06-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize