wombat 2.1.3 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/wombat/dsl/metadata.rb +5 -1
- data/lib/wombat/processing/parser.rb +5 -3
- data/spec/integration/integration_spec.rb +32 -1
- data/spec/processing/parser_spec.rb +16 -3
- data/wombat.gemspec +2 -2
- metadata +2 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.1.3
|
1
|
+
2.2.0
|
data/lib/wombat/dsl/metadata.rb
CHANGED
@@ -34,12 +34,14 @@ module Wombat
|
|
34
34
|
page = nil
|
35
35
|
parser = nil
|
36
36
|
begin
|
37
|
+
@page = metadata[:page]
|
38
|
+
|
37
39
|
if metadata[:document_format] == :html
|
38
|
-
@page = @mechanize.get(url)
|
40
|
+
@page = @mechanize.get(url) unless @page
|
39
41
|
parser = @page.parser
|
40
42
|
parser.headers = @page.header
|
41
43
|
else
|
42
|
-
@page = RestClient.get(url)
|
44
|
+
@page = RestClient.get(url) unless @page
|
43
45
|
parser = Nokogiri::XML @page
|
44
46
|
parser.headers = @page.headers
|
45
47
|
end
|
@@ -56,4 +58,4 @@ module Wombat
|
|
56
58
|
end
|
57
59
|
end
|
58
60
|
end
|
59
|
-
end
|
61
|
+
end
|
data/spec/integration/integration_spec.rb
CHANGED
@@ -32,6 +32,37 @@ describe 'basic crawler setup' do
|
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
35
|
+
it 'should crawl a Mechanize::Page' do
|
36
|
+
VCR.use_cassette('basic_crawler_page') do
|
37
|
+
crawler = Class.new
|
38
|
+
crawler.send(:include, Wombat::Crawler)
|
39
|
+
|
40
|
+
m = Mechanize.new
|
41
|
+
mp = m.get "http://www.terra.com.br/portal"
|
42
|
+
crawler.page mp
|
43
|
+
|
44
|
+
crawler.search "css=.btn-search"
|
45
|
+
crawler.social do
|
46
|
+
twitter "css=.ctn-bar li.last"
|
47
|
+
end
|
48
|
+
crawler.links "css=.ctn-links", :iterator do
|
49
|
+
menu "css=a"
|
50
|
+
end
|
51
|
+
crawler.subheader "css=h2.ttl-dynamic" do |h|
|
52
|
+
h.gsub("London", "Londres")
|
53
|
+
end
|
54
|
+
|
55
|
+
crawler_instance = crawler.new
|
56
|
+
|
57
|
+
results = crawler_instance.crawl
|
58
|
+
|
59
|
+
results["search"].should == "Buscar"
|
60
|
+
results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
61
|
+
results["subheader"].should == "Londres 2012"
|
62
|
+
results["social"]["twitter"].should == "Verão"
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
35
66
|
it 'should support hash based selectors' do
|
36
67
|
VCR.use_cassette('basic_crawler_page') do
|
37
68
|
crawler = Class.new
|
@@ -242,4 +273,4 @@ describe 'basic crawler setup' do
|
|
242
273
|
}
|
243
274
|
end
|
244
275
|
end
|
245
|
-
end
|
276
|
+
end
|
data/spec/processing/parser_spec.rb
CHANGED
@@ -18,7 +18,7 @@ describe Wombat::Processing::Parser do
|
|
18
18
|
fake_document.should_receive(:header).and_return(fake_header)
|
19
19
|
fake_parser.should_receive(:headers=)
|
20
20
|
@parser.mechanize.should_receive(:get).with("http://www.google.com/search").and_return fake_document
|
21
|
-
|
21
|
+
|
22
22
|
@parser.parse @metadata
|
23
23
|
end
|
24
24
|
|
@@ -32,7 +32,20 @@ describe Wombat::Processing::Parser do
|
|
32
32
|
Nokogiri.should_receive(:XML).with(fake_document).and_return fake_parser
|
33
33
|
fake_document.should_receive(:headers).and_return(fake_headers)
|
34
34
|
fake_parser.should_receive(:headers=)
|
35
|
-
|
35
|
+
|
36
36
|
@parser.parse @metadata
|
37
37
|
end
|
38
|
-
|
38
|
+
|
39
|
+
it 'should accept a Mechanize::Page' do
|
40
|
+
VCR.use_cassette('basic_crawler_page') do
|
41
|
+
m = Mechanize.new
|
42
|
+
page = m.get('http://www.terra.com.br/portal')
|
43
|
+
@metadata.page page
|
44
|
+
|
45
|
+
@parser.mechanize.should_not_receive(:get)
|
46
|
+
|
47
|
+
@parser.parse @metadata
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "2.1.3"
|
8
|
+
s.version = "2.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "2013-
|
12
|
+
s.date = "2013-06-06"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.3
|
4
|
+
version: 2.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-06-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|