wombat 2.1.3 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/wombat/dsl/metadata.rb +5 -1
- data/lib/wombat/processing/parser.rb +5 -3
- data/spec/integration/integration_spec.rb +32 -1
- data/spec/processing/parser_spec.rb +16 -3
- data/wombat.gemspec +2 -2
- metadata +2 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.1.3
|
1
|
+
2.2.0
|
data/lib/wombat/dsl/metadata.rb
CHANGED
data/lib/wombat/processing/parser.rb
CHANGED
@@ -34,12 +34,14 @@ module Wombat
|
|
34
34
|
page = nil
|
35
35
|
parser = nil
|
36
36
|
begin
|
37
|
+
@page = metadata[:page]
|
38
|
+
|
37
39
|
if metadata[:document_format] == :html
|
38
|
-
@page = @mechanize.get(url)
|
40
|
+
@page = @mechanize.get(url) unless @page
|
39
41
|
parser = @page.parser
|
40
42
|
parser.headers = @page.header
|
41
43
|
else
|
42
|
-
@page = RestClient.get(url)
|
44
|
+
@page = RestClient.get(url) unless @page
|
43
45
|
parser = Nokogiri::XML @page
|
44
46
|
parser.headers = @page.headers
|
45
47
|
end
|
@@ -56,4 +58,4 @@ module Wombat
|
|
56
58
|
end
|
57
59
|
end
|
58
60
|
end
|
59
|
-
end
|
61
|
+
end
|
data/spec/integration/integration_spec.rb
CHANGED
@@ -32,6 +32,37 @@ describe 'basic crawler setup' do
|
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
35
|
+
it 'should crawl a Mechanize::Page' do
|
36
|
+
VCR.use_cassette('basic_crawler_page') do
|
37
|
+
crawler = Class.new
|
38
|
+
crawler.send(:include, Wombat::Crawler)
|
39
|
+
|
40
|
+
m = Mechanize.new
|
41
|
+
mp = m.get "http://www.terra.com.br/portal"
|
42
|
+
crawler.page mp
|
43
|
+
|
44
|
+
crawler.search "css=.btn-search"
|
45
|
+
crawler.social do
|
46
|
+
twitter "css=.ctn-bar li.last"
|
47
|
+
end
|
48
|
+
crawler.links "css=.ctn-links", :iterator do
|
49
|
+
menu "css=a"
|
50
|
+
end
|
51
|
+
crawler.subheader "css=h2.ttl-dynamic" do |h|
|
52
|
+
h.gsub("London", "Londres")
|
53
|
+
end
|
54
|
+
|
55
|
+
crawler_instance = crawler.new
|
56
|
+
|
57
|
+
results = crawler_instance.crawl
|
58
|
+
|
59
|
+
results["search"].should == "Buscar"
|
60
|
+
results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
61
|
+
results["subheader"].should == "Londres 2012"
|
62
|
+
results["social"]["twitter"].should == "Verão"
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
35
66
|
it 'should support hash based selectors' do
|
36
67
|
VCR.use_cassette('basic_crawler_page') do
|
37
68
|
crawler = Class.new
|
@@ -242,4 +273,4 @@ describe 'basic crawler setup' do
|
|
242
273
|
}
|
243
274
|
end
|
244
275
|
end
|
245
|
-
end
|
276
|
+
end
|
data/spec/processing/parser_spec.rb
CHANGED
@@ -18,7 +18,7 @@ describe Wombat::Processing::Parser do
|
|
18
18
|
fake_document.should_receive(:header).and_return(fake_header)
|
19
19
|
fake_parser.should_receive(:headers=)
|
20
20
|
@parser.mechanize.should_receive(:get).with("http://www.google.com/search").and_return fake_document
|
21
|
-
|
21
|
+
|
22
22
|
@parser.parse @metadata
|
23
23
|
end
|
24
24
|
|
@@ -32,7 +32,20 @@ describe Wombat::Processing::Parser do
|
|
32
32
|
Nokogiri.should_receive(:XML).with(fake_document).and_return fake_parser
|
33
33
|
fake_document.should_receive(:headers).and_return(fake_headers)
|
34
34
|
fake_parser.should_receive(:headers=)
|
35
|
-
|
35
|
+
|
36
36
|
@parser.parse @metadata
|
37
37
|
end
|
38
|
-
|
38
|
+
|
39
|
+
it 'should accept a Mechanize::Page' do
|
40
|
+
VCR.use_cassette('basic_crawler_page') do
|
41
|
+
m = Mechanize.new
|
42
|
+
page = m.get('http://www.terra.com.br/portal')
|
43
|
+
@metadata.page page
|
44
|
+
|
45
|
+
@parser.mechanize.should_not_receive(:get)
|
46
|
+
|
47
|
+
@parser.parse @metadata
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "2.1.3"
|
8
|
+
s.version = "2.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "2013-
|
12
|
+
s.date = "2013-06-06"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.3
|
4
|
+
version: 2.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-06-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|