wombat 2.9.0 → 2.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +0 -2
- data/CHANGELOG.md +5 -0
- data/Gemfile.lock +14 -10
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/xml_with_namespace.yml +9 -17
- data/lib/wombat/processing/parser.rb +10 -2
- data/spec/dsl/property_spec.rb +5 -5
- data/spec/integration/follow_relative_links_spec.rb +1 -1
- data/spec/integration/integration_spec.rb +28 -30
- data/spec/processing/parser_spec.rb +3 -3
- data/spec/property/locators/html_spec.rb +2 -7
- data/spec/property/locators/list_spec.rb +1 -1
- data/spec/property/locators/text_spec.rb +5 -6
- data/spec/sample_crawler_spec.rb +3 -3
- data/spec/wombat_spec.rb +3 -3
- data/wombat.gemspec +3 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7114b5a602e08f1253f7af6fbdebf6c598cd4a714a0fc28d11725b2d393ca232
|
4
|
+
data.tar.gz: ed31a4692959b35534d66accd7538db66f80c320bb0cff1c1b0a1b72c63860dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: caa0ef5373883bb65ced56709a48fdf95dc0bd64053ca2f8402ec8c603cbd05bae46401b8360616b996d5e80d6473d1e407abad0004c4695c102bd41f22fbf86
|
7
|
+
data.tar.gz: 9ec8827f2989c035b6f9ca6dcd5ecb0c5905702cf140887c012ee9b39ba6211764fdb7d1118ba12ce583a0654ecd0c30b1758ea6fb5bb4e45c5b93276d78f950
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
-
activesupport (
|
4
|
+
activesupport (6.0.0)
|
5
5
|
concurrent-ruby (~> 1.0, >= 1.0.2)
|
6
6
|
i18n (>= 0.7, < 2)
|
7
7
|
minitest (~> 5.1)
|
8
8
|
tzinfo (~> 1.1)
|
9
|
+
zeitwerk (~> 2.1, >= 2.1.8)
|
9
10
|
addressable (2.4.0)
|
10
11
|
builder (3.2.3)
|
11
12
|
concurrent-ruby (1.1.5)
|
@@ -34,9 +35,10 @@ GEM
|
|
34
35
|
hashie (>= 3.4)
|
35
36
|
mime-types (>= 1.16, < 3.0)
|
36
37
|
oauth2 (~> 1.0)
|
37
|
-
hashdiff (0.
|
38
|
+
hashdiff (1.0.0)
|
38
39
|
hashie (3.6.0)
|
39
40
|
highline (2.0.2)
|
41
|
+
http-accept (1.7.0)
|
40
42
|
http-cookie (1.0.3)
|
41
43
|
domain_name (~> 0.5)
|
42
44
|
i18n (1.6.0)
|
@@ -70,10 +72,10 @@ GEM
|
|
70
72
|
multi_xml (0.6.0)
|
71
73
|
multipart-post (2.1.1)
|
72
74
|
net-http-digest_auth (1.4.1)
|
73
|
-
net-http-persistent (3.0
|
75
|
+
net-http-persistent (3.1.0)
|
74
76
|
connection_pool (~> 2.2)
|
75
77
|
netrc (0.11.0)
|
76
|
-
nokogiri (1.10.
|
78
|
+
nokogiri (1.10.4)
|
77
79
|
mini_portile2 (~> 2.4.0)
|
78
80
|
ntlm-http (0.1.1)
|
79
81
|
oauth2 (1.4.1)
|
@@ -84,9 +86,10 @@ GEM
|
|
84
86
|
rack (>= 1.2, < 3)
|
85
87
|
psych (3.1.0)
|
86
88
|
rack (2.0.7)
|
87
|
-
rake (12.3.
|
88
|
-
rdoc (6.
|
89
|
-
rest-client (2.0
|
89
|
+
rake (12.3.3)
|
90
|
+
rdoc (6.2.0)
|
91
|
+
rest-client (2.1.0)
|
92
|
+
http-accept (>= 1.7.0, < 2.0)
|
90
93
|
http-cookie (>= 1.0.2, < 2.0)
|
91
94
|
mime-types (>= 1.16, < 4.0)
|
92
95
|
netrc (~> 0.8)
|
@@ -114,19 +117,20 @@ GEM
|
|
114
117
|
tins (~> 1.0)
|
115
118
|
thor (0.20.3)
|
116
119
|
thread_safe (0.3.6)
|
117
|
-
tins (1.21.
|
120
|
+
tins (1.21.1)
|
118
121
|
tzinfo (1.2.5)
|
119
122
|
thread_safe (~> 0.1)
|
120
123
|
unf (0.1.4)
|
121
124
|
unf_ext
|
122
125
|
unf_ext (0.0.7.6)
|
123
126
|
vcr (5.0.0)
|
124
|
-
webmock (3.
|
127
|
+
webmock (3.7.1)
|
125
128
|
addressable (>= 2.3.6)
|
126
129
|
crack (>= 0.3.2)
|
127
130
|
hashdiff (>= 0.4.0, < 2.0.0)
|
128
131
|
webrobots (0.1.2)
|
129
132
|
yard (0.9.20)
|
133
|
+
zeitwerk (2.1.9)
|
130
134
|
|
131
135
|
PLATFORMS
|
132
136
|
ruby
|
@@ -145,4 +149,4 @@ DEPENDENCIES
|
|
145
149
|
yard
|
146
150
|
|
147
151
|
BUNDLED WITH
|
148
|
-
|
152
|
+
2.0.2
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.10.0
|
@@ -3,24 +3,16 @@ http_interactions:
|
|
3
3
|
- request:
|
4
4
|
method: get
|
5
5
|
uri: http://ws.audioscrobbler.com/2.0/?api_key=060decb474b73437d5bbec37f527ae7b&location=San%20Francisco&method=geo.getevents
|
6
|
-
body:
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
7
9
|
headers:
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
accept-charset:
|
15
|
-
- ISO-8859-1,utf-8;q=0.7,*;q=0.7
|
16
|
-
accept-language:
|
17
|
-
- en-us,en;q=0.5
|
18
|
-
host:
|
19
|
-
- ws.audioscrobbler.com
|
20
|
-
connection:
|
21
|
-
- keep-alive
|
22
|
-
keep-alive:
|
23
|
-
- 300
|
10
|
+
Accept:
|
11
|
+
- "*/*"
|
12
|
+
User-Agent:
|
13
|
+
- rest-client/2.1.0 (darwin17.3.0 x86_64) ruby/2.5.0p0
|
14
|
+
Host:
|
15
|
+
- ws.audioscrobbler.com
|
24
16
|
response:
|
25
17
|
status:
|
26
18
|
code: 200
|
@@ -49,7 +49,6 @@ module Wombat
|
|
49
49
|
private
|
50
50
|
def parser_for(metadata, url)
|
51
51
|
url ||= "#{metadata[:base_url]}#{metadata[:path]}"
|
52
|
-
page = nil
|
53
52
|
parser = nil
|
54
53
|
_method = method_from(metadata[:http_method])
|
55
54
|
data = metadata[:data]
|
@@ -64,7 +63,7 @@ module Wombat
|
|
64
63
|
parser.headers = @page.header
|
65
64
|
else
|
66
65
|
@page = RestClient.public_send(_method, *args) unless @page
|
67
|
-
parser = Nokogiri::XML
|
66
|
+
parser = Nokogiri::XML(decode_body)
|
68
67
|
parser.headers = @page.headers
|
69
68
|
end
|
70
69
|
@response_code = @page.code.to_i if @page.respond_to? :code
|
@@ -79,6 +78,15 @@ module Wombat
|
|
79
78
|
end
|
80
79
|
end
|
81
80
|
|
81
|
+
def decode_body
|
82
|
+
# Check if it's gzip encoded
|
83
|
+
if @page.body.start_with?("\x1F\x8B".b)
|
84
|
+
Zlib::GzipReader.new(StringIO.new(@page.body)).read
|
85
|
+
else
|
86
|
+
@page.body
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
82
90
|
def method_from(_method)
|
83
91
|
return :get if _method.nil?
|
84
92
|
HTTP_METHODS.detect(->{:get}){ |i| i == _method.downcase.to_sym }
|
data/spec/dsl/property_spec.rb
CHANGED
@@ -4,10 +4,10 @@ describe Wombat::DSL::Property do
|
|
4
4
|
it 'should store property data' do
|
5
5
|
callback = lambda { false }
|
6
6
|
property = Wombat::DSL::Property.new("title", *["/some/selector", :html], &callback)
|
7
|
-
|
8
|
-
property.wombat_property_name.should
|
9
|
-
property.selector.should
|
10
|
-
property.format.should
|
11
|
-
property.callback.should
|
7
|
+
|
8
|
+
property.wombat_property_name.should eq "title"
|
9
|
+
property.selector.should eq "/some/selector"
|
10
|
+
property.format.should eq :html
|
11
|
+
property.callback.should eq callback
|
12
12
|
end
|
13
13
|
end
|
@@ -22,7 +22,7 @@ describe 'following pages referred by relative links' do
|
|
22
22
|
results = crawler_instance.crawl
|
23
23
|
|
24
24
|
# There are many entries. It's enough to check first three ones
|
25
|
-
results["vocabulary"][0..2].should
|
25
|
+
results["vocabulary"][0..2].should eq [
|
26
26
|
{"entry"=>{"word"=>"Dmoz", "description"=>"Dmoz - второй по популярности каталог сайтов после Яндекс-Каталога. Адрес каталога Dmoz - .\r\n\r\nЗаметка: Как вы думаете, мебель из Китая дорого стоит? Правильно, она недорогая. поставляет не только мебель, но и китайскую сантехнику, люстры, светильники и многое другое. Если вы хотите здорово съэкономить, то не пропустите такую возможность."}},
|
27
27
|
{"entry"=>{"word"=>"PR", "description"=>"PR - PageRank - показатель Google для конкретной страницы сайта. Зависит от количества ссылок на страницу и от качества этих ссылок. Учитываются и ссылки с внутренних страниц сайта. PR влияет на выдачу в поисковой системе Google. Повысить PR сайту можно внутренней перелинковкой. PR бывает тулбарный и внутренний. Апдейт PR происходит, как правило, несколько раз в год. Сейчас у этого блога PR=2, а у сайта PR равен 3."}},
|
28
28
|
{"entry"=>{"word"=>"Sape (сапа)", "description"=>"Sape (сапа) - это самая популярная в России биржа ссылок. Адрес: www.sape.ru. Веб-мастер может продать ссылки со своего сайта, а оптимизатор купить ссылки. Продажа ссылок осуществляется с ежемесячной оплатой. Цена на ссылки устанавливается веб-мастером для своего сайта. Для продажи ссылок на сайте размещается специальный код системы и в дальнейшем вся продажа происходит автоматически через веб-интерфейс Sape.\r\n\r\nЗаметка: Интересует монтаж и эксплуатация противопожарных металлических ДПМ или ? Читайте технологическую документацию и нормативные документы."}}
|
@@ -25,10 +25,10 @@ describe 'basic crawler setup' do
|
|
25
25
|
|
26
26
|
results = crawler_instance.crawl
|
27
27
|
|
28
|
-
results["search"].should
|
29
|
-
results["links"].should
|
30
|
-
results["subheader"].should
|
31
|
-
results["social"]["twitter"].should
|
28
|
+
results["search"].should eq "Buscar"
|
29
|
+
results["links"].should eq [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
30
|
+
results["subheader"].should eq "Londres 2012"
|
31
|
+
results["social"]["twitter"].should eq "Verão"
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
@@ -56,10 +56,10 @@ describe 'basic crawler setup' do
|
|
56
56
|
|
57
57
|
results = crawler_instance.crawl
|
58
58
|
|
59
|
-
results["search"].should
|
60
|
-
results["links"].should
|
61
|
-
results["subheader"].should
|
62
|
-
results["social"]["twitter"].should
|
59
|
+
results["search"].should eq "Buscar"
|
60
|
+
results["links"].should eq [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
61
|
+
results["subheader"].should eq "Londres 2012"
|
62
|
+
results["social"]["twitter"].should eq "Verão"
|
63
63
|
end
|
64
64
|
end
|
65
65
|
|
@@ -86,10 +86,10 @@ describe 'basic crawler setup' do
|
|
86
86
|
|
87
87
|
results = crawler_instance.crawl
|
88
88
|
|
89
|
-
results["search"].should
|
90
|
-
results["links"].should
|
91
|
-
results["subheader"].should
|
92
|
-
results["social"]["twitter"].should
|
89
|
+
results["search"].should eq "Buscar"
|
90
|
+
results["links"].should eq [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
91
|
+
results["subheader"].should eq "Londres 2012"
|
92
|
+
results["social"]["twitter"].should eq "Verão"
|
93
93
|
end
|
94
94
|
end
|
95
95
|
|
@@ -112,13 +112,13 @@ describe 'basic crawler setup' do
|
|
112
112
|
results = crawler_instance.crawl
|
113
113
|
end
|
114
114
|
|
115
|
-
results["links"].should
|
115
|
+
results["links"].should eq result_hash
|
116
116
|
|
117
117
|
VCR.use_cassette('basic_crawler_page') do
|
118
118
|
results = crawler_instance.crawl
|
119
119
|
end
|
120
120
|
|
121
|
-
results["links"].should
|
121
|
+
results["links"].should eq result_hash
|
122
122
|
end
|
123
123
|
|
124
124
|
it 'should crawl page through block to class instance crawl method' do
|
@@ -145,10 +145,10 @@ describe 'basic crawler setup' do
|
|
145
145
|
end
|
146
146
|
end
|
147
147
|
|
148
|
-
results["search"].should
|
149
|
-
results["links"].should
|
150
|
-
results["subheader"].should
|
151
|
-
results["social"]["twitter"].should
|
148
|
+
results["search"].should eq "Buscar"
|
149
|
+
results["links"].should eq [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
150
|
+
results["subheader"].should eq "Londres 2012"
|
151
|
+
results["social"]["twitter"].should eq "Verão"
|
152
152
|
end
|
153
153
|
end
|
154
154
|
|
@@ -173,10 +173,10 @@ describe 'basic crawler setup' do
|
|
173
173
|
end
|
174
174
|
end
|
175
175
|
|
176
|
-
results["search"].should
|
177
|
-
results["links"].should
|
178
|
-
results["subheader"].should
|
179
|
-
results["social"]["twitter"].should
|
176
|
+
results["search"].should eq "Buscar"
|
177
|
+
results["links"].should eq [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
178
|
+
results["subheader"].should eq "Londres 2012"
|
179
|
+
results["social"]["twitter"].should eq "Verão"
|
180
180
|
end
|
181
181
|
end
|
182
182
|
|
@@ -197,14 +197,14 @@ describe 'basic crawler setup' do
|
|
197
197
|
|
198
198
|
results = crawler.new.crawl
|
199
199
|
|
200
|
-
results.should
|
200
|
+
results.should eq({ "repos" => [
|
201
201
|
{ "project" => { "repo" => "jairajs89 / Touchy.js", "description" => "A simple light-weight JavaScript library dealing with touch events" } },
|
202
202
|
{ "project" => { "repo" => "mcavage / node-restify", "description" => "node.js REST framework specifically meant web service APIs" } },
|
203
203
|
{ "project" => { "repo" => "notlion / streetview-stereographic", "description" => "Shader Toy + Google Map + Panoramic Explorer" } },
|
204
204
|
{ "project" => { "repo" => "twitter / bootstrap", "description" => "HTML, CSS, and JS toolkit from Twitter" } },
|
205
205
|
{ "project" => { "repo" => "stolksdorf / Parallaxjs", "description" => "a Library Javascript that allows easy page parallaxing" } },
|
206
206
|
{ "project" => { "repo" => nil, "description" => nil}}
|
207
|
-
]}
|
207
|
+
]})
|
208
208
|
end
|
209
209
|
end
|
210
210
|
|
@@ -216,9 +216,7 @@ describe 'basic crawler setup' do
|
|
216
216
|
crawler.document_format :xml
|
217
217
|
crawler.base_url "http://ws.audioscrobbler.com"
|
218
218
|
crawler.path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"
|
219
|
-
|
220
219
|
crawler.artist "xpath=//title", :list
|
221
|
-
|
222
220
|
crawler.location 'xpath=//event', :iterator do
|
223
221
|
latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
224
222
|
longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
@@ -228,7 +226,7 @@ describe 'basic crawler setup' do
|
|
228
226
|
results = crawler_instance.crawl
|
229
227
|
iterator = results['location']
|
230
228
|
|
231
|
-
iterator.should
|
229
|
+
iterator.should eq([
|
232
230
|
{"latitude"=>"37.807775", "longitude"=>"-122.272736"},
|
233
231
|
{"latitude"=>"37.807717", "longitude"=>"-122.270059"},
|
234
232
|
{"latitude"=>"37.869784", "longitude"=>"-122.267701"},
|
@@ -239,7 +237,7 @@ describe 'basic crawler setup' do
|
|
239
237
|
{"latitude"=>"37.771079", "longitude"=>"-122.412604"},
|
240
238
|
{"latitude"=>"37.784963", "longitude"=>"-122.418871"},
|
241
239
|
{"latitude"=>"37.788978", "longitude"=>"-122.40664"}
|
242
|
-
]
|
240
|
+
])
|
243
241
|
|
244
242
|
results["artist"].should =~ ["Davka", "Digitalism (DJ Set)", "Gary Clark Jr.", "Lenny Kravitz", "Little Muddy", "Michael Schenker Group", "The Asteroids Galaxy Tour", "When Indie Attacks", "When Indie Attacks", "YOB"]
|
245
243
|
end
|
@@ -260,7 +258,7 @@ describe 'basic crawler setup' do
|
|
260
258
|
crawler_instance = crawler.new
|
261
259
|
results = crawler_instance.crawl
|
262
260
|
|
263
|
-
results.should
|
261
|
+
results.should eq({
|
264
262
|
"github" => [
|
265
263
|
{ "heading"=>"GitHub helps people build software together." },
|
266
264
|
{ "heading"=>nil },
|
@@ -270,7 +268,7 @@ describe 'basic crawler setup' do
|
|
270
268
|
{ "heading"=>"GitHub on Your Servers" },
|
271
269
|
{ "heading"=>"Loading..." }
|
272
270
|
]
|
273
|
-
}
|
271
|
+
})
|
274
272
|
end
|
275
273
|
end
|
276
274
|
|
@@ -46,7 +46,7 @@ describe Wombat::Processing::Parser do
|
|
46
46
|
@metadata.http_method :get
|
47
47
|
|
48
48
|
fake_document = double :document
|
49
|
-
fake_parser = double :
|
49
|
+
fake_parser = double(:parser, body: 'foo')
|
50
50
|
fake_header = double :header
|
51
51
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
52
52
|
fake_document.should_receive(:header).and_return(fake_header)
|
@@ -58,13 +58,13 @@ describe Wombat::Processing::Parser do
|
|
58
58
|
end
|
59
59
|
|
60
60
|
it 'should correctly parse xml documents' do
|
61
|
-
fake_document = double :
|
61
|
+
fake_document = double(:xml, body: 'foo')
|
62
62
|
fake_parser = double :parser
|
63
63
|
fake_headers = double :headers
|
64
64
|
@metadata.document_format :xml
|
65
65
|
@parser.mechanize.should_not_receive(:get)
|
66
66
|
RestClient.should_receive(:get).and_return fake_document
|
67
|
-
Nokogiri.should_receive(:XML).with(
|
67
|
+
Nokogiri.should_receive(:XML).with('foo').and_return fake_parser
|
68
68
|
fake_document.should_receive(:headers).and_return(fake_headers)
|
69
69
|
fake_parser.should_receive(:headers=)
|
70
70
|
|
@@ -7,20 +7,15 @@ describe Wombat::Property::Locators::Html do
|
|
7
7
|
fake_elem.stub inner_html: "Something cool "
|
8
8
|
context.stub(:xpath).with("/abc", nil).and_return [fake_elem]
|
9
9
|
property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :html)
|
10
|
-
|
11
10
|
locator = Wombat::Property::Locators::Html.new(property)
|
12
|
-
|
13
|
-
locator.locate(context).should == { "data1" => "Something cool" }
|
11
|
+
locator.locate(context).should eq({ "data1" => "Something cool" })
|
14
12
|
end
|
15
13
|
|
16
14
|
it 'should return null if the property cannot be found' do
|
17
|
-
fake_elem = double :element
|
18
15
|
context = double :context
|
19
16
|
context.stub(:xpath).with("/abc", nil).and_return []
|
20
17
|
property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :html)
|
21
|
-
|
22
18
|
locator = Wombat::Property::Locators::Html.new(property)
|
23
|
-
|
24
|
-
locator.locate(context).should == { "data1" => nil }
|
19
|
+
locator.locate(context).should eq({ "data1" => nil })
|
25
20
|
end
|
26
21
|
end
|
@@ -10,7 +10,7 @@ describe Wombat::Property::Locators::Text do
|
|
10
10
|
|
11
11
|
locator = Wombat::Property::Locators::Text.new(property)
|
12
12
|
|
13
|
-
locator.locate(context).should
|
13
|
+
locator.locate(context).should eq({ "data1" => "Something cool" })
|
14
14
|
end
|
15
15
|
|
16
16
|
it 'should locate text property with xpath selector using xpath functions' do
|
@@ -20,7 +20,7 @@ describe Wombat::Property::Locators::Text do
|
|
20
20
|
|
21
21
|
locator = Wombat::Property::Locators::Text.new(property)
|
22
22
|
|
23
|
-
locator.locate(context).should
|
23
|
+
locator.locate(context).should eq({ "data1" => "Something" })
|
24
24
|
end
|
25
25
|
|
26
26
|
it 'should locate text property with css selector' do
|
@@ -32,17 +32,16 @@ describe Wombat::Property::Locators::Text do
|
|
32
32
|
|
33
33
|
locator = Wombat::Property::Locators::Text.new(property)
|
34
34
|
|
35
|
-
locator.locate(context).should
|
35
|
+
locator.locate(context).should eq({ "data1" => "My name" })
|
36
36
|
end
|
37
37
|
|
38
38
|
it 'should return plain symbols as strings' do
|
39
|
-
fake_elem = double :element
|
40
39
|
context = double :context
|
41
40
|
property = Wombat::DSL::Property.new('data_2', :hardcoded_value, :text)
|
42
41
|
|
43
42
|
locator = Wombat::Property::Locators::Text.new(property)
|
44
43
|
|
45
|
-
locator.locate(context).should
|
44
|
+
locator.locate(context).should eq({ "data_2" => "hardcoded_value" })
|
46
45
|
end
|
47
46
|
|
48
47
|
it 'should invoke property callback' do
|
@@ -54,6 +53,6 @@ describe Wombat::Property::Locators::Text do
|
|
54
53
|
|
55
54
|
locator = Wombat::Property::Locators::Text.new(property)
|
56
55
|
|
57
|
-
locator.locate(context).should
|
56
|
+
locator.locate(context).should eq({ "data1" => "My ass" })
|
58
57
|
end
|
59
58
|
end
|
data/spec/sample_crawler_spec.rb
CHANGED
@@ -8,7 +8,7 @@ describe SampleCrawler do
|
|
8
8
|
|
9
9
|
it 'should correctly assign event metadata' do
|
10
10
|
@sample_crawler.should_receive(:parse) do |args|
|
11
|
-
args['event_group'].wombat_property_selector.should
|
11
|
+
args['event_group'].wombat_property_selector.should eq "css=div.title-agenda"
|
12
12
|
it = args['event_group']
|
13
13
|
expect(it["event"]["title"].wombat_property_selector).to eq("xpath=.")
|
14
14
|
expect(it["event"]["date"].wombat_property_selector).to(
|
@@ -16,8 +16,8 @@ describe SampleCrawler do
|
|
16
16
|
expect(it["event"]["type"].wombat_property_selector).to eq("xpath=.type")
|
17
17
|
expect(it["venue"]["name"].wombat_property_selector).to eq("xpath=.")
|
18
18
|
|
19
|
-
args[:base_url].should
|
20
|
-
args[:path].should
|
19
|
+
args[:base_url].should eq 'http://www.obaoba.com.br'
|
20
|
+
args[:path].should eq '/porto-alegre/agenda'
|
21
21
|
end
|
22
22
|
|
23
23
|
@sample_crawler.crawl
|
data/spec/wombat_spec.rb
CHANGED
@@ -23,9 +23,9 @@ describe Wombat do
|
|
23
23
|
config.set_user_agent "Wombat"
|
24
24
|
config.set_user_agent_alias 'Mac Safari'
|
25
25
|
end
|
26
|
-
Wombat.proxy_args.should
|
27
|
-
Wombat.user_agent.should
|
28
|
-
Wombat.user_agent_alias.should
|
26
|
+
Wombat.proxy_args.should eq ["10.0.0.1", 8080]
|
27
|
+
Wombat.user_agent.should eq 'Wombat'
|
28
|
+
Wombat.user_agent_alias.should eq 'Mac Safari'
|
29
29
|
end
|
30
30
|
|
31
31
|
it 'should accept regular properties (non-selectors)' do
|
data/wombat.gemspec
CHANGED
@@ -2,16 +2,16 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: wombat 2.
|
5
|
+
# stub: wombat 2.10.0 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "wombat".freeze
|
9
|
-
s.version = "2.
|
9
|
+
s.version = "2.10.0"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib".freeze]
|
13
13
|
s.authors = ["Felipe Lima".freeze]
|
14
|
-
s.date = "2019-
|
14
|
+
s.date = "2019-09-03"
|
15
15
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages".freeze
|
16
16
|
s.email = "felipe.lima@gmail.com".freeze
|
17
17
|
s.extra_rdoc_files = [
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.10.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Felipe Lima
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-09-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|