wombat 2.9.0 → 2.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +0 -2
- data/CHANGELOG.md +5 -0
- data/Gemfile.lock +14 -10
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/xml_with_namespace.yml +9 -17
- data/lib/wombat/processing/parser.rb +10 -2
- data/spec/dsl/property_spec.rb +5 -5
- data/spec/integration/follow_relative_links_spec.rb +1 -1
- data/spec/integration/integration_spec.rb +28 -30
- data/spec/processing/parser_spec.rb +3 -3
- data/spec/property/locators/html_spec.rb +2 -7
- data/spec/property/locators/list_spec.rb +1 -1
- data/spec/property/locators/text_spec.rb +5 -6
- data/spec/sample_crawler_spec.rb +3 -3
- data/spec/wombat_spec.rb +3 -3
- data/wombat.gemspec +3 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7114b5a602e08f1253f7af6fbdebf6c598cd4a714a0fc28d11725b2d393ca232
|
4
|
+
data.tar.gz: ed31a4692959b35534d66accd7538db66f80c320bb0cff1c1b0a1b72c63860dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: caa0ef5373883bb65ced56709a48fdf95dc0bd64053ca2f8402ec8c603cbd05bae46401b8360616b996d5e80d6473d1e407abad0004c4695c102bd41f22fbf86
|
7
|
+
data.tar.gz: 9ec8827f2989c035b6f9ca6dcd5ecb0c5905702cf140887c012ee9b39ba6211764fdb7d1118ba12ce583a0654ecd0c30b1758ea6fb5bb4e45c5b93276d78f950
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
-
activesupport (
|
4
|
+
activesupport (6.0.0)
|
5
5
|
concurrent-ruby (~> 1.0, >= 1.0.2)
|
6
6
|
i18n (>= 0.7, < 2)
|
7
7
|
minitest (~> 5.1)
|
8
8
|
tzinfo (~> 1.1)
|
9
|
+
zeitwerk (~> 2.1, >= 2.1.8)
|
9
10
|
addressable (2.4.0)
|
10
11
|
builder (3.2.3)
|
11
12
|
concurrent-ruby (1.1.5)
|
@@ -34,9 +35,10 @@ GEM
|
|
34
35
|
hashie (>= 3.4)
|
35
36
|
mime-types (>= 1.16, < 3.0)
|
36
37
|
oauth2 (~> 1.0)
|
37
|
-
hashdiff (0.
|
38
|
+
hashdiff (1.0.0)
|
38
39
|
hashie (3.6.0)
|
39
40
|
highline (2.0.2)
|
41
|
+
http-accept (1.7.0)
|
40
42
|
http-cookie (1.0.3)
|
41
43
|
domain_name (~> 0.5)
|
42
44
|
i18n (1.6.0)
|
@@ -70,10 +72,10 @@ GEM
|
|
70
72
|
multi_xml (0.6.0)
|
71
73
|
multipart-post (2.1.1)
|
72
74
|
net-http-digest_auth (1.4.1)
|
73
|
-
net-http-persistent (3.0
|
75
|
+
net-http-persistent (3.1.0)
|
74
76
|
connection_pool (~> 2.2)
|
75
77
|
netrc (0.11.0)
|
76
|
-
nokogiri (1.10.
|
78
|
+
nokogiri (1.10.4)
|
77
79
|
mini_portile2 (~> 2.4.0)
|
78
80
|
ntlm-http (0.1.1)
|
79
81
|
oauth2 (1.4.1)
|
@@ -84,9 +86,10 @@ GEM
|
|
84
86
|
rack (>= 1.2, < 3)
|
85
87
|
psych (3.1.0)
|
86
88
|
rack (2.0.7)
|
87
|
-
rake (12.3.
|
88
|
-
rdoc (6.
|
89
|
-
rest-client (2.0
|
89
|
+
rake (12.3.3)
|
90
|
+
rdoc (6.2.0)
|
91
|
+
rest-client (2.1.0)
|
92
|
+
http-accept (>= 1.7.0, < 2.0)
|
90
93
|
http-cookie (>= 1.0.2, < 2.0)
|
91
94
|
mime-types (>= 1.16, < 4.0)
|
92
95
|
netrc (~> 0.8)
|
@@ -114,19 +117,20 @@ GEM
|
|
114
117
|
tins (~> 1.0)
|
115
118
|
thor (0.20.3)
|
116
119
|
thread_safe (0.3.6)
|
117
|
-
tins (1.21.
|
120
|
+
tins (1.21.1)
|
118
121
|
tzinfo (1.2.5)
|
119
122
|
thread_safe (~> 0.1)
|
120
123
|
unf (0.1.4)
|
121
124
|
unf_ext
|
122
125
|
unf_ext (0.0.7.6)
|
123
126
|
vcr (5.0.0)
|
124
|
-
webmock (3.
|
127
|
+
webmock (3.7.1)
|
125
128
|
addressable (>= 2.3.6)
|
126
129
|
crack (>= 0.3.2)
|
127
130
|
hashdiff (>= 0.4.0, < 2.0.0)
|
128
131
|
webrobots (0.1.2)
|
129
132
|
yard (0.9.20)
|
133
|
+
zeitwerk (2.1.9)
|
130
134
|
|
131
135
|
PLATFORMS
|
132
136
|
ruby
|
@@ -145,4 +149,4 @@ DEPENDENCIES
|
|
145
149
|
yard
|
146
150
|
|
147
151
|
BUNDLED WITH
|
148
|
-
|
152
|
+
2.0.2
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.9.0
|
1
|
+
2.10.0
|
@@ -3,24 +3,16 @@ http_interactions:
|
|
3
3
|
- request:
|
4
4
|
method: get
|
5
5
|
uri: http://ws.audioscrobbler.com/2.0/?api_key=060decb474b73437d5bbec37f527ae7b&location=San%20Francisco&method=geo.getevents
|
6
|
-
body:
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
7
9
|
headers:
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
accept-charset:
|
15
|
-
- ISO-8859-1,utf-8;q=0.7,*;q=0.7
|
16
|
-
accept-language:
|
17
|
-
- en-us,en;q=0.5
|
18
|
-
host:
|
19
|
-
- ws.audioscrobbler.com
|
20
|
-
connection:
|
21
|
-
- keep-alive
|
22
|
-
keep-alive:
|
23
|
-
- 300
|
10
|
+
Accept:
|
11
|
+
- "*/*"
|
12
|
+
User-Agent:
|
13
|
+
- rest-client/2.1.0 (darwin17.3.0 x86_64) ruby/2.5.0p0
|
14
|
+
Host:
|
15
|
+
- ws.audioscrobbler.com
|
24
16
|
response:
|
25
17
|
status:
|
26
18
|
code: 200
|
@@ -49,7 +49,6 @@ module Wombat
|
|
49
49
|
private
|
50
50
|
def parser_for(metadata, url)
|
51
51
|
url ||= "#{metadata[:base_url]}#{metadata[:path]}"
|
52
|
-
page = nil
|
53
52
|
parser = nil
|
54
53
|
_method = method_from(metadata[:http_method])
|
55
54
|
data = metadata[:data]
|
@@ -64,7 +63,7 @@ module Wombat
|
|
64
63
|
parser.headers = @page.header
|
65
64
|
else
|
66
65
|
@page = RestClient.public_send(_method, *args) unless @page
|
67
|
-
parser = Nokogiri::XML
|
66
|
+
parser = Nokogiri::XML(decode_body)
|
68
67
|
parser.headers = @page.headers
|
69
68
|
end
|
70
69
|
@response_code = @page.code.to_i if @page.respond_to? :code
|
@@ -79,6 +78,15 @@ module Wombat
|
|
79
78
|
end
|
80
79
|
end
|
81
80
|
|
81
|
+
def decode_body
|
82
|
+
# Check if it's gzip encoded
|
83
|
+
if @page.body.start_with?("\x1F\x8B".b)
|
84
|
+
Zlib::GzipReader.new(StringIO.new(@page.body)).read
|
85
|
+
else
|
86
|
+
@page.body
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
82
90
|
def method_from(_method)
|
83
91
|
return :get if _method.nil?
|
84
92
|
HTTP_METHODS.detect(->{:get}){ |i| i == _method.downcase.to_sym }
|
data/spec/dsl/property_spec.rb
CHANGED
@@ -4,10 +4,10 @@ describe Wombat::DSL::Property do
|
|
4
4
|
it 'should store property data' do
|
5
5
|
callback = lambda { false }
|
6
6
|
property = Wombat::DSL::Property.new("title", *["/some/selector", :html], &callback)
|
7
|
-
|
8
|
-
property.wombat_property_name.should
|
9
|
-
property.selector.should
|
10
|
-
property.format.should
|
11
|
-
property.callback.should
|
7
|
+
|
8
|
+
property.wombat_property_name.should eq "title"
|
9
|
+
property.selector.should eq "/some/selector"
|
10
|
+
property.format.should eq :html
|
11
|
+
property.callback.should eq callback
|
12
12
|
end
|
13
13
|
end
|
@@ -22,7 +22,7 @@ describe 'following pages referred by relative links' do
|
|
22
22
|
results = crawler_instance.crawl
|
23
23
|
|
24
24
|
# There are many entries. It's enough to check first three ones
|
25
|
-
results["vocabulary"][0..2].should
|
25
|
+
results["vocabulary"][0..2].should eq [
|
26
26
|
{"entry"=>{"word"=>"Dmoz", "description"=>"Dmoz - второй по популярности каталог сайтов после Яндекс-Каталога. Адрес каталога Dmoz - .\r\n\r\nЗаметка: Как вы думаете, мебель из Китая дорого стоит? Правильно, она недорогая. поставляет не только мебель, но и китайскую сантехнику, люстры, светильники и многое другое. Если вы хотите здорово съэкономить, то не пропустите такую возможность."}},
|
27
27
|
{"entry"=>{"word"=>"PR", "description"=>"PR - PageRank - показатель Google для конкретной страницы сайта. Зависит от количества ссылок на страницу и от качества этих ссылок. Учитываются и ссылки с внутренних страниц сайта. PR влияет на выдачу в поисковой системе Google. Повысить PR сайту можно внутренней перелинковкой. PR бывает тулбарный и внутренний. Апдейт PR происходит, как правило, несколько раз в год. Сейчас у этого блога PR=2, а у сайта PR равен 3."}},
|
28
28
|
{"entry"=>{"word"=>"Sape (сапа)", "description"=>"Sape (сапа) - это самая популярная в России биржа ссылок. Адрес: www.sape.ru. Веб-мастер может продать ссылки со своего сайта, а оптимизатор купить ссылки. Продажа ссылок осуществляется с ежемесячной оплатой. Цена на ссылки устанавливается веб-мастером для своего сайта. Для продажи ссылок на сайте размещается специальный код системы и в дальнейшем вся продажа происходит автоматически через веб-интерфейс Sape.\r\n\r\nЗаметка: Интересует монтаж и эксплуатация противопожарных металлических ДПМ или ? Читайте технологическую документацию и нормативные документы."}}
|
@@ -25,10 +25,10 @@ describe 'basic crawler setup' do
|
|
25
25
|
|
26
26
|
results = crawler_instance.crawl
|
27
27
|
|
28
|
-
results["search"].should
|
29
|
-
results["links"].should
|
30
|
-
results["subheader"].should
|
31
|
-
results["social"]["twitter"].should
|
28
|
+
results["search"].should eq "Buscar"
|
29
|
+
results["links"].should eq [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
30
|
+
results["subheader"].should eq "Londres 2012"
|
31
|
+
results["social"]["twitter"].should eq "Verão"
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
@@ -56,10 +56,10 @@ describe 'basic crawler setup' do
|
|
56
56
|
|
57
57
|
results = crawler_instance.crawl
|
58
58
|
|
59
|
-
results["search"].should
|
60
|
-
results["links"].should
|
61
|
-
results["subheader"].should
|
62
|
-
results["social"]["twitter"].should
|
59
|
+
results["search"].should eq "Buscar"
|
60
|
+
results["links"].should eq [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
61
|
+
results["subheader"].should eq "Londres 2012"
|
62
|
+
results["social"]["twitter"].should eq "Verão"
|
63
63
|
end
|
64
64
|
end
|
65
65
|
|
@@ -86,10 +86,10 @@ describe 'basic crawler setup' do
|
|
86
86
|
|
87
87
|
results = crawler_instance.crawl
|
88
88
|
|
89
|
-
results["search"].should
|
90
|
-
results["links"].should
|
91
|
-
results["subheader"].should
|
92
|
-
results["social"]["twitter"].should
|
89
|
+
results["search"].should eq "Buscar"
|
90
|
+
results["links"].should eq [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
91
|
+
results["subheader"].should eq "Londres 2012"
|
92
|
+
results["social"]["twitter"].should eq "Verão"
|
93
93
|
end
|
94
94
|
end
|
95
95
|
|
@@ -112,13 +112,13 @@ describe 'basic crawler setup' do
|
|
112
112
|
results = crawler_instance.crawl
|
113
113
|
end
|
114
114
|
|
115
|
-
results["links"].should
|
115
|
+
results["links"].should eq result_hash
|
116
116
|
|
117
117
|
VCR.use_cassette('basic_crawler_page') do
|
118
118
|
results = crawler_instance.crawl
|
119
119
|
end
|
120
120
|
|
121
|
-
results["links"].should
|
121
|
+
results["links"].should eq result_hash
|
122
122
|
end
|
123
123
|
|
124
124
|
it 'should crawl page through block to class instance crawl method' do
|
@@ -145,10 +145,10 @@ describe 'basic crawler setup' do
|
|
145
145
|
end
|
146
146
|
end
|
147
147
|
|
148
|
-
results["search"].should
|
149
|
-
results["links"].should
|
150
|
-
results["subheader"].should
|
151
|
-
results["social"]["twitter"].should
|
148
|
+
results["search"].should eq "Buscar"
|
149
|
+
results["links"].should eq [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
150
|
+
results["subheader"].should eq "Londres 2012"
|
151
|
+
results["social"]["twitter"].should eq "Verão"
|
152
152
|
end
|
153
153
|
end
|
154
154
|
|
@@ -173,10 +173,10 @@ describe 'basic crawler setup' do
|
|
173
173
|
end
|
174
174
|
end
|
175
175
|
|
176
|
-
results["search"].should
|
177
|
-
results["links"].should
|
178
|
-
results["subheader"].should
|
179
|
-
results["social"]["twitter"].should
|
176
|
+
results["search"].should eq "Buscar"
|
177
|
+
results["links"].should eq [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
178
|
+
results["subheader"].should eq "Londres 2012"
|
179
|
+
results["social"]["twitter"].should eq "Verão"
|
180
180
|
end
|
181
181
|
end
|
182
182
|
|
@@ -197,14 +197,14 @@ describe 'basic crawler setup' do
|
|
197
197
|
|
198
198
|
results = crawler.new.crawl
|
199
199
|
|
200
|
-
results.should
|
200
|
+
results.should eq({ "repos" => [
|
201
201
|
{ "project" => { "repo" => "jairajs89 / Touchy.js", "description" => "A simple light-weight JavaScript library dealing with touch events" } },
|
202
202
|
{ "project" => { "repo" => "mcavage / node-restify", "description" => "node.js REST framework specifically meant web service APIs" } },
|
203
203
|
{ "project" => { "repo" => "notlion / streetview-stereographic", "description" => "Shader Toy + Google Map + Panoramic Explorer" } },
|
204
204
|
{ "project" => { "repo" => "twitter / bootstrap", "description" => "HTML, CSS, and JS toolkit from Twitter" } },
|
205
205
|
{ "project" => { "repo" => "stolksdorf / Parallaxjs", "description" => "a Library Javascript that allows easy page parallaxing" } },
|
206
206
|
{ "project" => { "repo" => nil, "description" => nil}}
|
207
|
-
]}
|
207
|
+
]})
|
208
208
|
end
|
209
209
|
end
|
210
210
|
|
@@ -216,9 +216,7 @@ describe 'basic crawler setup' do
|
|
216
216
|
crawler.document_format :xml
|
217
217
|
crawler.base_url "http://ws.audioscrobbler.com"
|
218
218
|
crawler.path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"
|
219
|
-
|
220
219
|
crawler.artist "xpath=//title", :list
|
221
|
-
|
222
220
|
crawler.location 'xpath=//event', :iterator do
|
223
221
|
latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
224
222
|
longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
@@ -228,7 +226,7 @@ describe 'basic crawler setup' do
|
|
228
226
|
results = crawler_instance.crawl
|
229
227
|
iterator = results['location']
|
230
228
|
|
231
|
-
iterator.should
|
229
|
+
iterator.should eq([
|
232
230
|
{"latitude"=>"37.807775", "longitude"=>"-122.272736"},
|
233
231
|
{"latitude"=>"37.807717", "longitude"=>"-122.270059"},
|
234
232
|
{"latitude"=>"37.869784", "longitude"=>"-122.267701"},
|
@@ -239,7 +237,7 @@ describe 'basic crawler setup' do
|
|
239
237
|
{"latitude"=>"37.771079", "longitude"=>"-122.412604"},
|
240
238
|
{"latitude"=>"37.784963", "longitude"=>"-122.418871"},
|
241
239
|
{"latitude"=>"37.788978", "longitude"=>"-122.40664"}
|
242
|
-
]
|
240
|
+
])
|
243
241
|
|
244
242
|
results["artist"].should =~ ["Davka", "Digitalism (DJ Set)", "Gary Clark Jr.", "Lenny Kravitz", "Little Muddy", "Michael Schenker Group", "The Asteroids Galaxy Tour", "When Indie Attacks", "When Indie Attacks", "YOB"]
|
245
243
|
end
|
@@ -260,7 +258,7 @@ describe 'basic crawler setup' do
|
|
260
258
|
crawler_instance = crawler.new
|
261
259
|
results = crawler_instance.crawl
|
262
260
|
|
263
|
-
results.should
|
261
|
+
results.should eq({
|
264
262
|
"github" => [
|
265
263
|
{ "heading"=>"GitHub helps people build software together." },
|
266
264
|
{ "heading"=>nil },
|
@@ -270,7 +268,7 @@ describe 'basic crawler setup' do
|
|
270
268
|
{ "heading"=>"GitHub on Your Servers" },
|
271
269
|
{ "heading"=>"Loading..." }
|
272
270
|
]
|
273
|
-
}
|
271
|
+
})
|
274
272
|
end
|
275
273
|
end
|
276
274
|
|
@@ -46,7 +46,7 @@ describe Wombat::Processing::Parser do
|
|
46
46
|
@metadata.http_method :get
|
47
47
|
|
48
48
|
fake_document = double :document
|
49
|
-
fake_parser = double :
|
49
|
+
fake_parser = double(:parser, body: 'foo')
|
50
50
|
fake_header = double :header
|
51
51
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
52
52
|
fake_document.should_receive(:header).and_return(fake_header)
|
@@ -58,13 +58,13 @@ describe Wombat::Processing::Parser do
|
|
58
58
|
end
|
59
59
|
|
60
60
|
it 'should correctly parse xml documents' do
|
61
|
-
fake_document = double :
|
61
|
+
fake_document = double(:xml, body: 'foo')
|
62
62
|
fake_parser = double :parser
|
63
63
|
fake_headers = double :headers
|
64
64
|
@metadata.document_format :xml
|
65
65
|
@parser.mechanize.should_not_receive(:get)
|
66
66
|
RestClient.should_receive(:get).and_return fake_document
|
67
|
-
Nokogiri.should_receive(:XML).with(
|
67
|
+
Nokogiri.should_receive(:XML).with('foo').and_return fake_parser
|
68
68
|
fake_document.should_receive(:headers).and_return(fake_headers)
|
69
69
|
fake_parser.should_receive(:headers=)
|
70
70
|
|
@@ -7,20 +7,15 @@ describe Wombat::Property::Locators::Html do
|
|
7
7
|
fake_elem.stub inner_html: "Something cool "
|
8
8
|
context.stub(:xpath).with("/abc", nil).and_return [fake_elem]
|
9
9
|
property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :html)
|
10
|
-
|
11
10
|
locator = Wombat::Property::Locators::Html.new(property)
|
12
|
-
|
13
|
-
locator.locate(context).should == { "data1" => "Something cool" }
|
11
|
+
locator.locate(context).should eq({ "data1" => "Something cool" })
|
14
12
|
end
|
15
13
|
|
16
14
|
it 'should return null if the property cannot be found' do
|
17
|
-
fake_elem = double :element
|
18
15
|
context = double :context
|
19
16
|
context.stub(:xpath).with("/abc", nil).and_return []
|
20
17
|
property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :html)
|
21
|
-
|
22
18
|
locator = Wombat::Property::Locators::Html.new(property)
|
23
|
-
|
24
|
-
locator.locate(context).should == { "data1" => nil }
|
19
|
+
locator.locate(context).should eq({ "data1" => nil })
|
25
20
|
end
|
26
21
|
end
|
@@ -10,7 +10,7 @@ describe Wombat::Property::Locators::Text do
|
|
10
10
|
|
11
11
|
locator = Wombat::Property::Locators::Text.new(property)
|
12
12
|
|
13
|
-
locator.locate(context).should
|
13
|
+
locator.locate(context).should eq({ "data1" => "Something cool" })
|
14
14
|
end
|
15
15
|
|
16
16
|
it 'should locate text property with xpath selector using xpath functions' do
|
@@ -20,7 +20,7 @@ describe Wombat::Property::Locators::Text do
|
|
20
20
|
|
21
21
|
locator = Wombat::Property::Locators::Text.new(property)
|
22
22
|
|
23
|
-
locator.locate(context).should
|
23
|
+
locator.locate(context).should eq({ "data1" => "Something" })
|
24
24
|
end
|
25
25
|
|
26
26
|
it 'should locate text property with css selector' do
|
@@ -32,17 +32,16 @@ describe Wombat::Property::Locators::Text do
|
|
32
32
|
|
33
33
|
locator = Wombat::Property::Locators::Text.new(property)
|
34
34
|
|
35
|
-
locator.locate(context).should
|
35
|
+
locator.locate(context).should eq({ "data1" => "My name" })
|
36
36
|
end
|
37
37
|
|
38
38
|
it 'should return plain symbols as strings' do
|
39
|
-
fake_elem = double :element
|
40
39
|
context = double :context
|
41
40
|
property = Wombat::DSL::Property.new('data_2', :hardcoded_value, :text)
|
42
41
|
|
43
42
|
locator = Wombat::Property::Locators::Text.new(property)
|
44
43
|
|
45
|
-
locator.locate(context).should
|
44
|
+
locator.locate(context).should eq({ "data_2" => "hardcoded_value" })
|
46
45
|
end
|
47
46
|
|
48
47
|
it 'should invoke property callback' do
|
@@ -54,6 +53,6 @@ describe Wombat::Property::Locators::Text do
|
|
54
53
|
|
55
54
|
locator = Wombat::Property::Locators::Text.new(property)
|
56
55
|
|
57
|
-
locator.locate(context).should
|
56
|
+
locator.locate(context).should eq({ "data1" => "My ass" })
|
58
57
|
end
|
59
58
|
end
|
data/spec/sample_crawler_spec.rb
CHANGED
@@ -8,7 +8,7 @@ describe SampleCrawler do
|
|
8
8
|
|
9
9
|
it 'should correctly assign event metadata' do
|
10
10
|
@sample_crawler.should_receive(:parse) do |args|
|
11
|
-
args['event_group'].wombat_property_selector.should
|
11
|
+
args['event_group'].wombat_property_selector.should eq "css=div.title-agenda"
|
12
12
|
it = args['event_group']
|
13
13
|
expect(it["event"]["title"].wombat_property_selector).to eq("xpath=.")
|
14
14
|
expect(it["event"]["date"].wombat_property_selector).to(
|
@@ -16,8 +16,8 @@ describe SampleCrawler do
|
|
16
16
|
expect(it["event"]["type"].wombat_property_selector).to eq("xpath=.type")
|
17
17
|
expect(it["venue"]["name"].wombat_property_selector).to eq("xpath=.")
|
18
18
|
|
19
|
-
args[:base_url].should
|
20
|
-
args[:path].should
|
19
|
+
args[:base_url].should eq 'http://www.obaoba.com.br'
|
20
|
+
args[:path].should eq '/porto-alegre/agenda'
|
21
21
|
end
|
22
22
|
|
23
23
|
@sample_crawler.crawl
|
data/spec/wombat_spec.rb
CHANGED
@@ -23,9 +23,9 @@ describe Wombat do
|
|
23
23
|
config.set_user_agent "Wombat"
|
24
24
|
config.set_user_agent_alias 'Mac Safari'
|
25
25
|
end
|
26
|
-
Wombat.proxy_args.should
|
27
|
-
Wombat.user_agent.should
|
28
|
-
Wombat.user_agent_alias.should
|
26
|
+
Wombat.proxy_args.should eq ["10.0.0.1", 8080]
|
27
|
+
Wombat.user_agent.should eq 'Wombat'
|
28
|
+
Wombat.user_agent_alias.should eq 'Mac Safari'
|
29
29
|
end
|
30
30
|
|
31
31
|
it 'should accept regular properties (non-selectors)' do
|
data/wombat.gemspec
CHANGED
@@ -2,16 +2,16 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: wombat 2.9.0 ruby lib
|
5
|
+
# stub: wombat 2.10.0 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "wombat".freeze
|
9
|
-
s.version = "2.9.0"
|
9
|
+
s.version = "2.10.0"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib".freeze]
|
13
13
|
s.authors = ["Felipe Lima".freeze]
|
14
|
-
s.date = "2019-
|
14
|
+
s.date = "2019-09-03"
|
15
15
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages".freeze
|
16
16
|
s.email = "felipe.lima@gmail.com".freeze
|
17
17
|
s.extra_rdoc_files = [
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.9.0
|
4
|
+
version: 2.10.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Felipe Lima
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-09-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|