wombat 2.2.1 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +2 -1
- data/Gemfile.lock +73 -66
- data/README.md +17 -16
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/follow_relative_links.yml +15839 -0
- data/lib/wombat/processing/parser.rb +8 -1
- data/lib/wombat/property/locators/follow.rb +30 -4
- data/spec/crawler_spec.rb +2 -2
- data/spec/dsl/property_spec.rb +4 -3
- data/spec/integration/follow_relative_links_spec.rb +33 -0
- data/spec/spec_helper.rb +0 -1
- data/wombat.gemspec +8 -5
- metadata +34 -56
@@ -18,7 +18,14 @@ module Wombat
|
|
18
18
|
attr_accessor :mechanize, :context, :response_code, :page
|
19
19
|
|
20
20
|
def initialize
|
21
|
-
|
21
|
+
# http://stackoverflow.com/questions/6918277/ruby-mechanize-web-scraper-library-returns-file-instead-of-page
|
22
|
+
@mechanize = Mechanize.new { |a|
|
23
|
+
a.post_connect_hooks << lambda { |_,_,response,_|
|
24
|
+
if response.content_type.nil? || response.content_type.empty?
|
25
|
+
response.content_type = 'text/html'
|
26
|
+
end
|
27
|
+
}
|
28
|
+
}
|
22
29
|
@mechanize.set_proxy(*Wombat.proxy_args) if Wombat.proxy_args
|
23
30
|
end
|
24
31
|
|
@@ -7,14 +7,40 @@ module Wombat
|
|
7
7
|
def locate(context, page = nil)
|
8
8
|
super do
|
9
9
|
locate_nodes(context).flat_map do |node|
|
10
|
-
|
11
|
-
|
10
|
+
retried = false
|
11
|
+
begin
|
12
|
+
# Certain erroneous pages contain http
|
13
|
+
# links with relative href attribute,
|
14
|
+
# while browsers actually use them as
|
15
|
+
# absolute.
|
16
|
+
# So, let wombat try that approach when
|
17
|
+
# loading relative link fails.
|
18
|
+
#
|
19
|
+
target_page = page.click node
|
20
|
+
context = target_page.parser
|
12
21
|
|
13
|
-
|
22
|
+
filter_properties(context, page)
|
23
|
+
rescue Mechanize::ResponseCodeError => e
|
24
|
+
# Either the page is unavailable, or
|
25
|
+
# the link is mistakenly relative
|
26
|
+
#
|
27
|
+
raise e if retried
|
28
|
+
|
29
|
+
# Give it a try first time
|
30
|
+
href = node.attributes && node.attributes["href"]
|
31
|
+
if href.respond_to? :value
|
32
|
+
href.value = '/' + href.value unless
|
33
|
+
href.value.start_with? '/'
|
34
|
+
retried = true
|
35
|
+
retry
|
36
|
+
else
|
37
|
+
raise e
|
38
|
+
end
|
39
|
+
end
|
14
40
|
end
|
15
41
|
end
|
16
42
|
end
|
17
43
|
end
|
18
44
|
end
|
19
45
|
end
|
20
|
-
end
|
46
|
+
end
|
data/spec/crawler_spec.rb
CHANGED
@@ -13,7 +13,7 @@ describe Wombat::Crawler do
|
|
13
13
|
|
14
14
|
@crawler.event { event_called = true }
|
15
15
|
|
16
|
-
event_called.should
|
16
|
+
event_called.should eq(true)
|
17
17
|
end
|
18
18
|
|
19
19
|
it 'should provide metadata to yielded block' do
|
@@ -30,7 +30,7 @@ describe Wombat::Crawler do
|
|
30
30
|
e.time Time.now
|
31
31
|
end
|
32
32
|
|
33
|
-
@crawler.venue do |v|
|
33
|
+
@crawler.venue do |v|
|
34
34
|
v.name "Scooba"
|
35
35
|
end
|
36
36
|
|
data/spec/dsl/property_spec.rb
CHANGED
@@ -2,11 +2,12 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe Wombat::DSL::Property do
|
4
4
|
it 'should store property data' do
|
5
|
-
|
5
|
+
callback = lambda { false }
|
6
|
+
property = Wombat::DSL::Property.new("title", *["/some/selector", :html], &callback)
|
6
7
|
|
7
8
|
property.wombat_property_name.should == "title"
|
8
9
|
property.selector.should == "/some/selector"
|
9
10
|
property.format.should == :html
|
10
|
-
property.callback.should ==
|
11
|
+
property.callback.should == callback
|
11
12
|
end
|
12
|
-
end
|
13
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe 'following pages referred by relative links' do
|
5
|
+
it 'should follow relative links' do
|
6
|
+
VCR.use_cassette('follow_relative_links') do
|
7
|
+
crawler = Class.new
|
8
|
+
crawler.send(:include, Wombat::Crawler)
|
9
|
+
|
10
|
+
crawler.base_url "http://liteproblog.ru/"
|
11
|
+
crawler.path '/vocabulary'
|
12
|
+
|
13
|
+
crawler.vocabulary 'css=.postcontent ul li a', :follow do
|
14
|
+
entry do
|
15
|
+
word 'css=.post p strong', :text
|
16
|
+
description 'css=.post p'
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
crawler_instance = crawler.new
|
21
|
+
|
22
|
+
results = crawler_instance.crawl
|
23
|
+
|
24
|
+
# There are many entries. It's enough to check first three ones
|
25
|
+
results["vocabulary"][0..2].should == [
|
26
|
+
{"entry"=>{"word"=>"Dmoz", "description"=>"Dmoz - второй по популярности каталог сайтов после Яндекс-Каталога. Адрес каталога Dmoz - .\r\n\r\nЗаметка: Как вы думаете, мебель из Китая дорого стоит? Правильно, она недорогая. поставляет не только мебель, но и китайскую сантехнику, люстры, светильники и многое другое. Если вы хотите здорово съэкономить, то не пропустите такую возможность."}},
|
27
|
+
{"entry"=>{"word"=>"PR", "description"=>"PR - PageRank - показатель Google для конкретной страницы сайта. Зависит от количества ссылок на страницу и от качества этих ссылок. Учитываются и ссылки с внутренних страниц сайта. PR влияет на выдачу в поисковой системе Google. Повысить PR сайту можно внутренней перелинковкой. PR бывает тулбарный и внутренний. Апдейт PR происходит, как правило, несколько раз в год. Сейчас у этого блога PR=2, а у сайта PR равен 3."}},
|
28
|
+
{"entry"=>{"word"=>"Sape (сапа)", "description"=>"Sape (сапа) - это самая популярная в России биржа ссылок. Адрес: www.sape.ru. Веб-мастер может продать ссылки со своего сайта, а оптимизатор купить ссылки. Продажа ссылок осуществляется с ежемесячной оплатой. Цена на ссылки устанавливается веб-мастером для своего сайта. Для продажи ссылок на сайте размещается специальный код системы и в дальнейшем вся продажа происходит автоматически через веб-интерфейс Sape.\r\n\r\nЗаметка: Интересует монтаж и эксплуатация противопожарных металлических ДПМ или ? Читайте технологическую документацию и нормативные документы."}}
|
29
|
+
]
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
data/spec/spec_helper.rb
CHANGED
data/wombat.gemspec
CHANGED
@@ -2,14 +2,16 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
+
# stub: wombat 2.3.0 ruby lib
|
5
6
|
|
6
7
|
Gem::Specification.new do |s|
|
7
8
|
s.name = "wombat"
|
8
|
-
s.version = "2.
|
9
|
+
s.version = "2.3.0"
|
9
10
|
|
10
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
|
+
s.require_paths = ["lib"]
|
11
13
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "
|
14
|
+
s.date = "2014-12-02"
|
13
15
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
16
|
s.email = "felipe.lima@gmail.com"
|
15
17
|
s.extra_rdoc_files = [
|
@@ -36,6 +38,7 @@ Gem::Specification.new do |s|
|
|
36
38
|
"fixtures/vcr_cassettes/broken_selector.yml",
|
37
39
|
"fixtures/vcr_cassettes/error_page.yml",
|
38
40
|
"fixtures/vcr_cassettes/follow_links.yml",
|
41
|
+
"fixtures/vcr_cassettes/follow_relative_links.yml",
|
39
42
|
"fixtures/vcr_cassettes/for_each_page.yml",
|
40
43
|
"fixtures/vcr_cassettes/headers_selector.yml",
|
41
44
|
"fixtures/vcr_cassettes/xml_with_namespace.yml",
|
@@ -61,6 +64,7 @@ Gem::Specification.new do |s|
|
|
61
64
|
"spec/crawler_spec.rb",
|
62
65
|
"spec/dsl/property_spec.rb",
|
63
66
|
"spec/helpers/sample_crawler.rb",
|
67
|
+
"spec/integration/follow_relative_links_spec.rb",
|
64
68
|
"spec/integration/integration_spec.rb",
|
65
69
|
"spec/processing/parser_spec.rb",
|
66
70
|
"spec/property/locators/factory_spec.rb",
|
@@ -77,13 +81,12 @@ Gem::Specification.new do |s|
|
|
77
81
|
]
|
78
82
|
s.homepage = "http://felipecsl.github.com/wombat"
|
79
83
|
s.licenses = ["MIT"]
|
80
|
-
s.require_paths = ["lib"]
|
81
84
|
s.required_ruby_version = Gem::Requirement.new(">= 1.9")
|
82
|
-
s.rubygems_version = "
|
85
|
+
s.rubygems_version = "2.2.2"
|
83
86
|
s.summary = "Ruby DSL to scrape web pages"
|
84
87
|
|
85
88
|
if s.respond_to? :specification_version then
|
86
|
-
s.specification_version =
|
89
|
+
s.specification_version = 4
|
87
90
|
|
88
91
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
89
92
|
s.add_runtime_dependency(%q<mechanize>, [">= 0"])
|
metadata
CHANGED
@@ -1,190 +1,167 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
5
|
-
prerelease:
|
4
|
+
version: 2.3.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Felipe Lima
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2014-12-02 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: mechanize
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- -
|
17
|
+
- - ">="
|
20
18
|
- !ruby/object:Gem::Version
|
21
19
|
version: '0'
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- -
|
24
|
+
- - ">="
|
28
25
|
- !ruby/object:Gem::Version
|
29
26
|
version: '0'
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: activesupport
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- -
|
31
|
+
- - ">="
|
36
32
|
- !ruby/object:Gem::Version
|
37
33
|
version: '0'
|
38
34
|
type: :runtime
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
|
-
- -
|
38
|
+
- - ">="
|
44
39
|
- !ruby/object:Gem::Version
|
45
40
|
version: '0'
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: rest-client
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
|
-
- -
|
45
|
+
- - ">="
|
52
46
|
- !ruby/object:Gem::Version
|
53
47
|
version: '0'
|
54
48
|
type: :runtime
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
51
|
requirements:
|
59
|
-
- -
|
52
|
+
- - ">="
|
60
53
|
- !ruby/object:Gem::Version
|
61
54
|
version: '0'
|
62
55
|
- !ruby/object:Gem::Dependency
|
63
56
|
name: bundler
|
64
57
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
58
|
requirements:
|
67
|
-
- -
|
59
|
+
- - ">="
|
68
60
|
- !ruby/object:Gem::Version
|
69
61
|
version: '0'
|
70
62
|
type: :development
|
71
63
|
prerelease: false
|
72
64
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
65
|
requirements:
|
75
|
-
- -
|
66
|
+
- - ">="
|
76
67
|
- !ruby/object:Gem::Version
|
77
68
|
version: '0'
|
78
69
|
- !ruby/object:Gem::Dependency
|
79
70
|
name: rake
|
80
71
|
requirement: !ruby/object:Gem::Requirement
|
81
|
-
none: false
|
82
72
|
requirements:
|
83
|
-
- -
|
73
|
+
- - ">="
|
84
74
|
- !ruby/object:Gem::Version
|
85
75
|
version: '0'
|
86
76
|
type: :development
|
87
77
|
prerelease: false
|
88
78
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
79
|
requirements:
|
91
|
-
- -
|
80
|
+
- - ">="
|
92
81
|
- !ruby/object:Gem::Version
|
93
82
|
version: '0'
|
94
83
|
- !ruby/object:Gem::Dependency
|
95
84
|
name: yard
|
96
85
|
requirement: !ruby/object:Gem::Requirement
|
97
|
-
none: false
|
98
86
|
requirements:
|
99
|
-
- -
|
87
|
+
- - ">="
|
100
88
|
- !ruby/object:Gem::Version
|
101
89
|
version: '0'
|
102
90
|
type: :development
|
103
91
|
prerelease: false
|
104
92
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
93
|
requirements:
|
107
|
-
- -
|
94
|
+
- - ">="
|
108
95
|
- !ruby/object:Gem::Version
|
109
96
|
version: '0'
|
110
97
|
- !ruby/object:Gem::Dependency
|
111
98
|
name: jeweler
|
112
99
|
requirement: !ruby/object:Gem::Requirement
|
113
|
-
none: false
|
114
100
|
requirements:
|
115
|
-
- -
|
101
|
+
- - ">="
|
116
102
|
- !ruby/object:Gem::Version
|
117
103
|
version: '0'
|
118
104
|
type: :development
|
119
105
|
prerelease: false
|
120
106
|
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
none: false
|
122
107
|
requirements:
|
123
|
-
- -
|
108
|
+
- - ">="
|
124
109
|
- !ruby/object:Gem::Version
|
125
110
|
version: '0'
|
126
111
|
- !ruby/object:Gem::Dependency
|
127
112
|
name: rspec
|
128
113
|
requirement: !ruby/object:Gem::Requirement
|
129
|
-
none: false
|
130
114
|
requirements:
|
131
|
-
- -
|
115
|
+
- - ">="
|
132
116
|
- !ruby/object:Gem::Version
|
133
117
|
version: '0'
|
134
118
|
type: :development
|
135
119
|
prerelease: false
|
136
120
|
version_requirements: !ruby/object:Gem::Requirement
|
137
|
-
none: false
|
138
121
|
requirements:
|
139
|
-
- -
|
122
|
+
- - ">="
|
140
123
|
- !ruby/object:Gem::Version
|
141
124
|
version: '0'
|
142
125
|
- !ruby/object:Gem::Dependency
|
143
126
|
name: vcr
|
144
127
|
requirement: !ruby/object:Gem::Requirement
|
145
|
-
none: false
|
146
128
|
requirements:
|
147
|
-
- -
|
129
|
+
- - ">="
|
148
130
|
- !ruby/object:Gem::Version
|
149
131
|
version: '0'
|
150
132
|
type: :development
|
151
133
|
prerelease: false
|
152
134
|
version_requirements: !ruby/object:Gem::Requirement
|
153
|
-
none: false
|
154
135
|
requirements:
|
155
|
-
- -
|
136
|
+
- - ">="
|
156
137
|
- !ruby/object:Gem::Version
|
157
138
|
version: '0'
|
158
139
|
- !ruby/object:Gem::Dependency
|
159
140
|
name: fakeweb
|
160
141
|
requirement: !ruby/object:Gem::Requirement
|
161
|
-
none: false
|
162
142
|
requirements:
|
163
|
-
- -
|
143
|
+
- - ">="
|
164
144
|
- !ruby/object:Gem::Version
|
165
145
|
version: '0'
|
166
146
|
type: :development
|
167
147
|
prerelease: false
|
168
148
|
version_requirements: !ruby/object:Gem::Requirement
|
169
|
-
none: false
|
170
149
|
requirements:
|
171
|
-
- -
|
150
|
+
- - ">="
|
172
151
|
- !ruby/object:Gem::Version
|
173
152
|
version: '0'
|
174
153
|
- !ruby/object:Gem::Dependency
|
175
154
|
name: coveralls
|
176
155
|
requirement: !ruby/object:Gem::Requirement
|
177
|
-
none: false
|
178
156
|
requirements:
|
179
|
-
- -
|
157
|
+
- - ">="
|
180
158
|
- !ruby/object:Gem::Version
|
181
159
|
version: '0'
|
182
160
|
type: :development
|
183
161
|
prerelease: false
|
184
162
|
version_requirements: !ruby/object:Gem::Requirement
|
185
|
-
none: false
|
186
163
|
requirements:
|
187
|
-
- -
|
164
|
+
- - ">="
|
188
165
|
- !ruby/object:Gem::Version
|
189
166
|
version: '0'
|
190
167
|
description: Generic Web crawler with a DSL that parses structured data from web pages
|
@@ -195,9 +172,9 @@ extra_rdoc_files:
|
|
195
172
|
- LICENSE.txt
|
196
173
|
- README.md
|
197
174
|
files:
|
198
|
-
- .document
|
199
|
-
- .rspec
|
200
|
-
- .travis.yml
|
175
|
+
- ".document"
|
176
|
+
- ".rspec"
|
177
|
+
- ".travis.yml"
|
201
178
|
- Gemfile
|
202
179
|
- Gemfile.lock
|
203
180
|
- Guardfile
|
@@ -214,6 +191,7 @@ files:
|
|
214
191
|
- fixtures/vcr_cassettes/broken_selector.yml
|
215
192
|
- fixtures/vcr_cassettes/error_page.yml
|
216
193
|
- fixtures/vcr_cassettes/follow_links.yml
|
194
|
+
- fixtures/vcr_cassettes/follow_relative_links.yml
|
217
195
|
- fixtures/vcr_cassettes/for_each_page.yml
|
218
196
|
- fixtures/vcr_cassettes/headers_selector.yml
|
219
197
|
- fixtures/vcr_cassettes/xml_with_namespace.yml
|
@@ -239,6 +217,7 @@ files:
|
|
239
217
|
- spec/crawler_spec.rb
|
240
218
|
- spec/dsl/property_spec.rb
|
241
219
|
- spec/helpers/sample_crawler.rb
|
220
|
+
- spec/integration/follow_relative_links_spec.rb
|
242
221
|
- spec/integration/integration_spec.rb
|
243
222
|
- spec/processing/parser_spec.rb
|
244
223
|
- spec/property/locators/factory_spec.rb
|
@@ -255,26 +234,25 @@ files:
|
|
255
234
|
homepage: http://felipecsl.github.com/wombat
|
256
235
|
licenses:
|
257
236
|
- MIT
|
237
|
+
metadata: {}
|
258
238
|
post_install_message:
|
259
239
|
rdoc_options: []
|
260
240
|
require_paths:
|
261
241
|
- lib
|
262
242
|
required_ruby_version: !ruby/object:Gem::Requirement
|
263
|
-
none: false
|
264
243
|
requirements:
|
265
|
-
- -
|
244
|
+
- - ">="
|
266
245
|
- !ruby/object:Gem::Version
|
267
246
|
version: '1.9'
|
268
247
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
269
|
-
none: false
|
270
248
|
requirements:
|
271
|
-
- -
|
249
|
+
- - ">="
|
272
250
|
- !ruby/object:Gem::Version
|
273
251
|
version: '0'
|
274
252
|
requirements: []
|
275
253
|
rubyforge_project:
|
276
|
-
rubygems_version:
|
254
|
+
rubygems_version: 2.2.2
|
277
255
|
signing_key:
|
278
|
-
specification_version:
|
256
|
+
specification_version: 4
|
279
257
|
summary: Ruby DSL to scrape web pages
|
280
258
|
test_files: []
|