wombat 2.2.1 → 2.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +2 -1
- data/Gemfile.lock +73 -66
- data/README.md +17 -16
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/follow_relative_links.yml +15839 -0
- data/lib/wombat/processing/parser.rb +8 -1
- data/lib/wombat/property/locators/follow.rb +30 -4
- data/spec/crawler_spec.rb +2 -2
- data/spec/dsl/property_spec.rb +4 -3
- data/spec/integration/follow_relative_links_spec.rb +33 -0
- data/spec/spec_helper.rb +0 -1
- data/wombat.gemspec +8 -5
- metadata +34 -56
@@ -18,7 +18,14 @@ module Wombat
|
|
18
18
|
attr_accessor :mechanize, :context, :response_code, :page
|
19
19
|
|
20
20
|
def initialize
|
21
|
-
|
21
|
+
# http://stackoverflow.com/questions/6918277/ruby-mechanize-web-scraper-library-returns-file-instead-of-page
|
22
|
+
@mechanize = Mechanize.new { |a|
|
23
|
+
a.post_connect_hooks << lambda { |_,_,response,_|
|
24
|
+
if response.content_type.nil? || response.content_type.empty?
|
25
|
+
response.content_type = 'text/html'
|
26
|
+
end
|
27
|
+
}
|
28
|
+
}
|
22
29
|
@mechanize.set_proxy(*Wombat.proxy_args) if Wombat.proxy_args
|
23
30
|
end
|
24
31
|
|
@@ -7,14 +7,40 @@ module Wombat
|
|
7
7
|
def locate(context, page = nil)
|
8
8
|
super do
|
9
9
|
locate_nodes(context).flat_map do |node|
|
10
|
-
|
11
|
-
|
10
|
+
retried = false
|
11
|
+
begin
|
12
|
+
# Certain erroneous pages contain http
|
13
|
+
# links with relative href attribute,
|
14
|
+
# while browsers actually use them as
|
15
|
+
# absolute.
|
16
|
+
# So, let wombat try that approach when
|
17
|
+
# loading relative link fails.
|
18
|
+
#
|
19
|
+
target_page = page.click node
|
20
|
+
context = target_page.parser
|
12
21
|
|
13
|
-
|
22
|
+
filter_properties(context, page)
|
23
|
+
rescue Mechanize::ResponseCodeError => e
|
24
|
+
# Either the page is unavailable, or
|
25
|
+
# the link is mistakenly relative
|
26
|
+
#
|
27
|
+
raise e if retried
|
28
|
+
|
29
|
+
# Give it a try first time
|
30
|
+
href = node.attributes && node.attributes["href"]
|
31
|
+
if href.respond_to? :value
|
32
|
+
href.value = '/' + href.value unless
|
33
|
+
href.value.start_with? '/'
|
34
|
+
retried = true
|
35
|
+
retry
|
36
|
+
else
|
37
|
+
raise e
|
38
|
+
end
|
39
|
+
end
|
14
40
|
end
|
15
41
|
end
|
16
42
|
end
|
17
43
|
end
|
18
44
|
end
|
19
45
|
end
|
20
|
-
end
|
46
|
+
end
|
data/spec/crawler_spec.rb
CHANGED
@@ -13,7 +13,7 @@ describe Wombat::Crawler do
|
|
13
13
|
|
14
14
|
@crawler.event { event_called = true }
|
15
15
|
|
16
|
-
event_called.should be_true
|
16
|
+
event_called.should eq(true)
|
17
17
|
end
|
18
18
|
|
19
19
|
it 'should provide metadata to yielded block' do
|
@@ -30,7 +30,7 @@ describe Wombat::Crawler do
|
|
30
30
|
e.time Time.now
|
31
31
|
end
|
32
32
|
|
33
|
-
@crawler.venue do |v|
|
33
|
+
@crawler.venue do |v|
|
34
34
|
v.name "Scooba"
|
35
35
|
end
|
36
36
|
|
data/spec/dsl/property_spec.rb
CHANGED
@@ -2,11 +2,12 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe Wombat::DSL::Property do
|
4
4
|
it 'should store property data' do
|
5
|
-
|
5
|
+
callback = lambda { false }
|
6
|
+
property = Wombat::DSL::Property.new("title", *["/some/selector", :html], &callback)
|
6
7
|
|
7
8
|
property.wombat_property_name.should == "title"
|
8
9
|
property.selector.should == "/some/selector"
|
9
10
|
property.format.should == :html
|
10
|
-
property.callback.should ==
|
11
|
+
property.callback.should == callback
|
11
12
|
end
|
12
|
-
end
|
13
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe 'following pages referred by relative links' do
|
5
|
+
it 'should follow relative links' do
|
6
|
+
VCR.use_cassette('follow_relative_links') do
|
7
|
+
crawler = Class.new
|
8
|
+
crawler.send(:include, Wombat::Crawler)
|
9
|
+
|
10
|
+
crawler.base_url "http://liteproblog.ru/"
|
11
|
+
crawler.path '/vocabulary'
|
12
|
+
|
13
|
+
crawler.vocabulary 'css=.postcontent ul li a', :follow do
|
14
|
+
entry do
|
15
|
+
word 'css=.post p strong', :text
|
16
|
+
description 'css=.post p'
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
crawler_instance = crawler.new
|
21
|
+
|
22
|
+
results = crawler_instance.crawl
|
23
|
+
|
24
|
+
# There are many entries. It's enough to check first three ones
|
25
|
+
results["vocabulary"][0..2].should == [
|
26
|
+
{"entry"=>{"word"=>"Dmoz", "description"=>"Dmoz - второй по популярности каталог сайтов после Яндекс-Каталога. Адрес каталога Dmoz - .\r\n\r\nЗаметка: Как вы думаете, мебель из Китая дорого стоит? Правильно, она недорогая. поставляет не только мебель, но и китайскую сантехнику, люстры, светильники и многое другое. Если вы хотите здорово съэкономить, то не пропустите такую возможность."}},
|
27
|
+
{"entry"=>{"word"=>"PR", "description"=>"PR - PageRank - показатель Google для конкретной страницы сайта. Зависит от количества ссылок на страницу и от качества этих ссылок. Учитываются и ссылки с внутренних страниц сайта. PR влияет на выдачу в поисковой системе Google. Повысить PR сайту можно внутренней перелинковкой. PR бывает тулбарный и внутренний. Апдейт PR происходит, как правило, несколько раз в год. Сейчас у этого блога PR=2, а у сайта PR равен 3."}},
|
28
|
+
{"entry"=>{"word"=>"Sape (сапа)", "description"=>"Sape (сапа) - это самая популярная в России биржа ссылок. Адрес: www.sape.ru. Веб-мастер может продать ссылки со своего сайта, а оптимизатор купить ссылки. Продажа ссылок осуществляется с ежемесячной оплатой. Цена на ссылки устанавливается веб-мастером для своего сайта. Для продажи ссылок на сайте размещается специальный код системы и в дальнейшем вся продажа происходит автоматически через веб-интерфейс Sape.\r\n\r\nЗаметка: Интересует монтаж и эксплуатация противопожарных металлических ДПМ или ? Читайте технологическую документацию и нормативные документы."}}
|
29
|
+
]
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
data/spec/spec_helper.rb
CHANGED
data/wombat.gemspec
CHANGED
@@ -2,14 +2,16 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
+
# stub: wombat 2.3.0 ruby lib
|
5
6
|
|
6
7
|
Gem::Specification.new do |s|
|
7
8
|
s.name = "wombat"
|
8
|
-
s.version = "2.2.1"
|
9
|
+
s.version = "2.3.0"
|
9
10
|
|
10
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
|
+
s.require_paths = ["lib"]
|
11
13
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "
|
14
|
+
s.date = "2014-12-02"
|
13
15
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
16
|
s.email = "felipe.lima@gmail.com"
|
15
17
|
s.extra_rdoc_files = [
|
@@ -36,6 +38,7 @@ Gem::Specification.new do |s|
|
|
36
38
|
"fixtures/vcr_cassettes/broken_selector.yml",
|
37
39
|
"fixtures/vcr_cassettes/error_page.yml",
|
38
40
|
"fixtures/vcr_cassettes/follow_links.yml",
|
41
|
+
"fixtures/vcr_cassettes/follow_relative_links.yml",
|
39
42
|
"fixtures/vcr_cassettes/for_each_page.yml",
|
40
43
|
"fixtures/vcr_cassettes/headers_selector.yml",
|
41
44
|
"fixtures/vcr_cassettes/xml_with_namespace.yml",
|
@@ -61,6 +64,7 @@ Gem::Specification.new do |s|
|
|
61
64
|
"spec/crawler_spec.rb",
|
62
65
|
"spec/dsl/property_spec.rb",
|
63
66
|
"spec/helpers/sample_crawler.rb",
|
67
|
+
"spec/integration/follow_relative_links_spec.rb",
|
64
68
|
"spec/integration/integration_spec.rb",
|
65
69
|
"spec/processing/parser_spec.rb",
|
66
70
|
"spec/property/locators/factory_spec.rb",
|
@@ -77,13 +81,12 @@ Gem::Specification.new do |s|
|
|
77
81
|
]
|
78
82
|
s.homepage = "http://felipecsl.github.com/wombat"
|
79
83
|
s.licenses = ["MIT"]
|
80
|
-
s.require_paths = ["lib"]
|
81
84
|
s.required_ruby_version = Gem::Requirement.new(">= 1.9")
|
82
|
-
s.rubygems_version = "
|
85
|
+
s.rubygems_version = "2.2.2"
|
83
86
|
s.summary = "Ruby DSL to scrape web pages"
|
84
87
|
|
85
88
|
if s.respond_to? :specification_version then
|
86
|
-
s.specification_version =
|
89
|
+
s.specification_version = 4
|
87
90
|
|
88
91
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
89
92
|
s.add_runtime_dependency(%q<mechanize>, [">= 0"])
|
metadata
CHANGED
@@ -1,190 +1,167 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.2.1
|
5
|
-
prerelease:
|
4
|
+
version: 2.3.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Felipe Lima
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2014-12-02 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: mechanize
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- -
|
17
|
+
- - ">="
|
20
18
|
- !ruby/object:Gem::Version
|
21
19
|
version: '0'
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- -
|
24
|
+
- - ">="
|
28
25
|
- !ruby/object:Gem::Version
|
29
26
|
version: '0'
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: activesupport
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- -
|
31
|
+
- - ">="
|
36
32
|
- !ruby/object:Gem::Version
|
37
33
|
version: '0'
|
38
34
|
type: :runtime
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
|
-
- -
|
38
|
+
- - ">="
|
44
39
|
- !ruby/object:Gem::Version
|
45
40
|
version: '0'
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: rest-client
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
|
-
- -
|
45
|
+
- - ">="
|
52
46
|
- !ruby/object:Gem::Version
|
53
47
|
version: '0'
|
54
48
|
type: :runtime
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
51
|
requirements:
|
59
|
-
- -
|
52
|
+
- - ">="
|
60
53
|
- !ruby/object:Gem::Version
|
61
54
|
version: '0'
|
62
55
|
- !ruby/object:Gem::Dependency
|
63
56
|
name: bundler
|
64
57
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
58
|
requirements:
|
67
|
-
- -
|
59
|
+
- - ">="
|
68
60
|
- !ruby/object:Gem::Version
|
69
61
|
version: '0'
|
70
62
|
type: :development
|
71
63
|
prerelease: false
|
72
64
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
65
|
requirements:
|
75
|
-
- -
|
66
|
+
- - ">="
|
76
67
|
- !ruby/object:Gem::Version
|
77
68
|
version: '0'
|
78
69
|
- !ruby/object:Gem::Dependency
|
79
70
|
name: rake
|
80
71
|
requirement: !ruby/object:Gem::Requirement
|
81
|
-
none: false
|
82
72
|
requirements:
|
83
|
-
- -
|
73
|
+
- - ">="
|
84
74
|
- !ruby/object:Gem::Version
|
85
75
|
version: '0'
|
86
76
|
type: :development
|
87
77
|
prerelease: false
|
88
78
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
79
|
requirements:
|
91
|
-
- -
|
80
|
+
- - ">="
|
92
81
|
- !ruby/object:Gem::Version
|
93
82
|
version: '0'
|
94
83
|
- !ruby/object:Gem::Dependency
|
95
84
|
name: yard
|
96
85
|
requirement: !ruby/object:Gem::Requirement
|
97
|
-
none: false
|
98
86
|
requirements:
|
99
|
-
- -
|
87
|
+
- - ">="
|
100
88
|
- !ruby/object:Gem::Version
|
101
89
|
version: '0'
|
102
90
|
type: :development
|
103
91
|
prerelease: false
|
104
92
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
93
|
requirements:
|
107
|
-
- -
|
94
|
+
- - ">="
|
108
95
|
- !ruby/object:Gem::Version
|
109
96
|
version: '0'
|
110
97
|
- !ruby/object:Gem::Dependency
|
111
98
|
name: jeweler
|
112
99
|
requirement: !ruby/object:Gem::Requirement
|
113
|
-
none: false
|
114
100
|
requirements:
|
115
|
-
- -
|
101
|
+
- - ">="
|
116
102
|
- !ruby/object:Gem::Version
|
117
103
|
version: '0'
|
118
104
|
type: :development
|
119
105
|
prerelease: false
|
120
106
|
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
none: false
|
122
107
|
requirements:
|
123
|
-
- -
|
108
|
+
- - ">="
|
124
109
|
- !ruby/object:Gem::Version
|
125
110
|
version: '0'
|
126
111
|
- !ruby/object:Gem::Dependency
|
127
112
|
name: rspec
|
128
113
|
requirement: !ruby/object:Gem::Requirement
|
129
|
-
none: false
|
130
114
|
requirements:
|
131
|
-
- -
|
115
|
+
- - ">="
|
132
116
|
- !ruby/object:Gem::Version
|
133
117
|
version: '0'
|
134
118
|
type: :development
|
135
119
|
prerelease: false
|
136
120
|
version_requirements: !ruby/object:Gem::Requirement
|
137
|
-
none: false
|
138
121
|
requirements:
|
139
|
-
- -
|
122
|
+
- - ">="
|
140
123
|
- !ruby/object:Gem::Version
|
141
124
|
version: '0'
|
142
125
|
- !ruby/object:Gem::Dependency
|
143
126
|
name: vcr
|
144
127
|
requirement: !ruby/object:Gem::Requirement
|
145
|
-
none: false
|
146
128
|
requirements:
|
147
|
-
- -
|
129
|
+
- - ">="
|
148
130
|
- !ruby/object:Gem::Version
|
149
131
|
version: '0'
|
150
132
|
type: :development
|
151
133
|
prerelease: false
|
152
134
|
version_requirements: !ruby/object:Gem::Requirement
|
153
|
-
none: false
|
154
135
|
requirements:
|
155
|
-
- -
|
136
|
+
- - ">="
|
156
137
|
- !ruby/object:Gem::Version
|
157
138
|
version: '0'
|
158
139
|
- !ruby/object:Gem::Dependency
|
159
140
|
name: fakeweb
|
160
141
|
requirement: !ruby/object:Gem::Requirement
|
161
|
-
none: false
|
162
142
|
requirements:
|
163
|
-
- -
|
143
|
+
- - ">="
|
164
144
|
- !ruby/object:Gem::Version
|
165
145
|
version: '0'
|
166
146
|
type: :development
|
167
147
|
prerelease: false
|
168
148
|
version_requirements: !ruby/object:Gem::Requirement
|
169
|
-
none: false
|
170
149
|
requirements:
|
171
|
-
- -
|
150
|
+
- - ">="
|
172
151
|
- !ruby/object:Gem::Version
|
173
152
|
version: '0'
|
174
153
|
- !ruby/object:Gem::Dependency
|
175
154
|
name: coveralls
|
176
155
|
requirement: !ruby/object:Gem::Requirement
|
177
|
-
none: false
|
178
156
|
requirements:
|
179
|
-
- -
|
157
|
+
- - ">="
|
180
158
|
- !ruby/object:Gem::Version
|
181
159
|
version: '0'
|
182
160
|
type: :development
|
183
161
|
prerelease: false
|
184
162
|
version_requirements: !ruby/object:Gem::Requirement
|
185
|
-
none: false
|
186
163
|
requirements:
|
187
|
-
- -
|
164
|
+
- - ">="
|
188
165
|
- !ruby/object:Gem::Version
|
189
166
|
version: '0'
|
190
167
|
description: Generic Web crawler with a DSL that parses structured data from web pages
|
@@ -195,9 +172,9 @@ extra_rdoc_files:
|
|
195
172
|
- LICENSE.txt
|
196
173
|
- README.md
|
197
174
|
files:
|
198
|
-
- .document
|
199
|
-
- .rspec
|
200
|
-
- .travis.yml
|
175
|
+
- ".document"
|
176
|
+
- ".rspec"
|
177
|
+
- ".travis.yml"
|
201
178
|
- Gemfile
|
202
179
|
- Gemfile.lock
|
203
180
|
- Guardfile
|
@@ -214,6 +191,7 @@ files:
|
|
214
191
|
- fixtures/vcr_cassettes/broken_selector.yml
|
215
192
|
- fixtures/vcr_cassettes/error_page.yml
|
216
193
|
- fixtures/vcr_cassettes/follow_links.yml
|
194
|
+
- fixtures/vcr_cassettes/follow_relative_links.yml
|
217
195
|
- fixtures/vcr_cassettes/for_each_page.yml
|
218
196
|
- fixtures/vcr_cassettes/headers_selector.yml
|
219
197
|
- fixtures/vcr_cassettes/xml_with_namespace.yml
|
@@ -239,6 +217,7 @@ files:
|
|
239
217
|
- spec/crawler_spec.rb
|
240
218
|
- spec/dsl/property_spec.rb
|
241
219
|
- spec/helpers/sample_crawler.rb
|
220
|
+
- spec/integration/follow_relative_links_spec.rb
|
242
221
|
- spec/integration/integration_spec.rb
|
243
222
|
- spec/processing/parser_spec.rb
|
244
223
|
- spec/property/locators/factory_spec.rb
|
@@ -255,26 +234,25 @@ files:
|
|
255
234
|
homepage: http://felipecsl.github.com/wombat
|
256
235
|
licenses:
|
257
236
|
- MIT
|
237
|
+
metadata: {}
|
258
238
|
post_install_message:
|
259
239
|
rdoc_options: []
|
260
240
|
require_paths:
|
261
241
|
- lib
|
262
242
|
required_ruby_version: !ruby/object:Gem::Requirement
|
263
|
-
none: false
|
264
243
|
requirements:
|
265
|
-
- -
|
244
|
+
- - ">="
|
266
245
|
- !ruby/object:Gem::Version
|
267
246
|
version: '1.9'
|
268
247
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
269
|
-
none: false
|
270
248
|
requirements:
|
271
|
-
- -
|
249
|
+
- - ">="
|
272
250
|
- !ruby/object:Gem::Version
|
273
251
|
version: '0'
|
274
252
|
requirements: []
|
275
253
|
rubyforge_project:
|
276
|
-
rubygems_version:
|
254
|
+
rubygems_version: 2.2.2
|
277
255
|
signing_key:
|
278
|
-
specification_version:
|
256
|
+
specification_version: 4
|
279
257
|
summary: Ruby DSL to scrape web pages
|
280
258
|
test_files: []
|