wombat 2.2.1 → 2.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -18,7 +18,14 @@ module Wombat
18
18
  attr_accessor :mechanize, :context, :response_code, :page
19
19
 
20
20
  def initialize
21
- @mechanize = Mechanize.new
21
+ # http://stackoverflow.com/questions/6918277/ruby-mechanize-web-scraper-library-returns-file-instead-of-page
22
+ @mechanize = Mechanize.new { |a|
23
+ a.post_connect_hooks << lambda { |_,_,response,_|
24
+ if response.content_type.nil? || response.content_type.empty?
25
+ response.content_type = 'text/html'
26
+ end
27
+ }
28
+ }
22
29
  @mechanize.set_proxy(*Wombat.proxy_args) if Wombat.proxy_args
23
30
  end
24
31
 
@@ -7,14 +7,40 @@ module Wombat
7
7
  def locate(context, page = nil)
8
8
  super do
9
9
  locate_nodes(context).flat_map do |node|
10
- target_page = page.click node
11
- context = target_page.parser
10
+ retried = false
11
+ begin
12
+ # Certain erroneous pages contain http
13
+ # links with relative href attribute,
14
+ # while browsers actually use them as
15
+ # absolute.
16
+ # So, let wombat try that approach when
17
+ # loading relative link fails.
18
+ #
19
+ target_page = page.click node
20
+ context = target_page.parser
12
21
 
13
- filter_properties(context, page)
22
+ filter_properties(context, page)
23
+ rescue Mechanize::ResponseCodeError => e
24
+ # Either the page is unavailable, or
25
+ # the link is mistakenly relative
26
+ #
27
+ raise e if retried
28
+
29
+ # Give it a try first time
30
+ href = node.attributes && node.attributes["href"]
31
+ if href.respond_to? :value
32
+ href.value = '/' + href.value unless
33
+ href.value.start_with? '/'
34
+ retried = true
35
+ retry
36
+ else
37
+ raise e
38
+ end
39
+ end
14
40
  end
15
41
  end
16
42
  end
17
43
  end
18
44
  end
19
45
  end
20
- end
46
+ end
@@ -13,7 +13,7 @@ describe Wombat::Crawler do
13
13
 
14
14
  @crawler.event { event_called = true }
15
15
 
16
- event_called.should be_true
16
+ event_called.should eq(true)
17
17
  end
18
18
 
19
19
  it 'should provide metadata to yielded block' do
@@ -30,7 +30,7 @@ describe Wombat::Crawler do
30
30
  e.time Time.now
31
31
  end
32
32
 
33
- @crawler.venue do |v|
33
+ @crawler.venue do |v|
34
34
  v.name "Scooba"
35
35
  end
36
36
 
@@ -2,11 +2,12 @@ require 'spec_helper'
2
2
 
3
3
  describe Wombat::DSL::Property do
4
4
  it 'should store property data' do
5
- property = Wombat::DSL::Property.new("title", *["/some/selector", :html]) { false }
5
+ callback = lambda { false }
6
+ property = Wombat::DSL::Property.new("title", *["/some/selector", :html], &callback)
6
7
 
7
8
  property.wombat_property_name.should == "title"
8
9
  property.selector.should == "/some/selector"
9
10
  property.format.should == :html
10
- property.callback.should == lambda { false }
11
+ property.callback.should == callback
11
12
  end
12
- end
13
+ end
@@ -0,0 +1,33 @@
1
+ # coding: utf-8
2
+ require 'spec_helper'
3
+
4
+ describe 'following pages referred by relative links' do
5
+ it 'should follow relative links' do
6
+ VCR.use_cassette('follow_relative_links') do
7
+ crawler = Class.new
8
+ crawler.send(:include, Wombat::Crawler)
9
+
10
+ crawler.base_url "http://liteproblog.ru/"
11
+ crawler.path '/vocabulary'
12
+
13
+ crawler.vocabulary 'css=.postcontent ul li a', :follow do
14
+ entry do
15
+ word 'css=.post p strong', :text
16
+ description 'css=.post p'
17
+ end
18
+ end
19
+
20
+ crawler_instance = crawler.new
21
+
22
+ results = crawler_instance.crawl
23
+
24
+ # There are many entries. It's enough to check first three ones
25
+ results["vocabulary"][0..2].should == [
26
+ {"entry"=>{"word"=>"Dmoz", "description"=>"Dmoz - второй по популярности каталог сайтов после Яндекс-Каталога. Адрес каталога Dmoz - .\r\n\r\nЗаметка: Как вы думаете, мебель из Китая дорого стоит? Правильно, она недорогая. поставляет не только мебель, но и китайскую сантехнику, люстры, светильники и многое другое. Если вы хотите здорово съэкономить, то не пропустите такую возможность."}},
27
+ {"entry"=>{"word"=>"PR", "description"=>"PR - PageRank - показатель Google для конкретной страницы сайта. Зависит от количества ссылок на страницу и от качества этих ссылок. Учитываются и ссылки с внутренних страниц сайта. PR влияет на выдачу в поисковой системе Google. Повысить PR сайту можно внутренней перелинковкой. PR бывает тулбарный и внутренний. Апдейт PR происходит, как правило, несколько раз в год. Сейчас у этого блога PR=2, а у сайта PR равен 3."}},
28
+ {"entry"=>{"word"=>"Sape (сапа)", "description"=>"Sape (сапа) - это самая популярная в России биржа ссылок. Адрес: www.sape.ru. Веб-мастер может продать ссылки со своего сайта, а оптимизатор купить ссылки. Продажа ссылок осуществляется с ежемесячной оплатой. Цена на ссылки устанавливается веб-мастером для своего сайта. Для продажи ссылок на сайте размещается специальный код системы и в дальнейшем вся продажа происходит автоматически через веб-интерфейс Sape.\r\n\r\nЗаметка: Интересует монтаж и эксплуатация противопожарных металлических ДПМ или ? Читайте технологическую документацию и нормативные документы."}}
29
+ ]
30
+ end
31
+ end
32
+
33
+ end
@@ -1,6 +1,5 @@
1
1
  require 'wombat'
2
2
  require 'rspec'
3
- require 'rspec-expectations'
4
3
  require 'vcr'
5
4
  require 'coveralls'
6
5
 
@@ -2,14 +2,16 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
+ # stub: wombat 2.3.0 ruby lib
5
6
 
6
7
  Gem::Specification.new do |s|
7
8
  s.name = "wombat"
8
- s.version = "2.2.1"
9
+ s.version = "2.3.0"
9
10
 
10
11
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
+ s.require_paths = ["lib"]
11
13
  s.authors = ["Felipe Lima"]
12
- s.date = "2013-10-20"
14
+ s.date = "2014-12-02"
13
15
  s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
14
16
  s.email = "felipe.lima@gmail.com"
15
17
  s.extra_rdoc_files = [
@@ -36,6 +38,7 @@ Gem::Specification.new do |s|
36
38
  "fixtures/vcr_cassettes/broken_selector.yml",
37
39
  "fixtures/vcr_cassettes/error_page.yml",
38
40
  "fixtures/vcr_cassettes/follow_links.yml",
41
+ "fixtures/vcr_cassettes/follow_relative_links.yml",
39
42
  "fixtures/vcr_cassettes/for_each_page.yml",
40
43
  "fixtures/vcr_cassettes/headers_selector.yml",
41
44
  "fixtures/vcr_cassettes/xml_with_namespace.yml",
@@ -61,6 +64,7 @@ Gem::Specification.new do |s|
61
64
  "spec/crawler_spec.rb",
62
65
  "spec/dsl/property_spec.rb",
63
66
  "spec/helpers/sample_crawler.rb",
67
+ "spec/integration/follow_relative_links_spec.rb",
64
68
  "spec/integration/integration_spec.rb",
65
69
  "spec/processing/parser_spec.rb",
66
70
  "spec/property/locators/factory_spec.rb",
@@ -77,13 +81,12 @@ Gem::Specification.new do |s|
77
81
  ]
78
82
  s.homepage = "http://felipecsl.github.com/wombat"
79
83
  s.licenses = ["MIT"]
80
- s.require_paths = ["lib"]
81
84
  s.required_ruby_version = Gem::Requirement.new(">= 1.9")
82
- s.rubygems_version = "1.8.24"
85
+ s.rubygems_version = "2.2.2"
83
86
  s.summary = "Ruby DSL to scrape web pages"
84
87
 
85
88
  if s.respond_to? :specification_version then
86
- s.specification_version = 3
89
+ s.specification_version = 4
87
90
 
88
91
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
89
92
  s.add_runtime_dependency(%q<mechanize>, [">= 0"])
metadata CHANGED
@@ -1,190 +1,167 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wombat
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.1
5
- prerelease:
4
+ version: 2.3.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Felipe Lima
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-10-20 00:00:00.000000000 Z
11
+ date: 2014-12-02 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: mechanize
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ! '>='
17
+ - - ">="
20
18
  - !ruby/object:Gem::Version
21
19
  version: '0'
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - ! '>='
24
+ - - ">="
28
25
  - !ruby/object:Gem::Version
29
26
  version: '0'
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: activesupport
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ! '>='
31
+ - - ">="
36
32
  - !ruby/object:Gem::Version
37
33
  version: '0'
38
34
  type: :runtime
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
- - - ! '>='
38
+ - - ">="
44
39
  - !ruby/object:Gem::Version
45
40
  version: '0'
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: rest-client
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
- - - ! '>='
45
+ - - ">="
52
46
  - !ruby/object:Gem::Version
53
47
  version: '0'
54
48
  type: :runtime
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
- - - ! '>='
52
+ - - ">="
60
53
  - !ruby/object:Gem::Version
61
54
  version: '0'
62
55
  - !ruby/object:Gem::Dependency
63
56
  name: bundler
64
57
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
58
  requirements:
67
- - - ! '>='
59
+ - - ">="
68
60
  - !ruby/object:Gem::Version
69
61
  version: '0'
70
62
  type: :development
71
63
  prerelease: false
72
64
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
65
  requirements:
75
- - - ! '>='
66
+ - - ">="
76
67
  - !ruby/object:Gem::Version
77
68
  version: '0'
78
69
  - !ruby/object:Gem::Dependency
79
70
  name: rake
80
71
  requirement: !ruby/object:Gem::Requirement
81
- none: false
82
72
  requirements:
83
- - - ! '>='
73
+ - - ">="
84
74
  - !ruby/object:Gem::Version
85
75
  version: '0'
86
76
  type: :development
87
77
  prerelease: false
88
78
  version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
79
  requirements:
91
- - - ! '>='
80
+ - - ">="
92
81
  - !ruby/object:Gem::Version
93
82
  version: '0'
94
83
  - !ruby/object:Gem::Dependency
95
84
  name: yard
96
85
  requirement: !ruby/object:Gem::Requirement
97
- none: false
98
86
  requirements:
99
- - - ! '>='
87
+ - - ">="
100
88
  - !ruby/object:Gem::Version
101
89
  version: '0'
102
90
  type: :development
103
91
  prerelease: false
104
92
  version_requirements: !ruby/object:Gem::Requirement
105
- none: false
106
93
  requirements:
107
- - - ! '>='
94
+ - - ">="
108
95
  - !ruby/object:Gem::Version
109
96
  version: '0'
110
97
  - !ruby/object:Gem::Dependency
111
98
  name: jeweler
112
99
  requirement: !ruby/object:Gem::Requirement
113
- none: false
114
100
  requirements:
115
- - - ! '>='
101
+ - - ">="
116
102
  - !ruby/object:Gem::Version
117
103
  version: '0'
118
104
  type: :development
119
105
  prerelease: false
120
106
  version_requirements: !ruby/object:Gem::Requirement
121
- none: false
122
107
  requirements:
123
- - - ! '>='
108
+ - - ">="
124
109
  - !ruby/object:Gem::Version
125
110
  version: '0'
126
111
  - !ruby/object:Gem::Dependency
127
112
  name: rspec
128
113
  requirement: !ruby/object:Gem::Requirement
129
- none: false
130
114
  requirements:
131
- - - ! '>='
115
+ - - ">="
132
116
  - !ruby/object:Gem::Version
133
117
  version: '0'
134
118
  type: :development
135
119
  prerelease: false
136
120
  version_requirements: !ruby/object:Gem::Requirement
137
- none: false
138
121
  requirements:
139
- - - ! '>='
122
+ - - ">="
140
123
  - !ruby/object:Gem::Version
141
124
  version: '0'
142
125
  - !ruby/object:Gem::Dependency
143
126
  name: vcr
144
127
  requirement: !ruby/object:Gem::Requirement
145
- none: false
146
128
  requirements:
147
- - - ! '>='
129
+ - - ">="
148
130
  - !ruby/object:Gem::Version
149
131
  version: '0'
150
132
  type: :development
151
133
  prerelease: false
152
134
  version_requirements: !ruby/object:Gem::Requirement
153
- none: false
154
135
  requirements:
155
- - - ! '>='
136
+ - - ">="
156
137
  - !ruby/object:Gem::Version
157
138
  version: '0'
158
139
  - !ruby/object:Gem::Dependency
159
140
  name: fakeweb
160
141
  requirement: !ruby/object:Gem::Requirement
161
- none: false
162
142
  requirements:
163
- - - ! '>='
143
+ - - ">="
164
144
  - !ruby/object:Gem::Version
165
145
  version: '0'
166
146
  type: :development
167
147
  prerelease: false
168
148
  version_requirements: !ruby/object:Gem::Requirement
169
- none: false
170
149
  requirements:
171
- - - ! '>='
150
+ - - ">="
172
151
  - !ruby/object:Gem::Version
173
152
  version: '0'
174
153
  - !ruby/object:Gem::Dependency
175
154
  name: coveralls
176
155
  requirement: !ruby/object:Gem::Requirement
177
- none: false
178
156
  requirements:
179
- - - ! '>='
157
+ - - ">="
180
158
  - !ruby/object:Gem::Version
181
159
  version: '0'
182
160
  type: :development
183
161
  prerelease: false
184
162
  version_requirements: !ruby/object:Gem::Requirement
185
- none: false
186
163
  requirements:
187
- - - ! '>='
164
+ - - ">="
188
165
  - !ruby/object:Gem::Version
189
166
  version: '0'
190
167
  description: Generic Web crawler with a DSL that parses structured data from web pages
@@ -195,9 +172,9 @@ extra_rdoc_files:
195
172
  - LICENSE.txt
196
173
  - README.md
197
174
  files:
198
- - .document
199
- - .rspec
200
- - .travis.yml
175
+ - ".document"
176
+ - ".rspec"
177
+ - ".travis.yml"
201
178
  - Gemfile
202
179
  - Gemfile.lock
203
180
  - Guardfile
@@ -214,6 +191,7 @@ files:
214
191
  - fixtures/vcr_cassettes/broken_selector.yml
215
192
  - fixtures/vcr_cassettes/error_page.yml
216
193
  - fixtures/vcr_cassettes/follow_links.yml
194
+ - fixtures/vcr_cassettes/follow_relative_links.yml
217
195
  - fixtures/vcr_cassettes/for_each_page.yml
218
196
  - fixtures/vcr_cassettes/headers_selector.yml
219
197
  - fixtures/vcr_cassettes/xml_with_namespace.yml
@@ -239,6 +217,7 @@ files:
239
217
  - spec/crawler_spec.rb
240
218
  - spec/dsl/property_spec.rb
241
219
  - spec/helpers/sample_crawler.rb
220
+ - spec/integration/follow_relative_links_spec.rb
242
221
  - spec/integration/integration_spec.rb
243
222
  - spec/processing/parser_spec.rb
244
223
  - spec/property/locators/factory_spec.rb
@@ -255,26 +234,25 @@ files:
255
234
  homepage: http://felipecsl.github.com/wombat
256
235
  licenses:
257
236
  - MIT
237
+ metadata: {}
258
238
  post_install_message:
259
239
  rdoc_options: []
260
240
  require_paths:
261
241
  - lib
262
242
  required_ruby_version: !ruby/object:Gem::Requirement
263
- none: false
264
243
  requirements:
265
- - - ! '>='
244
+ - - ">="
266
245
  - !ruby/object:Gem::Version
267
246
  version: '1.9'
268
247
  required_rubygems_version: !ruby/object:Gem::Requirement
269
- none: false
270
248
  requirements:
271
- - - ! '>='
249
+ - - ">="
272
250
  - !ruby/object:Gem::Version
273
251
  version: '0'
274
252
  requirements: []
275
253
  rubyforge_project:
276
- rubygems_version: 1.8.24
254
+ rubygems_version: 2.2.2
277
255
  signing_key:
278
- specification_version: 3
256
+ specification_version: 4
279
257
  summary: Ruby DSL to scrape web pages
280
258
  test_files: []