wombat 2.2.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,7 +18,14 @@ module Wombat
18
18
  attr_accessor :mechanize, :context, :response_code, :page
19
19
 
20
20
  def initialize
21
- @mechanize = Mechanize.new
21
+ # http://stackoverflow.com/questions/6918277/ruby-mechanize-web-scraper-library-returns-file-instead-of-page
22
+ @mechanize = Mechanize.new { |a|
23
+ a.post_connect_hooks << lambda { |_,_,response,_|
24
+ if response.content_type.nil? || response.content_type.empty?
25
+ response.content_type = 'text/html'
26
+ end
27
+ }
28
+ }
22
29
  @mechanize.set_proxy(*Wombat.proxy_args) if Wombat.proxy_args
23
30
  end
24
31
 
@@ -7,14 +7,40 @@ module Wombat
7
7
  def locate(context, page = nil)
8
8
  super do
9
9
  locate_nodes(context).flat_map do |node|
10
- target_page = page.click node
11
- context = target_page.parser
10
+ retried = false
11
+ begin
12
+ # Certain erroneous pages contain http
13
+ # links with relative href attribute,
14
+ # while browsers actually use them as
15
+ # absolute.
16
+ # So, let wombat try that approach when
17
+ # loading relative link fails.
18
+ #
19
+ target_page = page.click node
20
+ context = target_page.parser
12
21
 
13
- filter_properties(context, page)
22
+ filter_properties(context, page)
23
+ rescue Mechanize::ResponseCodeError => e
24
+ # Either the page is unavailable, or
25
+ # the link is mistakenly relative
26
+ #
27
+ raise e if retried
28
+
29
+ # Give it a try first time
30
+ href = node.attributes && node.attributes["href"]
31
+ if href.respond_to? :value
32
+ href.value = '/' + href.value unless
33
+ href.value.start_with? '/'
34
+ retried = true
35
+ retry
36
+ else
37
+ raise e
38
+ end
39
+ end
14
40
  end
15
41
  end
16
42
  end
17
43
  end
18
44
  end
19
45
  end
20
- end
46
+ end
@@ -13,7 +13,7 @@ describe Wombat::Crawler do
13
13
 
14
14
  @crawler.event { event_called = true }
15
15
 
16
- event_called.should be_true
16
+ event_called.should eq(true)
17
17
  end
18
18
 
19
19
  it 'should provide metadata to yielded block' do
@@ -30,7 +30,7 @@ describe Wombat::Crawler do
30
30
  e.time Time.now
31
31
  end
32
32
 
33
- @crawler.venue do |v|
33
+ @crawler.venue do |v|
34
34
  v.name "Scooba"
35
35
  end
36
36
 
@@ -2,11 +2,12 @@ require 'spec_helper'
2
2
 
3
3
  describe Wombat::DSL::Property do
4
4
  it 'should store property data' do
5
- property = Wombat::DSL::Property.new("title", *["/some/selector", :html]) { false }
5
+ callback = lambda { false }
6
+ property = Wombat::DSL::Property.new("title", *["/some/selector", :html], &callback)
6
7
 
7
8
  property.wombat_property_name.should == "title"
8
9
  property.selector.should == "/some/selector"
9
10
  property.format.should == :html
10
- property.callback.should == lambda { false }
11
+ property.callback.should == callback
11
12
  end
12
- end
13
+ end
@@ -0,0 +1,33 @@
1
+ # coding: utf-8
2
+ require 'spec_helper'
3
+
4
+ describe 'following pages referred by relative links' do
5
+ it 'should follow relative links' do
6
+ VCR.use_cassette('follow_relative_links') do
7
+ crawler = Class.new
8
+ crawler.send(:include, Wombat::Crawler)
9
+
10
+ crawler.base_url "http://liteproblog.ru/"
11
+ crawler.path '/vocabulary'
12
+
13
+ crawler.vocabulary 'css=.postcontent ul li a', :follow do
14
+ entry do
15
+ word 'css=.post p strong', :text
16
+ description 'css=.post p'
17
+ end
18
+ end
19
+
20
+ crawler_instance = crawler.new
21
+
22
+ results = crawler_instance.crawl
23
+
24
+ # There are many entries. It's enough to check first three ones
25
+ results["vocabulary"][0..2].should == [
26
+ {"entry"=>{"word"=>"Dmoz", "description"=>"Dmoz - второй по популярности каталог сайтов после Яндекс-Каталога. Адрес каталога Dmoz - .\r\n\r\nЗаметка: Как вы думаете, мебель из Китая дорого стоит? Правильно, она недорогая. поставляет не только мебель, но и китайскую сантехнику, люстры, светильники и многое другое. Если вы хотите здорово съэкономить, то не пропустите такую возможность."}},
27
+ {"entry"=>{"word"=>"PR", "description"=>"PR - PageRank - показатель Google для конкретной страницы сайта. Зависит от количества ссылок на страницу и от качества этих ссылок. Учитываются и ссылки с внутренних страниц сайта. PR влияет на выдачу в поисковой системе Google. Повысить PR сайту можно внутренней перелинковкой. PR бывает тулбарный и внутренний. Апдейт PR происходит, как правило, несколько раз в год. Сейчас у этого блога PR=2, а у сайта PR равен 3."}},
28
+ {"entry"=>{"word"=>"Sape (сапа)", "description"=>"Sape (сапа) - это самая популярная в России биржа ссылок. Адрес: www.sape.ru. Веб-мастер может продать ссылки со своего сайта, а оптимизатор купить ссылки. Продажа ссылок осуществляется с ежемесячной оплатой. Цена на ссылки устанавливается веб-мастером для своего сайта. Для продажи ссылок на сайте размещается специальный код системы и в дальнейшем вся продажа происходит автоматически через веб-интерфейс Sape.\r\n\r\nЗаметка: Интересует монтаж и эксплуатация противопожарных металлических ДПМ или ? Читайте технологическую документацию и нормативные документы."}}
29
+ ]
30
+ end
31
+ end
32
+
33
+ end
@@ -1,6 +1,5 @@
1
1
  require 'wombat'
2
2
  require 'rspec'
3
- require 'rspec-expectations'
4
3
  require 'vcr'
5
4
  require 'coveralls'
6
5
 
@@ -2,14 +2,16 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
+ # stub: wombat 2.3.0 ruby lib
5
6
 
6
7
  Gem::Specification.new do |s|
7
8
  s.name = "wombat"
8
- s.version = "2.2.1"
9
+ s.version = "2.3.0"
9
10
 
10
11
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
+ s.require_paths = ["lib"]
11
13
  s.authors = ["Felipe Lima"]
12
- s.date = "2013-10-20"
14
+ s.date = "2014-12-02"
13
15
  s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
14
16
  s.email = "felipe.lima@gmail.com"
15
17
  s.extra_rdoc_files = [
@@ -36,6 +38,7 @@ Gem::Specification.new do |s|
36
38
  "fixtures/vcr_cassettes/broken_selector.yml",
37
39
  "fixtures/vcr_cassettes/error_page.yml",
38
40
  "fixtures/vcr_cassettes/follow_links.yml",
41
+ "fixtures/vcr_cassettes/follow_relative_links.yml",
39
42
  "fixtures/vcr_cassettes/for_each_page.yml",
40
43
  "fixtures/vcr_cassettes/headers_selector.yml",
41
44
  "fixtures/vcr_cassettes/xml_with_namespace.yml",
@@ -61,6 +64,7 @@ Gem::Specification.new do |s|
61
64
  "spec/crawler_spec.rb",
62
65
  "spec/dsl/property_spec.rb",
63
66
  "spec/helpers/sample_crawler.rb",
67
+ "spec/integration/follow_relative_links_spec.rb",
64
68
  "spec/integration/integration_spec.rb",
65
69
  "spec/processing/parser_spec.rb",
66
70
  "spec/property/locators/factory_spec.rb",
@@ -77,13 +81,12 @@ Gem::Specification.new do |s|
77
81
  ]
78
82
  s.homepage = "http://felipecsl.github.com/wombat"
79
83
  s.licenses = ["MIT"]
80
- s.require_paths = ["lib"]
81
84
  s.required_ruby_version = Gem::Requirement.new(">= 1.9")
82
- s.rubygems_version = "1.8.24"
85
+ s.rubygems_version = "2.2.2"
83
86
  s.summary = "Ruby DSL to scrape web pages"
84
87
 
85
88
  if s.respond_to? :specification_version then
86
- s.specification_version = 3
89
+ s.specification_version = 4
87
90
 
88
91
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
89
92
  s.add_runtime_dependency(%q<mechanize>, [">= 0"])
metadata CHANGED
@@ -1,190 +1,167 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wombat
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.1
5
- prerelease:
4
+ version: 2.3.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Felipe Lima
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-10-20 00:00:00.000000000 Z
11
+ date: 2014-12-02 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: mechanize
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ! '>='
17
+ - - ">="
20
18
  - !ruby/object:Gem::Version
21
19
  version: '0'
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - ! '>='
24
+ - - ">="
28
25
  - !ruby/object:Gem::Version
29
26
  version: '0'
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: activesupport
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ! '>='
31
+ - - ">="
36
32
  - !ruby/object:Gem::Version
37
33
  version: '0'
38
34
  type: :runtime
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
- - - ! '>='
38
+ - - ">="
44
39
  - !ruby/object:Gem::Version
45
40
  version: '0'
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: rest-client
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
- - - ! '>='
45
+ - - ">="
52
46
  - !ruby/object:Gem::Version
53
47
  version: '0'
54
48
  type: :runtime
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
- - - ! '>='
52
+ - - ">="
60
53
  - !ruby/object:Gem::Version
61
54
  version: '0'
62
55
  - !ruby/object:Gem::Dependency
63
56
  name: bundler
64
57
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
58
  requirements:
67
- - - ! '>='
59
+ - - ">="
68
60
  - !ruby/object:Gem::Version
69
61
  version: '0'
70
62
  type: :development
71
63
  prerelease: false
72
64
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
65
  requirements:
75
- - - ! '>='
66
+ - - ">="
76
67
  - !ruby/object:Gem::Version
77
68
  version: '0'
78
69
  - !ruby/object:Gem::Dependency
79
70
  name: rake
80
71
  requirement: !ruby/object:Gem::Requirement
81
- none: false
82
72
  requirements:
83
- - - ! '>='
73
+ - - ">="
84
74
  - !ruby/object:Gem::Version
85
75
  version: '0'
86
76
  type: :development
87
77
  prerelease: false
88
78
  version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
79
  requirements:
91
- - - ! '>='
80
+ - - ">="
92
81
  - !ruby/object:Gem::Version
93
82
  version: '0'
94
83
  - !ruby/object:Gem::Dependency
95
84
  name: yard
96
85
  requirement: !ruby/object:Gem::Requirement
97
- none: false
98
86
  requirements:
99
- - - ! '>='
87
+ - - ">="
100
88
  - !ruby/object:Gem::Version
101
89
  version: '0'
102
90
  type: :development
103
91
  prerelease: false
104
92
  version_requirements: !ruby/object:Gem::Requirement
105
- none: false
106
93
  requirements:
107
- - - ! '>='
94
+ - - ">="
108
95
  - !ruby/object:Gem::Version
109
96
  version: '0'
110
97
  - !ruby/object:Gem::Dependency
111
98
  name: jeweler
112
99
  requirement: !ruby/object:Gem::Requirement
113
- none: false
114
100
  requirements:
115
- - - ! '>='
101
+ - - ">="
116
102
  - !ruby/object:Gem::Version
117
103
  version: '0'
118
104
  type: :development
119
105
  prerelease: false
120
106
  version_requirements: !ruby/object:Gem::Requirement
121
- none: false
122
107
  requirements:
123
- - - ! '>='
108
+ - - ">="
124
109
  - !ruby/object:Gem::Version
125
110
  version: '0'
126
111
  - !ruby/object:Gem::Dependency
127
112
  name: rspec
128
113
  requirement: !ruby/object:Gem::Requirement
129
- none: false
130
114
  requirements:
131
- - - ! '>='
115
+ - - ">="
132
116
  - !ruby/object:Gem::Version
133
117
  version: '0'
134
118
  type: :development
135
119
  prerelease: false
136
120
  version_requirements: !ruby/object:Gem::Requirement
137
- none: false
138
121
  requirements:
139
- - - ! '>='
122
+ - - ">="
140
123
  - !ruby/object:Gem::Version
141
124
  version: '0'
142
125
  - !ruby/object:Gem::Dependency
143
126
  name: vcr
144
127
  requirement: !ruby/object:Gem::Requirement
145
- none: false
146
128
  requirements:
147
- - - ! '>='
129
+ - - ">="
148
130
  - !ruby/object:Gem::Version
149
131
  version: '0'
150
132
  type: :development
151
133
  prerelease: false
152
134
  version_requirements: !ruby/object:Gem::Requirement
153
- none: false
154
135
  requirements:
155
- - - ! '>='
136
+ - - ">="
156
137
  - !ruby/object:Gem::Version
157
138
  version: '0'
158
139
  - !ruby/object:Gem::Dependency
159
140
  name: fakeweb
160
141
  requirement: !ruby/object:Gem::Requirement
161
- none: false
162
142
  requirements:
163
- - - ! '>='
143
+ - - ">="
164
144
  - !ruby/object:Gem::Version
165
145
  version: '0'
166
146
  type: :development
167
147
  prerelease: false
168
148
  version_requirements: !ruby/object:Gem::Requirement
169
- none: false
170
149
  requirements:
171
- - - ! '>='
150
+ - - ">="
172
151
  - !ruby/object:Gem::Version
173
152
  version: '0'
174
153
  - !ruby/object:Gem::Dependency
175
154
  name: coveralls
176
155
  requirement: !ruby/object:Gem::Requirement
177
- none: false
178
156
  requirements:
179
- - - ! '>='
157
+ - - ">="
180
158
  - !ruby/object:Gem::Version
181
159
  version: '0'
182
160
  type: :development
183
161
  prerelease: false
184
162
  version_requirements: !ruby/object:Gem::Requirement
185
- none: false
186
163
  requirements:
187
- - - ! '>='
164
+ - - ">="
188
165
  - !ruby/object:Gem::Version
189
166
  version: '0'
190
167
  description: Generic Web crawler with a DSL that parses structured data from web pages
@@ -195,9 +172,9 @@ extra_rdoc_files:
195
172
  - LICENSE.txt
196
173
  - README.md
197
174
  files:
198
- - .document
199
- - .rspec
200
- - .travis.yml
175
+ - ".document"
176
+ - ".rspec"
177
+ - ".travis.yml"
201
178
  - Gemfile
202
179
  - Gemfile.lock
203
180
  - Guardfile
@@ -214,6 +191,7 @@ files:
214
191
  - fixtures/vcr_cassettes/broken_selector.yml
215
192
  - fixtures/vcr_cassettes/error_page.yml
216
193
  - fixtures/vcr_cassettes/follow_links.yml
194
+ - fixtures/vcr_cassettes/follow_relative_links.yml
217
195
  - fixtures/vcr_cassettes/for_each_page.yml
218
196
  - fixtures/vcr_cassettes/headers_selector.yml
219
197
  - fixtures/vcr_cassettes/xml_with_namespace.yml
@@ -239,6 +217,7 @@ files:
239
217
  - spec/crawler_spec.rb
240
218
  - spec/dsl/property_spec.rb
241
219
  - spec/helpers/sample_crawler.rb
220
+ - spec/integration/follow_relative_links_spec.rb
242
221
  - spec/integration/integration_spec.rb
243
222
  - spec/processing/parser_spec.rb
244
223
  - spec/property/locators/factory_spec.rb
@@ -255,26 +234,25 @@ files:
255
234
  homepage: http://felipecsl.github.com/wombat
256
235
  licenses:
257
236
  - MIT
237
+ metadata: {}
258
238
  post_install_message:
259
239
  rdoc_options: []
260
240
  require_paths:
261
241
  - lib
262
242
  required_ruby_version: !ruby/object:Gem::Requirement
263
- none: false
264
243
  requirements:
265
- - - ! '>='
244
+ - - ">="
266
245
  - !ruby/object:Gem::Version
267
246
  version: '1.9'
268
247
  required_rubygems_version: !ruby/object:Gem::Requirement
269
- none: false
270
248
  requirements:
271
- - - ! '>='
249
+ - - ">="
272
250
  - !ruby/object:Gem::Version
273
251
  version: '0'
274
252
  requirements: []
275
253
  rubyforge_project:
276
- rubygems_version: 1.8.24
254
+ rubygems_version: 2.2.2
277
255
  signing_key:
278
- specification_version: 3
256
+ specification_version: 4
279
257
  summary: Ruby DSL to scrape web pages
280
258
  test_files: []