ruby-readability 0.5.2 → 0.5.3

Sign up to get free protection for your applications and to get access to all the features.
data/.rspec CHANGED
@@ -1,3 +1,2 @@
1
1
  --colour
2
2
  --format s -c
3
- --debugger
data/README.markdown CHANGED
@@ -30,7 +30,8 @@ You may provide options to Readability::Document.new, including:
30
30
  :attributes - whitelist of allowed attributes
31
31
  :debug - provide debugging output, defaults false
32
32
  :encoding - if the page is of a known encoding, you can specify it; if left unspecified,
33
- the encoding will be guessed (only in Ruby 1.9.x)
33
+ the encoding will be guessed (only in Ruby 1.9.x). If you wish to disable guessing,
34
+ supply :do_not_guess_encoding => true.
34
35
  :html_headers - in Ruby 1.9.x these will be passed to the guess_html_encoding gem
35
36
  to aid with guessing the HTML encoding
36
37
  :ignore_image_format - for use with .images. For example: :ignore_image_format => ["gif", "png"]
@@ -48,13 +49,14 @@ Readability comes with a command-line tool for experimentation in bin/readabilit
48
49
 
49
50
  ## Images
50
51
 
51
- You can get a list of images in the content area with `.images`. This feature requires that the `mini_magick` gem be installed.
52
+ You can get a list of images in the content area with `.images`. This feature requires that the `mini_magick` gem be installed
52
53
 
53
- p Readability::Document.new(source).images
54
+ rbody = Readability::Document.new(body, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false)
55
+ rbody.images
54
56
 
55
57
  ## Potential Issues
56
58
 
57
- If you're on a Mac and are getting segmentation faults, see the discussion at https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2. Version 2.7.8 of libxml2 with the following worked for me:
59
+ If you're on a Mac and are getting segmentation faults, see the discussion at https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2. Version 2.7.8 of libxml2, installed with `brew`, worked for me:
58
60
 
59
61
  gem install nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
60
62
 
@@ -70,4 +72,4 @@ Or if you're using bundler and Rails 3, you can run this command to make bundler
70
72
 
71
73
  This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
72
74
 
73
- Ruby port by starrhorne, libc, and iterationlabs. Special thanks to fizx and marcosinger.
75
+ Ruby port by starrhorne, libc, and iterationlabs. Special thanks to fizx and marcosinger.
data/lib/readability.rb CHANGED
@@ -39,13 +39,18 @@ module Readability
39
39
  @html.css("script, style").each { |i| i.remove }
40
40
  remove_unlikely_candidates! if @remove_unlikely_candidates
41
41
  transform_misused_divs_into_paragraphs!
42
-
42
+
43
43
  @candidates = score_paragraphs(options[:min_text_length])
44
44
  @best_candidate = select_best_candidate(@candidates)
45
45
  end
46
46
 
47
47
  def make_html
48
48
  @html = Nokogiri::HTML(@input, nil, @options[:encoding])
49
+ # In case document has no body, such as from empty string or redirect
50
+ @html = Nokogiri::HTML('<body />', nil, @options[:encoding]) if @html.css('body').length == 0
51
+
52
+ # Remove html comment tags
53
+ @html.xpath('//comment()').each { |i| i.remove }
49
54
  end
50
55
 
51
56
  def images(content=nil, reload=false)
@@ -174,8 +179,9 @@ module Readability
174
179
  end
175
180
 
176
181
  if append
177
- sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
178
- output << sibling
182
+ sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects
183
+ sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase)
184
+ output << sibling_dup
179
185
  end
180
186
  end
181
187
 
@@ -185,7 +191,7 @@ module Readability
185
191
  def select_best_candidate(candidates)
186
192
  sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
187
193
 
188
- debug("Top 5 canidates:")
194
+ debug("Top 5 candidates:")
189
195
  sorted_candidates[0...5].each do |candidate|
190
196
  debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
191
197
  end
@@ -281,7 +287,7 @@ module Readability
281
287
  def remove_unlikely_candidates!
282
288
  @html.css("*").each do |elem|
283
289
  str = "#{elem[:class]}#{elem[:id]}"
284
- if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
290
+ if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && (elem.name.downcase != 'html') && (elem.name.downcase != 'body')
285
291
  debug("Removing unlikely candidate - #{str}")
286
292
  elem.remove
287
293
  end
@@ -308,7 +314,7 @@ module Readability
308
314
  end
309
315
  end
310
316
 
311
- def sanitize(node, candidates, options = {})
317
+ def sanitize(node, candidates, options = {})
312
318
  node.css("h1, h2, h3, h4, h5, h6").each do |header|
313
319
  header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
314
320
  end
@@ -408,6 +414,5 @@ module Readability
408
414
  end
409
415
  end
410
416
  end
411
-
412
417
  end
413
418
  end
data/ruby-readability.gemspec CHANGED
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "ruby-readability"
6
- s.version = '0.5.2'
6
+ s.version = '0.5.3'
7
7
  s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
8
8
  s.email = ["andrew@iterationlabs.com"]
9
9
  s.homepage = "http://github.com/iterationlabs/ruby-readability"
data/spec/readability_spec.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  # encoding: UTF-8
2
2
 
3
3
  require 'spec_helper'
4
+ require 'readability'
4
5
 
5
6
  describe Readability do
6
7
  before do
@@ -347,4 +348,54 @@ describe Readability do
347
348
  end
348
349
  end
349
350
  end
351
+
352
+ describe "#make_html" do
353
+ it "should strip the html comments tag" do
354
+ doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!<!-- bye~ --></div></body></html>")
355
+ content = doc.content
356
+ content.should include("hi!")
357
+ content.should_not include("bye")
358
+ end
359
+
360
+ it "should not error with empty content" do
361
+ Readability::Document.new('').content.should == '<div><div></div></div>'
362
+ end
363
+
364
+ it "should not error with a document with no <body>" do
365
+ Readability::Document.new('<html><head><meta http-equiv="refresh" content="0;URL=http://example.com"></head></html>').content.should == '<div><div></div></div>'
366
+ end
367
+ end
368
+
369
+ describe "No side-effects" do
370
+ before do
371
+ @bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
372
+ @nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
373
+ @thesum = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
374
+ end
375
+
376
+ it "should not have any side-effects when calling content() and then images()" do
377
+ @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
378
+ :do_not_guess_encoding => true)
379
+ @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
380
+ @doc.content
381
+ @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
382
+ end
383
+
384
+ it "should not have any side-effects when calling content() multiple times" do
385
+ @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
386
+ :do_not_guess_encoding => true)
387
+ @doc.content.should == @doc.content
388
+ end
389
+
390
+ it "should not have any side-effects when calling content and images multiple times" do
391
+ @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
392
+ :do_not_guess_encoding => true)
393
+ @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
394
+ @doc.content.should == @doc.content
395
+ @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
396
+ end
397
+
398
+ end
399
+
400
+
350
401
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.5.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -12,11 +12,11 @@ authors:
12
12
  autorequire:
13
13
  bindir: bin
14
14
  cert_chain: []
15
- date: 2012-04-04 00:00:00.000000000Z
15
+ date: 2012-04-21 00:00:00.000000000 Z
16
16
  dependencies:
17
17
  - !ruby/object:Gem::Dependency
18
18
  name: rspec
19
- requirement: &82084870 !ruby/object:Gem::Requirement
19
+ requirement: !ruby/object:Gem::Requirement
20
20
  none: false
21
21
  requirements:
22
22
  - - ! '>='
@@ -24,10 +24,15 @@ dependencies:
24
24
  version: '2.8'
25
25
  type: :development
26
26
  prerelease: false
27
- version_requirements: *82084870
27
+ version_requirements: !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '2.8'
28
33
  - !ruby/object:Gem::Dependency
29
34
  name: rspec-expectations
30
- requirement: &82084630 !ruby/object:Gem::Requirement
35
+ requirement: !ruby/object:Gem::Requirement
31
36
  none: false
32
37
  requirements:
33
38
  - - ! '>='
@@ -35,10 +40,15 @@ dependencies:
35
40
  version: '2.8'
36
41
  type: :development
37
42
  prerelease: false
38
- version_requirements: *82084630
43
+ version_requirements: !ruby/object:Gem::Requirement
44
+ none: false
45
+ requirements:
46
+ - - ! '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '2.8'
39
49
  - !ruby/object:Gem::Dependency
40
50
  name: rr
41
- requirement: &82084400 !ruby/object:Gem::Requirement
51
+ requirement: !ruby/object:Gem::Requirement
42
52
  none: false
43
53
  requirements:
44
54
  - - ! '>='
@@ -46,10 +56,15 @@ dependencies:
46
56
  version: '1.0'
47
57
  type: :development
48
58
  prerelease: false
49
- version_requirements: *82084400
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ! '>='
63
+ - !ruby/object:Gem::Version
64
+ version: '1.0'
50
65
  - !ruby/object:Gem::Dependency
51
66
  name: nokogiri
52
- requirement: &82084170 !ruby/object:Gem::Requirement
67
+ requirement: !ruby/object:Gem::Requirement
53
68
  none: false
54
69
  requirements:
55
70
  - - ! '>='
@@ -57,10 +72,15 @@ dependencies:
57
72
  version: 1.4.2
58
73
  type: :runtime
59
74
  prerelease: false
60
- version_requirements: *82084170
75
+ version_requirements: !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ! '>='
79
+ - !ruby/object:Gem::Version
80
+ version: 1.4.2
61
81
  - !ruby/object:Gem::Dependency
62
82
  name: guess_html_encoding
63
- requirement: &82083940 !ruby/object:Gem::Requirement
83
+ requirement: !ruby/object:Gem::Requirement
64
84
  none: false
65
85
  requirements:
66
86
  - - ! '>='
@@ -68,7 +88,12 @@ dependencies:
68
88
  version: 0.0.4
69
89
  type: :runtime
70
90
  prerelease: false
71
- version_requirements: *82083940
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ! '>='
95
+ - !ruby/object:Gem::Version
96
+ version: 0.0.4
72
97
  description: Port of arc90's readability project to ruby
73
98
  email:
74
99
  - andrew@iterationlabs.com
@@ -125,8 +150,26 @@ required_rubygems_version: !ruby/object:Gem::Requirement
125
150
  version: '0'
126
151
  requirements: []
127
152
  rubyforge_project: ruby-readability
128
- rubygems_version: 1.8.16
153
+ rubygems_version: 1.8.21
129
154
  signing_key:
130
155
  specification_version: 3
131
156
  summary: Port of arc90's readability project to ruby
132
- test_files: []
157
+ test_files:
158
+ - spec/fixtures/bbc.html
159
+ - spec/fixtures/cant_read.html
160
+ - spec/fixtures/images/dim_1416768a.jpg
161
+ - spec/fixtures/nytimes.html
162
+ - spec/fixtures/sample.html
163
+ - spec/fixtures/samples/blogpost_with_links-fragments.rb
164
+ - spec/fixtures/samples/blogpost_with_links.html
165
+ - spec/fixtures/samples/channel4-1-fragments.rb
166
+ - spec/fixtures/samples/channel4-1.html
167
+ - spec/fixtures/samples/foxnews-india1-fragments.rb
168
+ - spec/fixtures/samples/foxnews-india1.html
169
+ - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
170
+ - spec/fixtures/samples/globemail-ottawa-cuts.html
171
+ - spec/fixtures/should_not_truncate.txt
172
+ - spec/fixtures/thesun.html
173
+ - spec/readability_spec.rb
174
+ - spec/spec.opts
175
+ - spec/spec_helper.rb