ruby-readability 0.5.2 → 0.5.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +0 -1
- data/README.markdown +7 -5
- data/lib/readability.rb +12 -7
- data/ruby-readability.gemspec +1 -1
- data/spec/readability_spec.rb +51 -0
- metadata +57 -14
data/.rspec
CHANGED
data/README.markdown
CHANGED
@@ -30,7 +30,8 @@ You may provide options to Readability::Document.new, including:
|
|
30
30
|
:attributes - whitelist of allowed attributes
|
31
31
|
:debug - provide debugging output, defaults false
|
32
32
|
:encoding - if the page is of a known encoding, you can specify it; if left unspecified,
|
33
|
-
the encoding will be guessed (only in Ruby 1.9.x)
|
33
|
+
the encoding will be guessed (only in Ruby 1.9.x). If you wish to disable guessing,
|
34
|
+
supply :do_not_guess_encoding => true.
|
34
35
|
:html_headers - in Ruby 1.9.x these will be passed to the guess_html_encoding gem
|
35
36
|
to aid with guessing the HTML encoding
|
36
37
|
:ignore_image_format - for use with .images. For example: :ignore_image_format => ["gif", "png"]
|
@@ -48,13 +49,14 @@ Readability comes with a command-line tool for experimentation in bin/readabilit
|
|
48
49
|
|
49
50
|
## Images
|
50
51
|
|
51
|
-
You can get a list of images in the content area with `.images`. This feature requires that the `mini_magick` gem be installed
|
52
|
+
You can get a list of images in the content area with `.images`. This feature requires that the `mini_magick` gem be installed
|
52
53
|
|
53
|
-
|
54
|
+
rbody = Readability::Document.new(body, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false)
|
55
|
+
rbody.images
|
54
56
|
|
55
57
|
## Potential Issues
|
56
58
|
|
57
|
-
If you're on a Mac and are getting segmentation faults, see the discussion at https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2. Version 2.7.8 of libxml2 with
|
59
|
+
If you're on a Mac and are getting segmentation faults, see the discussion at https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2. Version 2.7.8 of libxml2, installed with `brew`, worked for me:
|
58
60
|
|
59
61
|
gem install nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
|
60
62
|
|
@@ -70,4 +72,4 @@ Or if you're using bundler and Rails 3, you can run this command to make bundler
|
|
70
72
|
|
71
73
|
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
|
72
74
|
|
73
|
-
Ruby port by starrhorne, libc, and iterationlabs. Special thanks to fizx and marcosinger.
|
75
|
+
Ruby port by starrhorne, libc, and iterationlabs. Special thanks to fizx and marcosinger.
|
data/lib/readability.rb
CHANGED
@@ -39,13 +39,18 @@ module Readability
|
|
39
39
|
@html.css("script, style").each { |i| i.remove }
|
40
40
|
remove_unlikely_candidates! if @remove_unlikely_candidates
|
41
41
|
transform_misused_divs_into_paragraphs!
|
42
|
-
|
42
|
+
|
43
43
|
@candidates = score_paragraphs(options[:min_text_length])
|
44
44
|
@best_candidate = select_best_candidate(@candidates)
|
45
45
|
end
|
46
46
|
|
47
47
|
def make_html
|
48
48
|
@html = Nokogiri::HTML(@input, nil, @options[:encoding])
|
49
|
+
# In case document has no body, such as from empty string or redirect
|
50
|
+
@html = Nokogiri::HTML('<body />', nil, @options[:encoding]) if @html.css('body').length == 0
|
51
|
+
|
52
|
+
# Remove html comment tags
|
53
|
+
@html.xpath('//comment()').each { |i| i.remove }
|
49
54
|
end
|
50
55
|
|
51
56
|
def images(content=nil, reload=false)
|
@@ -174,8 +179,9 @@ module Readability
|
|
174
179
|
end
|
175
180
|
|
176
181
|
if append
|
177
|
-
sibling.
|
178
|
-
|
182
|
+
sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects
|
183
|
+
sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase)
|
184
|
+
output << sibling_dup
|
179
185
|
end
|
180
186
|
end
|
181
187
|
|
@@ -185,7 +191,7 @@ module Readability
|
|
185
191
|
def select_best_candidate(candidates)
|
186
192
|
sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
|
187
193
|
|
188
|
-
debug("Top 5
|
194
|
+
debug("Top 5 candidates:")
|
189
195
|
sorted_candidates[0...5].each do |candidate|
|
190
196
|
debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
|
191
197
|
end
|
@@ -281,7 +287,7 @@ module Readability
|
|
281
287
|
def remove_unlikely_candidates!
|
282
288
|
@html.css("*").each do |elem|
|
283
289
|
str = "#{elem[:class]}#{elem[:id]}"
|
284
|
-
if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
|
290
|
+
if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && (elem.name.downcase != 'html') && (elem.name.downcase != 'body')
|
285
291
|
debug("Removing unlikely candidate - #{str}")
|
286
292
|
elem.remove
|
287
293
|
end
|
@@ -308,7 +314,7 @@ module Readability
|
|
308
314
|
end
|
309
315
|
end
|
310
316
|
|
311
|
-
def sanitize(node, candidates, options = {})
|
317
|
+
def sanitize(node, candidates, options = {})
|
312
318
|
node.css("h1, h2, h3, h4, h5, h6").each do |header|
|
313
319
|
header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
|
314
320
|
end
|
@@ -408,6 +414,5 @@ module Readability
|
|
408
414
|
end
|
409
415
|
end
|
410
416
|
end
|
411
|
-
|
412
417
|
end
|
413
418
|
end
|
data/ruby-readability.gemspec
CHANGED
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "ruby-readability"
|
6
|
-
s.version = '0.5.2'
|
6
|
+
s.version = '0.5.3'
|
7
7
|
s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
|
8
8
|
s.email = ["andrew@iterationlabs.com"]
|
9
9
|
s.homepage = "http://github.com/iterationlabs/ruby-readability"
|
data/spec/readability_spec.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
require 'spec_helper'
|
4
|
+
require 'readability'
|
4
5
|
|
5
6
|
describe Readability do
|
6
7
|
before do
|
@@ -347,4 +348,54 @@ describe Readability do
|
|
347
348
|
end
|
348
349
|
end
|
349
350
|
end
|
351
|
+
|
352
|
+
describe "#make_html" do
|
353
|
+
it "should strip the html comments tag" do
|
354
|
+
doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!<!-- bye~ --></div></body></html>")
|
355
|
+
content = doc.content
|
356
|
+
content.should include("hi!")
|
357
|
+
content.should_not include("bye")
|
358
|
+
end
|
359
|
+
|
360
|
+
it "should not error with empty content" do
|
361
|
+
Readability::Document.new('').content.should == '<div><div></div></div>'
|
362
|
+
end
|
363
|
+
|
364
|
+
it "should not error with a document with no <body>" do
|
365
|
+
Readability::Document.new('<html><head><meta http-equiv="refresh" content="0;URL=http://example.com"></head></html>').content.should == '<div><div></div></div>'
|
366
|
+
end
|
367
|
+
end
|
368
|
+
|
369
|
+
describe "No side-effects" do
|
370
|
+
before do
|
371
|
+
@bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
|
372
|
+
@nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
|
373
|
+
@thesum = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
|
374
|
+
end
|
375
|
+
|
376
|
+
it "should not have any side-effects when calling content() and then images()" do
|
377
|
+
@doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
|
378
|
+
:do_not_guess_encoding => true)
|
379
|
+
@doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
|
380
|
+
@doc.content
|
381
|
+
@doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
|
382
|
+
end
|
383
|
+
|
384
|
+
it "should not have any side-effects when calling content() multiple times" do
|
385
|
+
@doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
|
386
|
+
:do_not_guess_encoding => true)
|
387
|
+
@doc.content.should == @doc.content
|
388
|
+
end
|
389
|
+
|
390
|
+
it "should not have any side-effects when calling content and images multiple times" do
|
391
|
+
@doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
|
392
|
+
:do_not_guess_encoding => true)
|
393
|
+
@doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
|
394
|
+
@doc.content.should == @doc.content
|
395
|
+
@doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
|
396
|
+
end
|
397
|
+
|
398
|
+
end
|
399
|
+
|
400
|
+
|
350
401
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.5.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -12,11 +12,11 @@ authors:
|
|
12
12
|
autorequire:
|
13
13
|
bindir: bin
|
14
14
|
cert_chain: []
|
15
|
-
date: 2012-04-
|
15
|
+
date: 2012-04-21 00:00:00.000000000 Z
|
16
16
|
dependencies:
|
17
17
|
- !ruby/object:Gem::Dependency
|
18
18
|
name: rspec
|
19
|
-
requirement:
|
19
|
+
requirement: !ruby/object:Gem::Requirement
|
20
20
|
none: false
|
21
21
|
requirements:
|
22
22
|
- - ! '>='
|
@@ -24,10 +24,15 @@ dependencies:
|
|
24
24
|
version: '2.8'
|
25
25
|
type: :development
|
26
26
|
prerelease: false
|
27
|
-
version_requirements:
|
27
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '2.8'
|
28
33
|
- !ruby/object:Gem::Dependency
|
29
34
|
name: rspec-expectations
|
30
|
-
requirement:
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
31
36
|
none: false
|
32
37
|
requirements:
|
33
38
|
- - ! '>='
|
@@ -35,10 +40,15 @@ dependencies:
|
|
35
40
|
version: '2.8'
|
36
41
|
type: :development
|
37
42
|
prerelease: false
|
38
|
-
version_requirements:
|
43
|
+
version_requirements: !ruby/object:Gem::Requirement
|
44
|
+
none: false
|
45
|
+
requirements:
|
46
|
+
- - ! '>='
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '2.8'
|
39
49
|
- !ruby/object:Gem::Dependency
|
40
50
|
name: rr
|
41
|
-
requirement:
|
51
|
+
requirement: !ruby/object:Gem::Requirement
|
42
52
|
none: false
|
43
53
|
requirements:
|
44
54
|
- - ! '>='
|
@@ -46,10 +56,15 @@ dependencies:
|
|
46
56
|
version: '1.0'
|
47
57
|
type: :development
|
48
58
|
prerelease: false
|
49
|
-
version_requirements:
|
59
|
+
version_requirements: !ruby/object:Gem::Requirement
|
60
|
+
none: false
|
61
|
+
requirements:
|
62
|
+
- - ! '>='
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: '1.0'
|
50
65
|
- !ruby/object:Gem::Dependency
|
51
66
|
name: nokogiri
|
52
|
-
requirement:
|
67
|
+
requirement: !ruby/object:Gem::Requirement
|
53
68
|
none: false
|
54
69
|
requirements:
|
55
70
|
- - ! '>='
|
@@ -57,10 +72,15 @@ dependencies:
|
|
57
72
|
version: 1.4.2
|
58
73
|
type: :runtime
|
59
74
|
prerelease: false
|
60
|
-
version_requirements:
|
75
|
+
version_requirements: !ruby/object:Gem::Requirement
|
76
|
+
none: false
|
77
|
+
requirements:
|
78
|
+
- - ! '>='
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: 1.4.2
|
61
81
|
- !ruby/object:Gem::Dependency
|
62
82
|
name: guess_html_encoding
|
63
|
-
requirement:
|
83
|
+
requirement: !ruby/object:Gem::Requirement
|
64
84
|
none: false
|
65
85
|
requirements:
|
66
86
|
- - ! '>='
|
@@ -68,7 +88,12 @@ dependencies:
|
|
68
88
|
version: 0.0.4
|
69
89
|
type: :runtime
|
70
90
|
prerelease: false
|
71
|
-
version_requirements:
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.0.4
|
72
97
|
description: Port of arc90's readability project to ruby
|
73
98
|
email:
|
74
99
|
- andrew@iterationlabs.com
|
@@ -125,8 +150,26 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
125
150
|
version: '0'
|
126
151
|
requirements: []
|
127
152
|
rubyforge_project: ruby-readability
|
128
|
-
rubygems_version: 1.8.
|
153
|
+
rubygems_version: 1.8.21
|
129
154
|
signing_key:
|
130
155
|
specification_version: 3
|
131
156
|
summary: Port of arc90's readability project to ruby
|
132
|
-
test_files:
|
157
|
+
test_files:
|
158
|
+
- spec/fixtures/bbc.html
|
159
|
+
- spec/fixtures/cant_read.html
|
160
|
+
- spec/fixtures/images/dim_1416768a.jpg
|
161
|
+
- spec/fixtures/nytimes.html
|
162
|
+
- spec/fixtures/sample.html
|
163
|
+
- spec/fixtures/samples/blogpost_with_links-fragments.rb
|
164
|
+
- spec/fixtures/samples/blogpost_with_links.html
|
165
|
+
- spec/fixtures/samples/channel4-1-fragments.rb
|
166
|
+
- spec/fixtures/samples/channel4-1.html
|
167
|
+
- spec/fixtures/samples/foxnews-india1-fragments.rb
|
168
|
+
- spec/fixtures/samples/foxnews-india1.html
|
169
|
+
- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
|
170
|
+
- spec/fixtures/samples/globemail-ottawa-cuts.html
|
171
|
+
- spec/fixtures/should_not_truncate.txt
|
172
|
+
- spec/fixtures/thesun.html
|
173
|
+
- spec/readability_spec.rb
|
174
|
+
- spec/spec.opts
|
175
|
+
- spec/spec_helper.rb
|