ruby-readability 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +0 -1
- data/README.markdown +7 -5
- data/lib/readability.rb +12 -7
- data/ruby-readability.gemspec +1 -1
- data/spec/readability_spec.rb +51 -0
- metadata +57 -14
data/.rspec
CHANGED
data/README.markdown
CHANGED
@@ -30,7 +30,8 @@ You may provide options to Readability::Document.new, including:
|
|
30
30
|
:attributes - whitelist of allowed attributes
|
31
31
|
:debug - provide debugging output, defaults false
|
32
32
|
:encoding - if the page is of a known encoding, you can specify it; if left unspecified,
|
33
|
-
the encoding will be guessed (only in Ruby 1.9.x)
|
33
|
+
the encoding will be guessed (only in Ruby 1.9.x). If you wish to disable guessing,
|
34
|
+
supply :do_not_guess_encoding => true.
|
34
35
|
:html_headers - in Ruby 1.9.x these will be passed to the guess_html_encoding gem
|
35
36
|
to aid with guessing the HTML encoding
|
36
37
|
:ignore_image_format - for use with .images. For example: :ignore_image_format => ["gif", "png"]
|
@@ -48,13 +49,14 @@ Readability comes with a command-line tool for experimentation in bin/readabilit
|
|
48
49
|
|
49
50
|
## Images
|
50
51
|
|
51
|
-
You can get a list of images in the content area with `.images`. This feature requires that the `mini_magick` gem be installed
|
52
|
+
You can get a list of images in the content area with `.images`. This feature requires that the `mini_magick` gem be installed
|
52
53
|
|
53
|
-
|
54
|
+
rbody = Readability::Document.new(body, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false)
|
55
|
+
rbody.images
|
54
56
|
|
55
57
|
## Potential Issues
|
56
58
|
|
57
|
-
If you're on a Mac and are getting segmentation faults, see the discussion at https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2. Version 2.7.8 of libxml2 with
|
59
|
+
If you're on a Mac and are getting segmentation faults, see the discussion at https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2. Version 2.7.8 of libxml2, installed with `brew`, worked for me:
|
58
60
|
|
59
61
|
gem install nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
|
60
62
|
|
@@ -70,4 +72,4 @@ Or if you're using bundler and Rails 3, you can run this command to make bundler
|
|
70
72
|
|
71
73
|
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
|
72
74
|
|
73
|
-
Ruby port by starrhorne, libc, and iterationlabs. Special thanks to fizx and marcosinger.
|
75
|
+
Ruby port by starrhorne, libc, and iterationlabs. Special thanks to fizx and marcosinger.
|
data/lib/readability.rb
CHANGED
@@ -39,13 +39,18 @@ module Readability
|
|
39
39
|
@html.css("script, style").each { |i| i.remove }
|
40
40
|
remove_unlikely_candidates! if @remove_unlikely_candidates
|
41
41
|
transform_misused_divs_into_paragraphs!
|
42
|
-
|
42
|
+
|
43
43
|
@candidates = score_paragraphs(options[:min_text_length])
|
44
44
|
@best_candidate = select_best_candidate(@candidates)
|
45
45
|
end
|
46
46
|
|
47
47
|
def make_html
|
48
48
|
@html = Nokogiri::HTML(@input, nil, @options[:encoding])
|
49
|
+
# In case document has no body, such as from empty string or redirect
|
50
|
+
@html = Nokogiri::HTML('<body />', nil, @options[:encoding]) if @html.css('body').length == 0
|
51
|
+
|
52
|
+
# Remove html comment tags
|
53
|
+
@html.xpath('//comment()').each { |i| i.remove }
|
49
54
|
end
|
50
55
|
|
51
56
|
def images(content=nil, reload=false)
|
@@ -174,8 +179,9 @@ module Readability
|
|
174
179
|
end
|
175
180
|
|
176
181
|
if append
|
177
|
-
sibling.
|
178
|
-
|
182
|
+
sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects
|
183
|
+
sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase)
|
184
|
+
output << sibling_dup
|
179
185
|
end
|
180
186
|
end
|
181
187
|
|
@@ -185,7 +191,7 @@ module Readability
|
|
185
191
|
def select_best_candidate(candidates)
|
186
192
|
sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
|
187
193
|
|
188
|
-
debug("Top 5
|
194
|
+
debug("Top 5 candidates:")
|
189
195
|
sorted_candidates[0...5].each do |candidate|
|
190
196
|
debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
|
191
197
|
end
|
@@ -281,7 +287,7 @@ module Readability
|
|
281
287
|
def remove_unlikely_candidates!
|
282
288
|
@html.css("*").each do |elem|
|
283
289
|
str = "#{elem[:class]}#{elem[:id]}"
|
284
|
-
if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
|
290
|
+
if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && (elem.name.downcase != 'html') && (elem.name.downcase != 'body')
|
285
291
|
debug("Removing unlikely candidate - #{str}")
|
286
292
|
elem.remove
|
287
293
|
end
|
@@ -308,7 +314,7 @@ module Readability
|
|
308
314
|
end
|
309
315
|
end
|
310
316
|
|
311
|
-
def sanitize(node, candidates, options = {})
|
317
|
+
def sanitize(node, candidates, options = {})
|
312
318
|
node.css("h1, h2, h3, h4, h5, h6").each do |header|
|
313
319
|
header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
|
314
320
|
end
|
@@ -408,6 +414,5 @@ module Readability
|
|
408
414
|
end
|
409
415
|
end
|
410
416
|
end
|
411
|
-
|
412
417
|
end
|
413
418
|
end
|
data/ruby-readability.gemspec
CHANGED
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "ruby-readability"
|
6
|
-
s.version = '0.5.2'
|
6
|
+
s.version = '0.5.3'
|
7
7
|
s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
|
8
8
|
s.email = ["andrew@iterationlabs.com"]
|
9
9
|
s.homepage = "http://github.com/iterationlabs/ruby-readability"
|
data/spec/readability_spec.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
require 'spec_helper'
|
4
|
+
require 'readability'
|
4
5
|
|
5
6
|
describe Readability do
|
6
7
|
before do
|
@@ -347,4 +348,54 @@ describe Readability do
|
|
347
348
|
end
|
348
349
|
end
|
349
350
|
end
|
351
|
+
|
352
|
+
describe "#make_html" do
|
353
|
+
it "should strip the html comments tag" do
|
354
|
+
doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!<!-- bye~ --></div></body></html>")
|
355
|
+
content = doc.content
|
356
|
+
content.should include("hi!")
|
357
|
+
content.should_not include("bye")
|
358
|
+
end
|
359
|
+
|
360
|
+
it "should not error with empty content" do
|
361
|
+
Readability::Document.new('').content.should == '<div><div></div></div>'
|
362
|
+
end
|
363
|
+
|
364
|
+
it "should not error with a document with no <body>" do
|
365
|
+
Readability::Document.new('<html><head><meta http-equiv="refresh" content="0;URL=http://example.com"></head></html>').content.should == '<div><div></div></div>'
|
366
|
+
end
|
367
|
+
end
|
368
|
+
|
369
|
+
describe "No side-effects" do
|
370
|
+
before do
|
371
|
+
@bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
|
372
|
+
@nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
|
373
|
+
@thesum = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
|
374
|
+
end
|
375
|
+
|
376
|
+
it "should not have any side-effects when calling content() and then images()" do
|
377
|
+
@doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
|
378
|
+
:do_not_guess_encoding => true)
|
379
|
+
@doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
|
380
|
+
@doc.content
|
381
|
+
@doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
|
382
|
+
end
|
383
|
+
|
384
|
+
it "should not have any side-effects when calling content() multiple times" do
|
385
|
+
@doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
|
386
|
+
:do_not_guess_encoding => true)
|
387
|
+
@doc.content.should == @doc.content
|
388
|
+
end
|
389
|
+
|
390
|
+
it "should not have any side-effects when calling content and images multiple times" do
|
391
|
+
@doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
|
392
|
+
:do_not_guess_encoding => true)
|
393
|
+
@doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
|
394
|
+
@doc.content.should == @doc.content
|
395
|
+
@doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
|
396
|
+
end
|
397
|
+
|
398
|
+
end
|
399
|
+
|
400
|
+
|
350
401
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.5.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -12,11 +12,11 @@ authors:
|
|
12
12
|
autorequire:
|
13
13
|
bindir: bin
|
14
14
|
cert_chain: []
|
15
|
-
date: 2012-04-
|
15
|
+
date: 2012-04-21 00:00:00.000000000 Z
|
16
16
|
dependencies:
|
17
17
|
- !ruby/object:Gem::Dependency
|
18
18
|
name: rspec
|
19
|
-
requirement:
|
19
|
+
requirement: !ruby/object:Gem::Requirement
|
20
20
|
none: false
|
21
21
|
requirements:
|
22
22
|
- - ! '>='
|
@@ -24,10 +24,15 @@ dependencies:
|
|
24
24
|
version: '2.8'
|
25
25
|
type: :development
|
26
26
|
prerelease: false
|
27
|
-
version_requirements:
|
27
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '2.8'
|
28
33
|
- !ruby/object:Gem::Dependency
|
29
34
|
name: rspec-expectations
|
30
|
-
requirement:
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
31
36
|
none: false
|
32
37
|
requirements:
|
33
38
|
- - ! '>='
|
@@ -35,10 +40,15 @@ dependencies:
|
|
35
40
|
version: '2.8'
|
36
41
|
type: :development
|
37
42
|
prerelease: false
|
38
|
-
version_requirements:
|
43
|
+
version_requirements: !ruby/object:Gem::Requirement
|
44
|
+
none: false
|
45
|
+
requirements:
|
46
|
+
- - ! '>='
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '2.8'
|
39
49
|
- !ruby/object:Gem::Dependency
|
40
50
|
name: rr
|
41
|
-
requirement:
|
51
|
+
requirement: !ruby/object:Gem::Requirement
|
42
52
|
none: false
|
43
53
|
requirements:
|
44
54
|
- - ! '>='
|
@@ -46,10 +56,15 @@ dependencies:
|
|
46
56
|
version: '1.0'
|
47
57
|
type: :development
|
48
58
|
prerelease: false
|
49
|
-
version_requirements:
|
59
|
+
version_requirements: !ruby/object:Gem::Requirement
|
60
|
+
none: false
|
61
|
+
requirements:
|
62
|
+
- - ! '>='
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: '1.0'
|
50
65
|
- !ruby/object:Gem::Dependency
|
51
66
|
name: nokogiri
|
52
|
-
requirement:
|
67
|
+
requirement: !ruby/object:Gem::Requirement
|
53
68
|
none: false
|
54
69
|
requirements:
|
55
70
|
- - ! '>='
|
@@ -57,10 +72,15 @@ dependencies:
|
|
57
72
|
version: 1.4.2
|
58
73
|
type: :runtime
|
59
74
|
prerelease: false
|
60
|
-
version_requirements:
|
75
|
+
version_requirements: !ruby/object:Gem::Requirement
|
76
|
+
none: false
|
77
|
+
requirements:
|
78
|
+
- - ! '>='
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: 1.4.2
|
61
81
|
- !ruby/object:Gem::Dependency
|
62
82
|
name: guess_html_encoding
|
63
|
-
requirement:
|
83
|
+
requirement: !ruby/object:Gem::Requirement
|
64
84
|
none: false
|
65
85
|
requirements:
|
66
86
|
- - ! '>='
|
@@ -68,7 +88,12 @@ dependencies:
|
|
68
88
|
version: 0.0.4
|
69
89
|
type: :runtime
|
70
90
|
prerelease: false
|
71
|
-
version_requirements:
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.0.4
|
72
97
|
description: Port of arc90's readability project to ruby
|
73
98
|
email:
|
74
99
|
- andrew@iterationlabs.com
|
@@ -125,8 +150,26 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
125
150
|
version: '0'
|
126
151
|
requirements: []
|
127
152
|
rubyforge_project: ruby-readability
|
128
|
-
rubygems_version: 1.8.
|
153
|
+
rubygems_version: 1.8.21
|
129
154
|
signing_key:
|
130
155
|
specification_version: 3
|
131
156
|
summary: Port of arc90's readability project to ruby
|
132
|
-
test_files:
|
157
|
+
test_files:
|
158
|
+
- spec/fixtures/bbc.html
|
159
|
+
- spec/fixtures/cant_read.html
|
160
|
+
- spec/fixtures/images/dim_1416768a.jpg
|
161
|
+
- spec/fixtures/nytimes.html
|
162
|
+
- spec/fixtures/sample.html
|
163
|
+
- spec/fixtures/samples/blogpost_with_links-fragments.rb
|
164
|
+
- spec/fixtures/samples/blogpost_with_links.html
|
165
|
+
- spec/fixtures/samples/channel4-1-fragments.rb
|
166
|
+
- spec/fixtures/samples/channel4-1.html
|
167
|
+
- spec/fixtures/samples/foxnews-india1-fragments.rb
|
168
|
+
- spec/fixtures/samples/foxnews-india1.html
|
169
|
+
- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
|
170
|
+
- spec/fixtures/samples/globemail-ottawa-cuts.html
|
171
|
+
- spec/fixtures/should_not_truncate.txt
|
172
|
+
- spec/fixtures/thesun.html
|
173
|
+
- spec/readability_spec.rb
|
174
|
+
- spec/spec.opts
|
175
|
+
- spec/spec_helper.rb
|