ruby-readability 0.6.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/readability.rb +46 -18
- data/ruby-readability.gemspec +2 -2
- data/spec/readability_spec.rb +41 -0
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b9f4f443e32b774c8c2b14856c78e7c593c6ef41
|
4
|
+
data.tar.gz: 3f6916bfc9b1c88c3c45f5e839fe0e2a4b882ab5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fdf2bb73b0ff4db4617c34996e72f23465d33d90a7631eaaa979235fd8f1f8c529dcf39f7930dc447df72e35e640726b0a3567e3cf0abdafb1ab88e46eb4e3ac
|
7
|
+
data.tar.gz: e75ebfeb153e89fbe52e94e0eab2f33865b32c75ed89e5411387d2cfa6a2f92d0671ecc000229d1ac3cf2027d18e7b7050053c32ab44dca05c8f9a35b20a1194
|
data/lib/readability.rb
CHANGED
@@ -19,7 +19,21 @@ module Readability
|
|
19
19
|
:blacklist => nil,
|
20
20
|
:whitelist => nil
|
21
21
|
}.freeze
|
22
|
-
|
22
|
+
|
23
|
+
REGEXES = {
|
24
|
+
:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
25
|
+
:okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
|
26
|
+
:positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
|
27
|
+
:negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
|
28
|
+
:divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
29
|
+
:replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
|
30
|
+
:replaceFontsRe => /<(\/?)font[^>]*>/i,
|
31
|
+
:trimRe => /^\s+|\s+$/,
|
32
|
+
:normalizeRe => /\s{2,}/,
|
33
|
+
:killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
|
34
|
+
:videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
|
35
|
+
}
|
36
|
+
|
23
37
|
attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
|
24
38
|
|
25
39
|
def initialize(input, options = {})
|
@@ -129,6 +143,31 @@ module Readability
|
|
129
143
|
|
130
144
|
(list_images.empty? and content != @html) ? images(@html, true) : list_images
|
131
145
|
end
|
146
|
+
|
147
|
+
def images_with_fqdn_uris!(source_uri)
|
148
|
+
images_with_fqdn_uris(@html, source_uri)
|
149
|
+
end
|
150
|
+
|
151
|
+
def images_with_fqdn_uris(document = @html.dup, source_uri)
|
152
|
+
uri = URI.parse(source_uri)
|
153
|
+
host = uri.host
|
154
|
+
scheme = uri.scheme
|
155
|
+
port = uri.port # defaults to 80
|
156
|
+
|
157
|
+
base = "#{scheme}://#{host}:#{port}/"
|
158
|
+
|
159
|
+
images = []
|
160
|
+
document.css("img").each do |elem|
|
161
|
+
begin
|
162
|
+
elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
163
|
+
images << elem['src'].to_s
|
164
|
+
rescue URI::InvalidURIError => exc
|
165
|
+
elem.remove
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
images(document,true)
|
170
|
+
end
|
132
171
|
|
133
172
|
def get_image_size(url)
|
134
173
|
w, h = FastImage.size(url)
|
@@ -144,20 +183,6 @@ module Readability
|
|
144
183
|
image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0)
|
145
184
|
end
|
146
185
|
|
147
|
-
REGEXES = {
|
148
|
-
:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
149
|
-
:okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
|
150
|
-
:positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
|
151
|
-
:negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
|
152
|
-
:divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
153
|
-
:replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
|
154
|
-
:replaceFontsRe => /<(\/?)font[^>]*>/i,
|
155
|
-
:trimRe => /^\s+|\s+$/,
|
156
|
-
:normalizeRe => /\s{2,}/,
|
157
|
-
:killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
|
158
|
-
:videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
|
159
|
-
}
|
160
|
-
|
161
186
|
def title
|
162
187
|
title = @html.css("title").first
|
163
188
|
title ? title.text : nil
|
@@ -444,7 +469,7 @@ module Readability
|
|
444
469
|
weight = class_weight(el)
|
445
470
|
content_score = candidates[el] ? candidates[el][:content_score] : 0
|
446
471
|
name = el.name.downcase
|
447
|
-
|
472
|
+
|
448
473
|
if weight + content_score < 0
|
449
474
|
el.remove
|
450
475
|
debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
|
@@ -452,6 +477,9 @@ module Readability
|
|
452
477
|
counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
|
453
478
|
counts["li"] -= 100
|
454
479
|
|
480
|
+
# For every img under a noscript tag discount one from the count to avoid double counting
|
481
|
+
counts["img"] -= el.css("noscript").css("img").length
|
482
|
+
|
455
483
|
content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
|
456
484
|
link_density = get_link_density(el)
|
457
485
|
|
@@ -465,13 +493,13 @@ module Readability
|
|
465
493
|
end
|
466
494
|
|
467
495
|
def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
|
468
|
-
if counts["img"] > counts["p"]
|
496
|
+
if (counts["img"] > counts["p"]) && (counts["img"] > 1)
|
469
497
|
"too many images"
|
470
498
|
elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
|
471
499
|
"more <li>s than <p>s"
|
472
500
|
elsif counts["input"] > (counts["p"] / 3).to_i
|
473
501
|
"less than 3x <p>s than <input>s"
|
474
|
-
elsif content_length <
|
502
|
+
elsif (content_length < options[:min_text_length]) && (counts["img"] != 1)
|
475
503
|
"too short a content length without a single image"
|
476
504
|
elsif weight < 25 && link_density > 0.2
|
477
505
|
"too many links for its weight (#{weight})"
|
data/ruby-readability.gemspec
CHANGED
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "ruby-readability"
|
6
|
-
s.version = '0.6.2'
|
6
|
+
s.version = '0.7.0'
|
7
7
|
s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
|
8
8
|
s.email = ["andrew@iterationlabs.com"]
|
9
9
|
s.homepage = "http://github.com/cantino/ruby-readability"
|
@@ -20,6 +20,6 @@ Gem::Specification.new do |s|
|
|
20
20
|
s.add_development_dependency "rspec", ">= 2.8"
|
21
21
|
s.add_development_dependency "rspec-expectations", ">= 2.8"
|
22
22
|
s.add_development_dependency "rr", ">= 1.0"
|
23
|
-
s.add_dependency 'nokogiri', '>= 1.
|
23
|
+
s.add_dependency 'nokogiri', '>= 1.6.0'
|
24
24
|
s.add_dependency 'guess_html_encoding', '>= 0.0.4'
|
25
25
|
end
|
data/spec/readability_spec.rb
CHANGED
@@ -19,6 +19,35 @@ describe Readability do
|
|
19
19
|
</body>
|
20
20
|
</html>
|
21
21
|
HTML
|
22
|
+
|
23
|
+
@simple_html_with_img_no_text = <<-HTML
|
24
|
+
<html>
|
25
|
+
<head>
|
26
|
+
<title>title!</title>
|
27
|
+
</head>
|
28
|
+
<body class='main'>
|
29
|
+
<div class="article-img">
|
30
|
+
<img src="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg">
|
31
|
+
</div>
|
32
|
+
</body>
|
33
|
+
</html>
|
34
|
+
HTML
|
35
|
+
|
36
|
+
@simple_html_with_img_in_noscript = <<-HTML
|
37
|
+
<html>
|
38
|
+
<head>
|
39
|
+
<title>title!</title>
|
40
|
+
</head>
|
41
|
+
<body class='main'>
|
42
|
+
<div class="article-img">
|
43
|
+
<img src="http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif" width="660"
|
44
|
+
height="317" alt="test" class="lazy"
|
45
|
+
data-original="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg">
|
46
|
+
<noscript><img src="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"></noscript>
|
47
|
+
</div>
|
48
|
+
</body>
|
49
|
+
</html>
|
50
|
+
HTML
|
22
51
|
end
|
23
52
|
|
24
53
|
describe "images" do
|
@@ -36,6 +65,7 @@ describe Readability do
|
|
36
65
|
|
37
66
|
FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif",
|
38
67
|
:body => File.read(File.dirname(__FILE__) + "/fixtures/images/sign_up_emails_682__703712a.gif"))
|
68
|
+
|
39
69
|
end
|
40
70
|
|
41
71
|
it "should show one image, but outside of the best candidate" do
|
@@ -115,6 +145,17 @@ describe Readability do
|
|
115
145
|
@doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
|
116
146
|
@doc.best_candidate_has_image.should == true
|
117
147
|
end
|
148
|
+
|
149
|
+
it "should not miss an image if it exists by itself in a div without text" do
|
150
|
+
@doc = Readability::Document.new(@simple_html_with_img_no_text,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
|
151
|
+
@doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
|
152
|
+
end
|
153
|
+
|
154
|
+
it "should not double count an image between script and noscript" do
|
155
|
+
@doc = Readability::Document.new(@simple_html_with_img_in_noscript,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
|
156
|
+
@doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
|
157
|
+
end
|
158
|
+
|
118
159
|
end
|
119
160
|
end
|
120
161
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.2
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Cantino
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2014-
|
14
|
+
date: 2014-08-17 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rspec
|
@@ -61,14 +61,14 @@ dependencies:
|
|
61
61
|
requirements:
|
62
62
|
- - '>='
|
63
63
|
- !ruby/object:Gem::Version
|
64
|
-
version: 1.
|
64
|
+
version: 1.6.0
|
65
65
|
type: :runtime
|
66
66
|
prerelease: false
|
67
67
|
version_requirements: !ruby/object:Gem::Requirement
|
68
68
|
requirements:
|
69
69
|
- - '>='
|
70
70
|
- !ruby/object:Gem::Version
|
71
|
-
version: 1.
|
71
|
+
version: 1.6.0
|
72
72
|
- !ruby/object:Gem::Dependency
|
73
73
|
name: guess_html_encoding
|
74
74
|
requirement: !ruby/object:Gem::Requirement
|