ruby-readability 0.6.2 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/readability.rb +46 -18
- data/ruby-readability.gemspec +2 -2
- data/spec/readability_spec.rb +41 -0
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b9f4f443e32b774c8c2b14856c78e7c593c6ef41
|
4
|
+
data.tar.gz: 3f6916bfc9b1c88c3c45f5e839fe0e2a4b882ab5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fdf2bb73b0ff4db4617c34996e72f23465d33d90a7631eaaa979235fd8f1f8c529dcf39f7930dc447df72e35e640726b0a3567e3cf0abdafb1ab88e46eb4e3ac
|
7
|
+
data.tar.gz: e75ebfeb153e89fbe52e94e0eab2f33865b32c75ed89e5411387d2cfa6a2f92d0671ecc000229d1ac3cf2027d18e7b7050053c32ab44dca05c8f9a35b20a1194
|
data/lib/readability.rb
CHANGED
@@ -19,7 +19,21 @@ module Readability
|
|
19
19
|
:blacklist => nil,
|
20
20
|
:whitelist => nil
|
21
21
|
}.freeze
|
22
|
-
|
22
|
+
|
23
|
+
REGEXES = {
|
24
|
+
:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
25
|
+
:okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
|
26
|
+
:positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
|
27
|
+
:negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
|
28
|
+
:divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
29
|
+
:replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
|
30
|
+
:replaceFontsRe => /<(\/?)font[^>]*>/i,
|
31
|
+
:trimRe => /^\s+|\s+$/,
|
32
|
+
:normalizeRe => /\s{2,}/,
|
33
|
+
:killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
|
34
|
+
:videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
|
35
|
+
}
|
36
|
+
|
23
37
|
attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
|
24
38
|
|
25
39
|
def initialize(input, options = {})
|
@@ -129,6 +143,31 @@ module Readability
|
|
129
143
|
|
130
144
|
(list_images.empty? and content != @html) ? images(@html, true) : list_images
|
131
145
|
end
|
146
|
+
|
147
|
+
def images_with_fqdn_uris!(source_uri)
|
148
|
+
images_with_fqdn_uris(@html, source_uri)
|
149
|
+
end
|
150
|
+
|
151
|
+
def images_with_fqdn_uris(document = @html.dup, source_uri)
|
152
|
+
uri = URI.parse(source_uri)
|
153
|
+
host = uri.host
|
154
|
+
scheme = uri.scheme
|
155
|
+
port = uri.port # defaults to 80
|
156
|
+
|
157
|
+
base = "#{scheme}://#{host}:#{port}/"
|
158
|
+
|
159
|
+
images = []
|
160
|
+
document.css("img").each do |elem|
|
161
|
+
begin
|
162
|
+
elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
163
|
+
images << elem['src'].to_s
|
164
|
+
rescue URI::InvalidURIError => exc
|
165
|
+
elem.remove
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
images(document,true)
|
170
|
+
end
|
132
171
|
|
133
172
|
def get_image_size(url)
|
134
173
|
w, h = FastImage.size(url)
|
@@ -144,20 +183,6 @@ module Readability
|
|
144
183
|
image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0)
|
145
184
|
end
|
146
185
|
|
147
|
-
REGEXES = {
|
148
|
-
:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
149
|
-
:okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
|
150
|
-
:positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
|
151
|
-
:negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
|
152
|
-
:divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
153
|
-
:replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
|
154
|
-
:replaceFontsRe => /<(\/?)font[^>]*>/i,
|
155
|
-
:trimRe => /^\s+|\s+$/,
|
156
|
-
:normalizeRe => /\s{2,}/,
|
157
|
-
:killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
|
158
|
-
:videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
|
159
|
-
}
|
160
|
-
|
161
186
|
def title
|
162
187
|
title = @html.css("title").first
|
163
188
|
title ? title.text : nil
|
@@ -444,7 +469,7 @@ module Readability
|
|
444
469
|
weight = class_weight(el)
|
445
470
|
content_score = candidates[el] ? candidates[el][:content_score] : 0
|
446
471
|
name = el.name.downcase
|
447
|
-
|
472
|
+
|
448
473
|
if weight + content_score < 0
|
449
474
|
el.remove
|
450
475
|
debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
|
@@ -452,6 +477,9 @@ module Readability
|
|
452
477
|
counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
|
453
478
|
counts["li"] -= 100
|
454
479
|
|
480
|
+
# For every img under a noscript tag discount one from the count to avoid double counting
|
481
|
+
counts["img"] -= el.css("noscript").css("img").length
|
482
|
+
|
455
483
|
content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
|
456
484
|
link_density = get_link_density(el)
|
457
485
|
|
@@ -465,13 +493,13 @@ module Readability
|
|
465
493
|
end
|
466
494
|
|
467
495
|
def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
|
468
|
-
if counts["img"] > counts["p"]
|
496
|
+
if (counts["img"] > counts["p"]) && (counts["img"] > 1)
|
469
497
|
"too many images"
|
470
498
|
elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
|
471
499
|
"more <li>s than <p>s"
|
472
500
|
elsif counts["input"] > (counts["p"] / 3).to_i
|
473
501
|
"less than 3x <p>s than <input>s"
|
474
|
-
elsif content_length <
|
502
|
+
elsif (content_length < options[:min_text_length]) && (counts["img"] != 1)
|
475
503
|
"too short a content length without a single image"
|
476
504
|
elsif weight < 25 && link_density > 0.2
|
477
505
|
"too many links for its weight (#{weight})"
|
data/ruby-readability.gemspec
CHANGED
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "ruby-readability"
|
6
|
-
s.version = '0.6.2'
|
6
|
+
s.version = '0.7.0'
|
7
7
|
s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
|
8
8
|
s.email = ["andrew@iterationlabs.com"]
|
9
9
|
s.homepage = "http://github.com/cantino/ruby-readability"
|
@@ -20,6 +20,6 @@ Gem::Specification.new do |s|
|
|
20
20
|
s.add_development_dependency "rspec", ">= 2.8"
|
21
21
|
s.add_development_dependency "rspec-expectations", ">= 2.8"
|
22
22
|
s.add_development_dependency "rr", ">= 1.0"
|
23
|
-
s.add_dependency 'nokogiri', '>= 1.
|
23
|
+
s.add_dependency 'nokogiri', '>= 1.6.0'
|
24
24
|
s.add_dependency 'guess_html_encoding', '>= 0.0.4'
|
25
25
|
end
|
data/spec/readability_spec.rb
CHANGED
@@ -19,6 +19,35 @@ describe Readability do
|
|
19
19
|
</body>
|
20
20
|
</html>
|
21
21
|
HTML
|
22
|
+
|
23
|
+
@simple_html_with_img_no_text = <<-HTML
|
24
|
+
<html>
|
25
|
+
<head>
|
26
|
+
<title>title!</title>
|
27
|
+
</head>
|
28
|
+
<body class='main'>
|
29
|
+
<div class="article-img">
|
30
|
+
<img src="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg">
|
31
|
+
</div>
|
32
|
+
</body>
|
33
|
+
</html>
|
34
|
+
HTML
|
35
|
+
|
36
|
+
@simple_html_with_img_in_noscript = <<-HTML
|
37
|
+
<html>
|
38
|
+
<head>
|
39
|
+
<title>title!</title>
|
40
|
+
</head>
|
41
|
+
<body class='main'>
|
42
|
+
<div class="article-img">
|
43
|
+
<img src="http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif" width="660"
|
44
|
+
height="317" alt="test" class="lazy"
|
45
|
+
data-original="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg">
|
46
|
+
<noscript><img src="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"></noscript>
|
47
|
+
</div>
|
48
|
+
</body>
|
49
|
+
</html>
|
50
|
+
HTML
|
22
51
|
end
|
23
52
|
|
24
53
|
describe "images" do
|
@@ -36,6 +65,7 @@ describe Readability do
|
|
36
65
|
|
37
66
|
FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif",
|
38
67
|
:body => File.read(File.dirname(__FILE__) + "/fixtures/images/sign_up_emails_682__703712a.gif"))
|
68
|
+
|
39
69
|
end
|
40
70
|
|
41
71
|
it "should show one image, but outside of the best candidate" do
|
@@ -115,6 +145,17 @@ describe Readability do
|
|
115
145
|
@doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
|
116
146
|
@doc.best_candidate_has_image.should == true
|
117
147
|
end
|
148
|
+
|
149
|
+
it "should not miss an image if it exists by itself in a div without text" do
|
150
|
+
@doc = Readability::Document.new(@simple_html_with_img_no_text,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
|
151
|
+
@doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
|
152
|
+
end
|
153
|
+
|
154
|
+
it "should not double count an image between script and noscript" do
|
155
|
+
@doc = Readability::Document.new(@simple_html_with_img_in_noscript,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
|
156
|
+
@doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
|
157
|
+
end
|
158
|
+
|
118
159
|
end
|
119
160
|
end
|
120
161
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.2
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Cantino
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2014-
|
14
|
+
date: 2014-08-17 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rspec
|
@@ -61,14 +61,14 @@ dependencies:
|
|
61
61
|
requirements:
|
62
62
|
- - '>='
|
63
63
|
- !ruby/object:Gem::Version
|
64
|
-
version: 1.
|
64
|
+
version: 1.6.0
|
65
65
|
type: :runtime
|
66
66
|
prerelease: false
|
67
67
|
version_requirements: !ruby/object:Gem::Requirement
|
68
68
|
requirements:
|
69
69
|
- - '>='
|
70
70
|
- !ruby/object:Gem::Version
|
71
|
-
version: 1.
|
71
|
+
version: 1.6.0
|
72
72
|
- !ruby/object:Gem::Dependency
|
73
73
|
name: guess_html_encoding
|
74
74
|
requirement: !ruby/object:Gem::Requirement
|