ruby-readability 0.6.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 93fa92e0a04e5193bacf3e09a77cb8d25ddcff54
4
- data.tar.gz: 7d1f0cd78ead23386d15345d0f356c75fe8c6e2a
3
+ metadata.gz: b9f4f443e32b774c8c2b14856c78e7c593c6ef41
4
+ data.tar.gz: 3f6916bfc9b1c88c3c45f5e839fe0e2a4b882ab5
5
5
  SHA512:
6
- metadata.gz: 93b0d6d8fc0bc0800cfd1f03cebb72d61720eaa348996752be1d04b06504130d453d59d6214d3813af050f17f37f40f0b807bc701ef4c3771dc957b45e129a9a
7
- data.tar.gz: fb3faf6b214acd44e7f27dda0342f27b25ffe1decc2deeaf2267388d9ab109225aa4959d9099b9f0815f9b014bc24bbe93bbbe8533b94a57fc3ec1b06c8afd6c
6
+ metadata.gz: fdf2bb73b0ff4db4617c34996e72f23465d33d90a7631eaaa979235fd8f1f8c529dcf39f7930dc447df72e35e640726b0a3567e3cf0abdafb1ab88e46eb4e3ac
7
+ data.tar.gz: e75ebfeb153e89fbe52e94e0eab2f33865b32c75ed89e5411387d2cfa6a2f92d0671ecc000229d1ac3cf2027d18e7b7050053c32ab44dca05c8f9a35b20a1194
@@ -19,7 +19,21 @@ module Readability
19
19
  :blacklist => nil,
20
20
  :whitelist => nil
21
21
  }.freeze
22
-
22
+
23
+ REGEXES = {
24
+ :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
25
+ :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
26
+ :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
27
+ :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
28
+ :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
29
+ :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
30
+ :replaceFontsRe => /<(\/?)font[^>]*>/i,
31
+ :trimRe => /^\s+|\s+$/,
32
+ :normalizeRe => /\s{2,}/,
33
+ :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
34
+ :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
35
+ }
36
+
23
37
  attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
24
38
 
25
39
  def initialize(input, options = {})
@@ -129,6 +143,31 @@ module Readability
129
143
 
130
144
  (list_images.empty? and content != @html) ? images(@html, true) : list_images
131
145
  end
146
+
147
+ def images_with_fqdn_uris!(source_uri)
148
+ images_with_fqdn_uris(@html, source_uri)
149
+ end
150
+
151
+ def images_with_fqdn_uris(document = @html.dup, source_uri)
152
+ uri = URI.parse(source_uri)
153
+ host = uri.host
154
+ scheme = uri.scheme
155
+ port = uri.port # defaults to 80
156
+
157
+ base = "#{scheme}://#{host}:#{port}/"
158
+
159
+ images = []
160
+ document.css("img").each do |elem|
161
+ begin
162
+ elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
163
+ images << elem['src'].to_s
164
+ rescue URI::InvalidURIError => exc
165
+ elem.remove
166
+ end
167
+ end
168
+
169
+ images(document,true)
170
+ end
132
171
 
133
172
  def get_image_size(url)
134
173
  w, h = FastImage.size(url)
@@ -144,20 +183,6 @@ module Readability
144
183
  image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0)
145
184
  end
146
185
 
147
- REGEXES = {
148
- :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
149
- :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
150
- :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
151
- :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
152
- :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
153
- :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
154
- :replaceFontsRe => /<(\/?)font[^>]*>/i,
155
- :trimRe => /^\s+|\s+$/,
156
- :normalizeRe => /\s{2,}/,
157
- :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
158
- :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
159
- }
160
-
161
186
  def title
162
187
  title = @html.css("title").first
163
188
  title ? title.text : nil
@@ -444,7 +469,7 @@ module Readability
444
469
  weight = class_weight(el)
445
470
  content_score = candidates[el] ? candidates[el][:content_score] : 0
446
471
  name = el.name.downcase
447
-
472
+
448
473
  if weight + content_score < 0
449
474
  el.remove
450
475
  debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
@@ -452,6 +477,9 @@ module Readability
452
477
  counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
453
478
  counts["li"] -= 100
454
479
 
480
+ # For every img under a noscript tag discount one from the count to avoid double counting
481
+ counts["img"] -= el.css("noscript").css("img").length
482
+
455
483
  content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
456
484
  link_density = get_link_density(el)
457
485
 
@@ -465,13 +493,13 @@ module Readability
465
493
  end
466
494
 
467
495
  def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
468
- if counts["img"] > counts["p"]
496
+ if (counts["img"] > counts["p"]) && (counts["img"] > 1)
469
497
  "too many images"
470
498
  elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
471
499
  "more <li>s than <p>s"
472
500
  elsif counts["input"] > (counts["p"] / 3).to_i
473
501
  "less than 3x <p>s than <input>s"
474
- elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
502
+ elsif (content_length < options[:min_text_length]) && (counts["img"] != 1)
475
503
  "too short a content length without a single image"
476
504
  elsif weight < 25 && link_density > 0.2
477
505
  "too many links for its weight (#{weight})"
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "ruby-readability"
6
- s.version = '0.6.2'
6
+ s.version = '0.7.0'
7
7
  s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
8
8
  s.email = ["andrew@iterationlabs.com"]
9
9
  s.homepage = "http://github.com/cantino/ruby-readability"
@@ -20,6 +20,6 @@ Gem::Specification.new do |s|
20
20
  s.add_development_dependency "rspec", ">= 2.8"
21
21
  s.add_development_dependency "rspec-expectations", ">= 2.8"
22
22
  s.add_development_dependency "rr", ">= 1.0"
23
- s.add_dependency 'nokogiri', '>= 1.4.2'
23
+ s.add_dependency 'nokogiri', '>= 1.6.0'
24
24
  s.add_dependency 'guess_html_encoding', '>= 0.0.4'
25
25
  end
@@ -19,6 +19,35 @@ describe Readability do
19
19
  </body>
20
20
  </html>
21
21
  HTML
22
+
23
+ @simple_html_with_img_no_text = <<-HTML
24
+ <html>
25
+ <head>
26
+ <title>title!</title>
27
+ </head>
28
+ <body class='main'>
29
+ <div class="article-img">
30
+ <img src="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg">
31
+ </div>
32
+ </body>
33
+ </html>
34
+ HTML
35
+
36
+ @simple_html_with_img_in_noscript = <<-HTML
37
+ <html>
38
+ <head>
39
+ <title>title!</title>
40
+ </head>
41
+ <body class='main'>
42
+ <div class="article-img">
43
+ <img src="http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif" width="660"
44
+ height="317" alt="test" class="lazy"
45
+ data-original="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg">
46
+ <noscript><img src="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"></noscript>
47
+ </div>
48
+ </body>
49
+ </html>
50
+ HTML
22
51
  end
23
52
 
24
53
  describe "images" do
@@ -36,6 +65,7 @@ describe Readability do
36
65
 
37
66
  FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif",
38
67
  :body => File.read(File.dirname(__FILE__) + "/fixtures/images/sign_up_emails_682__703712a.gif"))
68
+
39
69
  end
40
70
 
41
71
  it "should show one image, but outside of the best candidate" do
@@ -115,6 +145,17 @@ describe Readability do
115
145
  @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
116
146
  @doc.best_candidate_has_image.should == true
117
147
  end
148
+
149
+ it "should not miss an image if it exists by itself in a div without text" do
150
+ @doc = Readability::Document.new(@simple_html_with_img_no_text,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
151
+ @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
152
+ end
153
+
154
+ it "should not double count an image between script and noscript" do
155
+ @doc = Readability::Document.new(@simple_html_with_img_in_noscript,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
156
+ @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
157
+ end
158
+
118
159
  end
119
160
  end
120
161
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Cantino
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2014-04-18 00:00:00.000000000 Z
14
+ date: 2014-08-17 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rspec
@@ -61,14 +61,14 @@ dependencies:
61
61
  requirements:
62
62
  - - '>='
63
63
  - !ruby/object:Gem::Version
64
- version: 1.4.2
64
+ version: 1.6.0
65
65
  type: :runtime
66
66
  prerelease: false
67
67
  version_requirements: !ruby/object:Gem::Requirement
68
68
  requirements:
69
69
  - - '>='
70
70
  - !ruby/object:Gem::Version
71
- version: 1.4.2
71
+ version: 1.6.0
72
72
  - !ruby/object:Gem::Dependency
73
73
  name: guess_html_encoding
74
74
  requirement: !ruby/object:Gem::Requirement