ruby-readability 0.6.2 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 93fa92e0a04e5193bacf3e09a77cb8d25ddcff54
4
- data.tar.gz: 7d1f0cd78ead23386d15345d0f356c75fe8c6e2a
3
+ metadata.gz: b9f4f443e32b774c8c2b14856c78e7c593c6ef41
4
+ data.tar.gz: 3f6916bfc9b1c88c3c45f5e839fe0e2a4b882ab5
5
5
  SHA512:
6
- metadata.gz: 93b0d6d8fc0bc0800cfd1f03cebb72d61720eaa348996752be1d04b06504130d453d59d6214d3813af050f17f37f40f0b807bc701ef4c3771dc957b45e129a9a
7
- data.tar.gz: fb3faf6b214acd44e7f27dda0342f27b25ffe1decc2deeaf2267388d9ab109225aa4959d9099b9f0815f9b014bc24bbe93bbbe8533b94a57fc3ec1b06c8afd6c
6
+ metadata.gz: fdf2bb73b0ff4db4617c34996e72f23465d33d90a7631eaaa979235fd8f1f8c529dcf39f7930dc447df72e35e640726b0a3567e3cf0abdafb1ab88e46eb4e3ac
7
+ data.tar.gz: e75ebfeb153e89fbe52e94e0eab2f33865b32c75ed89e5411387d2cfa6a2f92d0671ecc000229d1ac3cf2027d18e7b7050053c32ab44dca05c8f9a35b20a1194
@@ -19,7 +19,21 @@ module Readability
19
19
  :blacklist => nil,
20
20
  :whitelist => nil
21
21
  }.freeze
22
-
22
+
23
+ REGEXES = {
24
+ :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
25
+ :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
26
+ :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
27
+ :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
28
+ :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
29
+ :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
30
+ :replaceFontsRe => /<(\/?)font[^>]*>/i,
31
+ :trimRe => /^\s+|\s+$/,
32
+ :normalizeRe => /\s{2,}/,
33
+ :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
34
+ :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
35
+ }
36
+
23
37
  attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
24
38
 
25
39
  def initialize(input, options = {})
@@ -129,6 +143,31 @@ module Readability
129
143
 
130
144
  (list_images.empty? and content != @html) ? images(@html, true) : list_images
131
145
  end
146
+
147
+ def images_with_fqdn_uris!(source_uri)
148
+ images_with_fqdn_uris(@html, source_uri)
149
+ end
150
+
151
+ def images_with_fqdn_uris(document = @html.dup, source_uri)
152
+ uri = URI.parse(source_uri)
153
+ host = uri.host
154
+ scheme = uri.scheme
155
+ port = uri.port # defaults to 80
156
+
157
+ base = "#{scheme}://#{host}:#{port}/"
158
+
159
+ images = []
160
+ document.css("img").each do |elem|
161
+ begin
162
+ elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
163
+ images << elem['src'].to_s
164
+ rescue URI::InvalidURIError => exc
165
+ elem.remove
166
+ end
167
+ end
168
+
169
+ images(document,true)
170
+ end
132
171
 
133
172
  def get_image_size(url)
134
173
  w, h = FastImage.size(url)
@@ -144,20 +183,6 @@ module Readability
144
183
  image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0)
145
184
  end
146
185
 
147
- REGEXES = {
148
- :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
149
- :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
150
- :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
151
- :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
152
- :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
153
- :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
154
- :replaceFontsRe => /<(\/?)font[^>]*>/i,
155
- :trimRe => /^\s+|\s+$/,
156
- :normalizeRe => /\s{2,}/,
157
- :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
158
- :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
159
- }
160
-
161
186
  def title
162
187
  title = @html.css("title").first
163
188
  title ? title.text : nil
@@ -444,7 +469,7 @@ module Readability
444
469
  weight = class_weight(el)
445
470
  content_score = candidates[el] ? candidates[el][:content_score] : 0
446
471
  name = el.name.downcase
447
-
472
+
448
473
  if weight + content_score < 0
449
474
  el.remove
450
475
  debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
@@ -452,6 +477,9 @@ module Readability
452
477
  counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
453
478
  counts["li"] -= 100
454
479
 
480
+ # For every img under a noscript tag discount one from the count to avoid double counting
481
+ counts["img"] -= el.css("noscript").css("img").length
482
+
455
483
  content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
456
484
  link_density = get_link_density(el)
457
485
 
@@ -465,13 +493,13 @@ module Readability
465
493
  end
466
494
 
467
495
  def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
468
- if counts["img"] > counts["p"]
496
+ if (counts["img"] > counts["p"]) && (counts["img"] > 1)
469
497
  "too many images"
470
498
  elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
471
499
  "more <li>s than <p>s"
472
500
  elsif counts["input"] > (counts["p"] / 3).to_i
473
501
  "less than 3x <p>s than <input>s"
474
- elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
502
+ elsif (content_length < options[:min_text_length]) && (counts["img"] != 1)
475
503
  "too short a content length without a single image"
476
504
  elsif weight < 25 && link_density > 0.2
477
505
  "too many links for its weight (#{weight})"
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "ruby-readability"
6
- s.version = '0.6.2'
6
+ s.version = '0.7.0'
7
7
  s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
8
8
  s.email = ["andrew@iterationlabs.com"]
9
9
  s.homepage = "http://github.com/cantino/ruby-readability"
@@ -20,6 +20,6 @@ Gem::Specification.new do |s|
20
20
  s.add_development_dependency "rspec", ">= 2.8"
21
21
  s.add_development_dependency "rspec-expectations", ">= 2.8"
22
22
  s.add_development_dependency "rr", ">= 1.0"
23
- s.add_dependency 'nokogiri', '>= 1.4.2'
23
+ s.add_dependency 'nokogiri', '>= 1.6.0'
24
24
  s.add_dependency 'guess_html_encoding', '>= 0.0.4'
25
25
  end
@@ -19,6 +19,35 @@ describe Readability do
19
19
  </body>
20
20
  </html>
21
21
  HTML
22
+
23
+ @simple_html_with_img_no_text = <<-HTML
24
+ <html>
25
+ <head>
26
+ <title>title!</title>
27
+ </head>
28
+ <body class='main'>
29
+ <div class="article-img">
30
+ <img src="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg">
31
+ </div>
32
+ </body>
33
+ </html>
34
+ HTML
35
+
36
+ @simple_html_with_img_in_noscript = <<-HTML
37
+ <html>
38
+ <head>
39
+ <title>title!</title>
40
+ </head>
41
+ <body class='main'>
42
+ <div class="article-img">
43
+ <img src="http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif" width="660"
44
+ height="317" alt="test" class="lazy"
45
+ data-original="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg">
46
+ <noscript><img src="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"></noscript>
47
+ </div>
48
+ </body>
49
+ </html>
50
+ HTML
22
51
  end
23
52
 
24
53
  describe "images" do
@@ -36,6 +65,7 @@ describe Readability do
36
65
 
37
66
  FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif",
38
67
  :body => File.read(File.dirname(__FILE__) + "/fixtures/images/sign_up_emails_682__703712a.gif"))
68
+
39
69
  end
40
70
 
41
71
  it "should show one image, but outside of the best candidate" do
@@ -115,6 +145,17 @@ describe Readability do
115
145
  @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
116
146
  @doc.best_candidate_has_image.should == true
117
147
  end
148
+
149
+ it "should not miss an image if it exists by itself in a div without text" do
150
+ @doc = Readability::Document.new(@simple_html_with_img_no_text,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
151
+ @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
152
+ end
153
+
154
+ it "should not double count an image between script and noscript" do
155
+ @doc = Readability::Document.new(@simple_html_with_img_in_noscript,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
156
+ @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
157
+ end
158
+
118
159
  end
119
160
  end
120
161
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Cantino
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2014-04-18 00:00:00.000000000 Z
14
+ date: 2014-08-17 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rspec
@@ -61,14 +61,14 @@ dependencies:
61
61
  requirements:
62
62
  - - '>='
63
63
  - !ruby/object:Gem::Version
64
- version: 1.4.2
64
+ version: 1.6.0
65
65
  type: :runtime
66
66
  prerelease: false
67
67
  version_requirements: !ruby/object:Gem::Requirement
68
68
  requirements:
69
69
  - - '>='
70
70
  - !ruby/object:Gem::Version
71
- version: 1.4.2
71
+ version: 1.6.0
72
72
  - !ruby/object:Gem::Dependency
73
73
  name: guess_html_encoding
74
74
  requirement: !ruby/object:Gem::Requirement