ruby-readability 0.7.1 → 0.7.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 906a25fd00e8fc221c84aa41fedf38bbd3045aa0e4a543ff16a1d494e59c3a92
4
- data.tar.gz: bf28e458f7fb7f87a49ea71f16e736191c53130b91bdf2203cf260e6dce99aee
3
+ metadata.gz: f83eb55e4c0c4c30ad54e8e7104d68da8a5eb2b4d9cc76b45255055d89bf4b5c
4
+ data.tar.gz: 4d003c39b589477449bedd34634c5482dd503e94bfe24b9a5c29ea94f9b49f83
5
5
  SHA512:
6
- metadata.gz: e2d262b6c4f0d7a2146718d3e16c0dd8973b217a9fe0ba850d03a456c68b7bd4355cbdd0a78454b09f6f50717c87ac8da524d42d99e78e0f362830c554376fdd
7
- data.tar.gz: 6306f195c8d40842c0a4ed8ab2cfab1648fc562b03ba3137a0fd8c68ecb7a3668357c83abefd2b76bcac06efc961cdd042be10f44760aa102e34cdce2fe5d6d4
6
+ metadata.gz: e799e831297b18b381c3b1caad19531f99fe084f640afbddd1cf91e75fe234d3af4618f07e02a0c6214824726e3afe79accbb8ea5f0d66d9117b13112d22e8ef
7
+ data.tar.gz: 404d3a1bc702f3bd609e8c3ba8e37d6f023b2a3c126c278e7463a3dfee1cc5bf683f6c0c75cfabbb14e477f582b33cc8204d8682f33ed9a235b6fac8e90d9ad2
data/README.md CHANGED
@@ -41,7 +41,7 @@ You may provide options to `Readability::Document.new`, including:
41
41
  * `:remove_empty_nodes`: remove `<p>` tags that have no text content; also
42
42
  removes `<p>` tags that contain only images;
43
43
  * `:attributes`: whitelist of allowed attributes;
44
- * `:debug`: provide debugging output, defaults false;
44
+ * `:debug`: provide debugging output, defaults false; supports setting a Proc;
45
45
  * `:encoding`: if the page is of a known encoding, you can specify it; if left
46
46
  unspecified, the encoding will be guessed (only in Ruby 1.9.x). If you wish
47
47
  to disable guessing, supply `:do_not_guess_encoding => true`;
data/lib/readability.rb CHANGED
@@ -19,9 +19,10 @@ module Readability
19
19
  :blacklist => nil,
20
20
  :whitelist => nil,
21
21
  :elements_to_score => ["p", "td", "pre"],
22
- :likely_siblings => ["p"]
22
+ :likely_siblings => ["p"],
23
+ :ignore_redundant_nesting => false
23
24
  }.freeze
24
-
25
+
25
26
  REGEXES = {
26
27
  :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
27
28
  :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
@@ -35,7 +36,7 @@ module Readability
35
36
  :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
36
37
  :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
37
38
  }
38
-
39
+
39
40
  attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
40
41
 
41
42
  def initialize(input, options = {})
@@ -50,7 +51,7 @@ module Readability
50
51
  @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
51
52
  @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
52
53
  @weight_classes = @options[:weight_classes]
53
- @clean_conditionally = @options[:clean_conditionally]
54
+ @clean_conditionally = !!@options[:clean_conditionally]
54
55
  @best_candidate_has_image = true
55
56
  make_html
56
57
  handle_exclusions!(@options[:whitelist], @options[:blacklist])
@@ -145,11 +146,11 @@ module Readability
145
146
 
146
147
  (list_images.empty? and content != @html) ? images(@html, true) : list_images
147
148
  end
148
-
149
+
149
150
  def images_with_fqdn_uris!(source_uri)
150
151
  images_with_fqdn_uris(@html, source_uri)
151
152
  end
152
-
153
+
153
154
  def images_with_fqdn_uris(document = @html.dup, source_uri)
154
155
  uri = URI.parse(source_uri)
155
156
  host = uri.host
@@ -161,7 +162,7 @@ module Readability
161
162
  images = []
162
163
  document.css("img").each do |elem|
163
164
  begin
164
- elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
165
+ elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
165
166
  images << elem['src'].to_s
166
167
  rescue URI::InvalidURIError => exc
167
168
  elem.remove
@@ -264,14 +265,25 @@ module Readability
264
265
  sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
265
266
  downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
266
267
  output = Nokogiri::XML::Node.new('div', @html)
267
- best_candidate[:elem].parent.children.each do |sibling|
268
+
269
+ # If the best candidate is the only element in its parent then we will never find any siblings. Therefore,
270
+ # find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the
271
+ # related content detection, but could lead to false positives. Not supported in arc90's readability.
272
+ node =
273
+ if options[:ignore_redundant_nesting]
274
+ closest_node_with_siblings(best_candidate[:elem])
275
+ else
276
+ best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability.
277
+ end
278
+
279
+ node.parent.children.each do |sibling|
268
280
  append = false
269
- append = true if sibling == best_candidate[:elem]
281
+ append = true if sibling == node
270
282
  append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
271
283
 
272
284
  if downcased_likely_siblings.include?(sibling.name.downcase)
273
285
  link_density = get_link_density(sibling)
274
- node_content = sibling.text
286
+ node_content = sibling.text.strip
275
287
  node_length = node_content.length
276
288
 
277
289
  append = if node_length > 80 && link_density < 0.25
@@ -291,6 +303,23 @@ module Readability
291
303
  output
292
304
  end
293
305
 
306
+ def closest_node_with_siblings(element)
307
+ node = element
308
+
309
+ until node.node_name == 'body'
310
+ siblings = node.parent.children
311
+ non_empty = siblings.reject { |sibling| sibling.text? && sibling.text.strip.empty? }
312
+
313
+ if non_empty.size > 1
314
+ return node
315
+ else
316
+ node = node.parent
317
+ end
318
+ end
319
+
320
+ node
321
+ end
322
+
294
323
  def select_best_candidate(candidates)
295
324
  sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
296
325
 
@@ -372,7 +401,11 @@ module Readability
372
401
  end
373
402
 
374
403
  def debug(str)
375
- puts str if options[:debug]
404
+ if options[:debug].respond_to?(:call)
405
+ options[:debug].call(str)
406
+ elsif options[:debug]
407
+ puts str
408
+ end
376
409
  end
377
410
 
378
411
  def remove_unlikely_candidates!
@@ -426,7 +459,8 @@ module Readability
426
459
 
427
460
  # We'll sanitize all elements using a whitelist
428
461
  base_whitelist = @options[:tags] || %w[div p]
429
- all_whitelisted = base_whitelist.include?("*")
462
+ all_tags_whitelisted = base_whitelist.include?("*")
463
+ all_attr_whitelisted = @options[:attributes] && @options[:attributes].include?("*")
430
464
 
431
465
  # We'll add whitespace instead of block elements,
432
466
  # so a<br>b will have a nice space between them
@@ -440,8 +474,8 @@ module Readability
440
474
 
441
475
  ([node] + node.css("*")).each do |el|
442
476
  # If element is in whitelist, delete all its attributes
443
- if all_whitelisted || whitelist[el.node_name]
444
- el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
477
+ if all_tags_whitelisted || whitelist[el.node_name]
478
+ el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } unless all_attr_whitelisted
445
479
 
446
480
  # Otherwise, replace the element with its contents
447
481
  else
@@ -470,30 +504,43 @@ module Readability
470
504
 
471
505
  def clean_conditionally(node, candidates, selector)
472
506
  return unless @clean_conditionally
507
+
473
508
  node.css(selector).each do |el|
474
509
  weight = class_weight(el)
475
510
  content_score = candidates[el] ? candidates[el][:content_score] : 0
476
511
  name = el.name.downcase
477
-
512
+ remove = false
513
+ message = nil
514
+
478
515
  if weight + content_score < 0
479
- el.remove
480
- debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
516
+ remove = true
517
+ message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero."
481
518
  elsif el.text.count(",") < 10
482
519
  counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
483
520
  counts["li"] -= 100
484
521
 
485
522
  # For every img under a noscript tag discount one from the count to avoid double counting
486
523
  counts["img"] -= el.css("noscript").css("img").length
487
-
524
+
488
525
  content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
489
526
  link_density = get_link_density(el)
490
527
 
491
528
  reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
492
529
  if reason
493
- debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
494
- el.remove
530
+ message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}."
531
+ remove = true
495
532
  end
496
533
  end
534
+
535
+ if options[:clean_conditionally].respond_to?(:call)
536
+ context = { remove: remove, message: message, weight: weight, content_score: content_score, el: el }
537
+ remove = options[:clean_conditionally].call(context) # Allow the user to override the decision for whether to remove the element.
538
+ end
539
+
540
+ if remove
541
+ debug(message || "Conditionally cleaned by user-specified function.")
542
+ el.remove
543
+ end
497
544
  end
498
545
  end
499
546
 
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "ruby-readability"
6
- s.version = '0.7.1'
6
+ s.version = '0.7.2'
7
7
  s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
8
8
  s.email = ["andrew@iterationlabs.com"]
9
9
  s.homepage = "http://github.com/cantino/ruby-readability"
@@ -115,6 +115,11 @@ describe Readability do
115
115
  expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
116
116
  end
117
117
 
118
+ it "should be able to whitelist all attributes" do
119
+ @doc = Readability::Document.new(@nested, attributes: ["*"], tags: ["*"])
120
+ expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
121
+ end
122
+
118
123
  it "should not try to download local images" do
119
124
  @doc = Readability::Document.new(<<-HTML)
120
125
  <html>
@@ -498,6 +503,39 @@ describe Readability do
498
503
  <p>This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.</p>
499
504
  <p>The likely_siblings now include the section tag so it should be included in the output.</p>
500
505
  </section>
506
+ <section>
507
+ <p>too short when stripped </p>
508
+ </section>
509
+ #{'<a href="/">This link lowers the body score.</a>' * 5}
510
+ </body>
511
+ </html>
512
+ HTML
513
+
514
+ expect(@doc.content).to include("Paragraph 1")
515
+ expect(@doc.content).to include("Paragraph 2")
516
+ expect(@doc.content).to include("should be included")
517
+ expect(@doc.content).not_to include("too short when stripped")
518
+ end
519
+
520
+ it "climbs the DOM tree to the closest ancestor that has siblings when checking for related siblings" do
521
+ @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"], ignore_redundant_nesting: true)
522
+ <html>
523
+ <head>
524
+ <title>title!</title>
525
+ </head>
526
+ <body>
527
+ <div> <!-- This is the closest node of the best candidate that has siblings. -->
528
+ <div>
529
+ <section>
530
+ <p>Paragraph 1</p>
531
+ #{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
532
+ </section>
533
+ </div>
534
+ </div>
535
+ <section>
536
+ <p>This paragraph is longer than 80 characters and inside a section that is a sibling of the ancestor node.</p>
537
+ <p>The likely_siblings now include the section tag so it should be included in the output.</p>
538
+ </section>
501
539
  #{'<a href="/">This link lowers the body score.</a>' * 5}
502
540
  </body>
503
541
  </html>
@@ -739,11 +777,33 @@ describe Readability do
739
777
  end
740
778
 
741
779
  describe "clean_conditionally_reason?" do
742
- let (:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
780
+ let(:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
743
781
 
744
782
  it "does not raise error" do
745
783
  @doc = Readability::Document.new(list_fixture)
746
784
  expect { @doc.content }.to_not raise_error
747
785
  end
748
786
  end
787
+
788
+ describe "clean_conditionally" do
789
+ let(:fixture) { "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>" }
790
+
791
+ it "can set a clean_conditionally function to allow overriding the default decision" do
792
+ clean_conditionally_fn = lambda { |context| !context[:remove] } # Flip the decision.
793
+ content = Readability::Document.new(fixture, clean_conditionally: clean_conditionally_fn, min_text_length: 0, retry_length: 1).content
794
+
795
+ expect(content).to include("sidebar")
796
+ expect(content).not_to include('Some content')
797
+ end
798
+ end
799
+
800
+ describe "debug" do
801
+ it "can set a debug function, e.g. to send output to Rails logger" do
802
+ output = []
803
+ debug_fn = lambda { |str| output << str }
804
+
805
+ Readability::Document.new(@simple_html_fixture, debug: debug_fn).content
806
+ expect(output).not_to be_empty
807
+ end
808
+ end
749
809
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.1
4
+ version: 0.7.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Cantino
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2024-06-11 00:00:00.000000000 Z
14
+ date: 2024-08-29 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rspec
@@ -134,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
134
134
  - !ruby/object:Gem::Version
135
135
  version: '0'
136
136
  requirements: []
137
- rubygems_version: 3.5.10
137
+ rubygems_version: 3.5.14
138
138
  signing_key:
139
139
  specification_version: 4
140
140
  summary: Port of arc90's readability project to ruby