ruby-readability 0.7.1 → 0.7.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 906a25fd00e8fc221c84aa41fedf38bbd3045aa0e4a543ff16a1d494e59c3a92
4
- data.tar.gz: bf28e458f7fb7f87a49ea71f16e736191c53130b91bdf2203cf260e6dce99aee
3
+ metadata.gz: f83eb55e4c0c4c30ad54e8e7104d68da8a5eb2b4d9cc76b45255055d89bf4b5c
4
+ data.tar.gz: 4d003c39b589477449bedd34634c5482dd503e94bfe24b9a5c29ea94f9b49f83
5
5
  SHA512:
6
- metadata.gz: e2d262b6c4f0d7a2146718d3e16c0dd8973b217a9fe0ba850d03a456c68b7bd4355cbdd0a78454b09f6f50717c87ac8da524d42d99e78e0f362830c554376fdd
7
- data.tar.gz: 6306f195c8d40842c0a4ed8ab2cfab1648fc562b03ba3137a0fd8c68ecb7a3668357c83abefd2b76bcac06efc961cdd042be10f44760aa102e34cdce2fe5d6d4
6
+ metadata.gz: e799e831297b18b381c3b1caad19531f99fe084f640afbddd1cf91e75fe234d3af4618f07e02a0c6214824726e3afe79accbb8ea5f0d66d9117b13112d22e8ef
7
+ data.tar.gz: 404d3a1bc702f3bd609e8c3ba8e37d6f023b2a3c126c278e7463a3dfee1cc5bf683f6c0c75cfabbb14e477f582b33cc8204d8682f33ed9a235b6fac8e90d9ad2
data/README.md CHANGED
@@ -41,7 +41,7 @@ You may provide options to `Readability::Document.new`, including:
41
41
  * `:remove_empty_nodes`: remove `<p>` tags that have no text content; also
42
42
  removes `<p>` tags that contain only images;
43
43
  * `:attributes`: whitelist of allowed attributes;
44
- * `:debug`: provide debugging output, defaults false;
44
+ * `:debug`: provide debugging output, defaults false; supports setting a Proc;
45
45
  * `:encoding`: if the page is of a known encoding, you can specify it; if left
46
46
  unspecified, the encoding will be guessed (only in Ruby 1.9.x). If you wish
47
47
  to disable guessing, supply `:do_not_guess_encoding => true`;
data/lib/readability.rb CHANGED
@@ -19,9 +19,10 @@ module Readability
19
19
  :blacklist => nil,
20
20
  :whitelist => nil,
21
21
  :elements_to_score => ["p", "td", "pre"],
22
- :likely_siblings => ["p"]
22
+ :likely_siblings => ["p"],
23
+ :ignore_redundant_nesting => false
23
24
  }.freeze
24
-
25
+
25
26
  REGEXES = {
26
27
  :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
27
28
  :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
@@ -35,7 +36,7 @@ module Readability
35
36
  :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
36
37
  :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
37
38
  }
38
-
39
+
39
40
  attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
40
41
 
41
42
  def initialize(input, options = {})
@@ -50,7 +51,7 @@ module Readability
50
51
  @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
51
52
  @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
52
53
  @weight_classes = @options[:weight_classes]
53
- @clean_conditionally = @options[:clean_conditionally]
54
+ @clean_conditionally = !!@options[:clean_conditionally]
54
55
  @best_candidate_has_image = true
55
56
  make_html
56
57
  handle_exclusions!(@options[:whitelist], @options[:blacklist])
@@ -145,11 +146,11 @@ module Readability
145
146
 
146
147
  (list_images.empty? and content != @html) ? images(@html, true) : list_images
147
148
  end
148
-
149
+
149
150
  def images_with_fqdn_uris!(source_uri)
150
151
  images_with_fqdn_uris(@html, source_uri)
151
152
  end
152
-
153
+
153
154
  def images_with_fqdn_uris(document = @html.dup, source_uri)
154
155
  uri = URI.parse(source_uri)
155
156
  host = uri.host
@@ -161,7 +162,7 @@ module Readability
161
162
  images = []
162
163
  document.css("img").each do |elem|
163
164
  begin
164
- elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
165
+ elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
165
166
  images << elem['src'].to_s
166
167
  rescue URI::InvalidURIError => exc
167
168
  elem.remove
@@ -264,14 +265,25 @@ module Readability
264
265
  sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
265
266
  downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
266
267
  output = Nokogiri::XML::Node.new('div', @html)
267
- best_candidate[:elem].parent.children.each do |sibling|
268
+
269
+ # If the best candidate is the only element in its parent then we will never find any siblings. Therefore,
270
+ # find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the
271
+ # related content detection, but could lead to false positives. Not supported in arc90's readability.
272
+ node =
273
+ if options[:ignore_redundant_nesting]
274
+ closest_node_with_siblings(best_candidate[:elem])
275
+ else
276
+ best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability.
277
+ end
278
+
279
+ node.parent.children.each do |sibling|
268
280
  append = false
269
- append = true if sibling == best_candidate[:elem]
281
+ append = true if sibling == node
270
282
  append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
271
283
 
272
284
  if downcased_likely_siblings.include?(sibling.name.downcase)
273
285
  link_density = get_link_density(sibling)
274
- node_content = sibling.text
286
+ node_content = sibling.text.strip
275
287
  node_length = node_content.length
276
288
 
277
289
  append = if node_length > 80 && link_density < 0.25
@@ -291,6 +303,23 @@ module Readability
291
303
  output
292
304
  end
293
305
 
306
+ def closest_node_with_siblings(element)
307
+ node = element
308
+
309
+ until node.node_name == 'body'
310
+ siblings = node.parent.children
311
+ non_empty = siblings.reject { |sibling| sibling.text? && sibling.text.strip.empty? }
312
+
313
+ if non_empty.size > 1
314
+ return node
315
+ else
316
+ node = node.parent
317
+ end
318
+ end
319
+
320
+ node
321
+ end
322
+
294
323
  def select_best_candidate(candidates)
295
324
  sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
296
325
 
@@ -372,7 +401,11 @@ module Readability
372
401
  end
373
402
 
374
403
  def debug(str)
375
- puts str if options[:debug]
404
+ if options[:debug].respond_to?(:call)
405
+ options[:debug].call(str)
406
+ elsif options[:debug]
407
+ puts str
408
+ end
376
409
  end
377
410
 
378
411
  def remove_unlikely_candidates!
@@ -426,7 +459,8 @@ module Readability
426
459
 
427
460
  # We'll sanitize all elements using a whitelist
428
461
  base_whitelist = @options[:tags] || %w[div p]
429
- all_whitelisted = base_whitelist.include?("*")
462
+ all_tags_whitelisted = base_whitelist.include?("*")
463
+ all_attr_whitelisted = @options[:attributes] && @options[:attributes].include?("*")
430
464
 
431
465
  # We'll add whitespace instead of block elements,
432
466
  # so a<br>b will have a nice space between them
@@ -440,8 +474,8 @@ module Readability
440
474
 
441
475
  ([node] + node.css("*")).each do |el|
442
476
  # If element is in whitelist, delete all its attributes
443
- if all_whitelisted || whitelist[el.node_name]
444
- el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
477
+ if all_tags_whitelisted || whitelist[el.node_name]
478
+ el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } unless all_attr_whitelisted
445
479
 
446
480
  # Otherwise, replace the element with its contents
447
481
  else
@@ -470,30 +504,43 @@ module Readability
470
504
 
471
505
  def clean_conditionally(node, candidates, selector)
472
506
  return unless @clean_conditionally
507
+
473
508
  node.css(selector).each do |el|
474
509
  weight = class_weight(el)
475
510
  content_score = candidates[el] ? candidates[el][:content_score] : 0
476
511
  name = el.name.downcase
477
-
512
+ remove = false
513
+ message = nil
514
+
478
515
  if weight + content_score < 0
479
- el.remove
480
- debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
516
+ remove = true
517
+ message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero."
481
518
  elsif el.text.count(",") < 10
482
519
  counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
483
520
  counts["li"] -= 100
484
521
 
485
522
  # For every img under a noscript tag discount one from the count to avoid double counting
486
523
  counts["img"] -= el.css("noscript").css("img").length
487
-
524
+
488
525
  content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
489
526
  link_density = get_link_density(el)
490
527
 
491
528
  reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
492
529
  if reason
493
- debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
494
- el.remove
530
+ message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}."
531
+ remove = true
495
532
  end
496
533
  end
534
+
535
+ if options[:clean_conditionally].respond_to?(:call)
536
+ context = { remove: remove, message: message, weight: weight, content_score: content_score, el: el }
537
+ remove = options[:clean_conditionally].call(context) # Allow the user to override the decision for whether to remove the element.
538
+ end
539
+
540
+ if remove
541
+ debug(message || "Conditionally cleaned by user-specified function.")
542
+ el.remove
543
+ end
497
544
  end
498
545
  end
499
546
 
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "ruby-readability"
6
- s.version = '0.7.1'
6
+ s.version = '0.7.2'
7
7
  s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
8
8
  s.email = ["andrew@iterationlabs.com"]
9
9
  s.homepage = "http://github.com/cantino/ruby-readability"
@@ -115,6 +115,11 @@ describe Readability do
115
115
  expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
116
116
  end
117
117
 
118
+ it "should be able to whitelist all attributes" do
119
+ @doc = Readability::Document.new(@nested, attributes: ["*"], tags: ["*"])
120
+ expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
121
+ end
122
+
118
123
  it "should not try to download local images" do
119
124
  @doc = Readability::Document.new(<<-HTML)
120
125
  <html>
@@ -498,6 +503,39 @@ describe Readability do
498
503
  <p>This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.</p>
499
504
  <p>The likely_siblings now include the section tag so it should be included in the output.</p>
500
505
  </section>
506
+ <section>
507
+ <p>too short when stripped </p>
508
+ </section>
509
+ #{'<a href="/">This link lowers the body score.</a>' * 5}
510
+ </body>
511
+ </html>
512
+ HTML
513
+
514
+ expect(@doc.content).to include("Paragraph 1")
515
+ expect(@doc.content).to include("Paragraph 2")
516
+ expect(@doc.content).to include("should be included")
517
+ expect(@doc.content).not_to include("too short when stripped")
518
+ end
519
+
520
+ it "climbs the DOM tree to the closest ancestor that has siblings when checking for related siblings" do
521
+ @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"], ignore_redundant_nesting: true)
522
+ <html>
523
+ <head>
524
+ <title>title!</title>
525
+ </head>
526
+ <body>
527
+ <div> <!-- This is the closest node of the best candidate that has siblings. -->
528
+ <div>
529
+ <section>
530
+ <p>Paragraph 1</p>
531
+ #{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
532
+ </section>
533
+ </div>
534
+ </div>
535
+ <section>
536
+ <p>This paragraph is longer than 80 characters and inside a section that is a sibling of the ancestor node.</p>
537
+ <p>The likely_siblings now include the section tag so it should be included in the output.</p>
538
+ </section>
501
539
  #{'<a href="/">This link lowers the body score.</a>' * 5}
502
540
  </body>
503
541
  </html>
@@ -739,11 +777,33 @@ describe Readability do
739
777
  end
740
778
 
741
779
  describe "clean_conditionally_reason?" do
742
- let (:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
780
+ let(:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
743
781
 
744
782
  it "does not raise error" do
745
783
  @doc = Readability::Document.new(list_fixture)
746
784
  expect { @doc.content }.to_not raise_error
747
785
  end
748
786
  end
787
+
788
+ describe "clean_conditionally" do
789
+ let(:fixture) { "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>" }
790
+
791
+ it "can set a clean_conditionally function to allow overriding the default decision" do
792
+ clean_conditionally_fn = lambda { |context| !context[:remove] } # Flip the decision.
793
+ content = Readability::Document.new(fixture, clean_conditionally: clean_conditionally_fn, min_text_length: 0, retry_length: 1).content
794
+
795
+ expect(content).to include("sidebar")
796
+ expect(content).not_to include('Some content')
797
+ end
798
+ end
799
+
800
+ describe "debug" do
801
+ it "can set a debug function, e.g. to send output to Rails logger" do
802
+ output = []
803
+ debug_fn = lambda { |str| output << str }
804
+
805
+ Readability::Document.new(@simple_html_fixture, debug: debug_fn).content
806
+ expect(output).not_to be_empty
807
+ end
808
+ end
749
809
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.1
4
+ version: 0.7.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Cantino
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2024-06-11 00:00:00.000000000 Z
14
+ date: 2024-08-29 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rspec
@@ -134,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
134
134
  - !ruby/object:Gem::Version
135
135
  version: '0'
136
136
  requirements: []
137
- rubygems_version: 3.5.10
137
+ rubygems_version: 3.5.14
138
138
  signing_key:
139
139
  specification_version: 4
140
140
  summary: Port of arc90's readability project to ruby