ruby-readability 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/readability.rb +67 -20
- data/ruby-readability.gemspec +1 -1
- data/spec/readability_spec.rb +61 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f83eb55e4c0c4c30ad54e8e7104d68da8a5eb2b4d9cc76b45255055d89bf4b5c
|
4
|
+
data.tar.gz: 4d003c39b589477449bedd34634c5482dd503e94bfe24b9a5c29ea94f9b49f83
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e799e831297b18b381c3b1caad19531f99fe084f640afbddd1cf91e75fe234d3af4618f07e02a0c6214824726e3afe79accbb8ea5f0d66d9117b13112d22e8ef
|
7
|
+
data.tar.gz: 404d3a1bc702f3bd609e8c3ba8e37d6f023b2a3c126c278e7463a3dfee1cc5bf683f6c0c75cfabbb14e477f582b33cc8204d8682f33ed9a235b6fac8e90d9ad2
|
data/README.md
CHANGED
@@ -41,7 +41,7 @@ You may provide options to `Readability::Document.new`, including:
|
|
41
41
|
* `:remove_empty_nodes`: remove `<p>` tags that have no text content; also
|
42
42
|
removes `<p>` tags that contain only images;
|
43
43
|
* `:attributes`: whitelist of allowed attributes;
|
44
|
-
* `:debug`: provide debugging output, defaults false;
|
44
|
+
* `:debug`: provide debugging output, defaults false; supports setting a Proc;
|
45
45
|
* `:encoding`: if the page is of a known encoding, you can specify it; if left
|
46
46
|
unspecified, the encoding will be guessed (only in Ruby 1.9.x). If you wish
|
47
47
|
to disable guessing, supply `:do_not_guess_encoding => true`;
|
data/lib/readability.rb
CHANGED
@@ -19,9 +19,10 @@ module Readability
|
|
19
19
|
:blacklist => nil,
|
20
20
|
:whitelist => nil,
|
21
21
|
:elements_to_score => ["p", "td", "pre"],
|
22
|
-
:likely_siblings => ["p"]
|
22
|
+
:likely_siblings => ["p"],
|
23
|
+
:ignore_redundant_nesting => false
|
23
24
|
}.freeze
|
24
|
-
|
25
|
+
|
25
26
|
REGEXES = {
|
26
27
|
:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
27
28
|
:okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
|
@@ -35,7 +36,7 @@ module Readability
|
|
35
36
|
:killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
|
36
37
|
:videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
|
37
38
|
}
|
38
|
-
|
39
|
+
|
39
40
|
attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
|
40
41
|
|
41
42
|
def initialize(input, options = {})
|
@@ -50,7 +51,7 @@ module Readability
|
|
50
51
|
@input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
|
51
52
|
@remove_unlikely_candidates = @options[:remove_unlikely_candidates]
|
52
53
|
@weight_classes = @options[:weight_classes]
|
53
|
-
@clean_conditionally = @options[:clean_conditionally]
|
54
|
+
@clean_conditionally = !!@options[:clean_conditionally]
|
54
55
|
@best_candidate_has_image = true
|
55
56
|
make_html
|
56
57
|
handle_exclusions!(@options[:whitelist], @options[:blacklist])
|
@@ -145,11 +146,11 @@ module Readability
|
|
145
146
|
|
146
147
|
(list_images.empty? and content != @html) ? images(@html, true) : list_images
|
147
148
|
end
|
148
|
-
|
149
|
+
|
149
150
|
def images_with_fqdn_uris!(source_uri)
|
150
151
|
images_with_fqdn_uris(@html, source_uri)
|
151
152
|
end
|
152
|
-
|
153
|
+
|
153
154
|
def images_with_fqdn_uris(document = @html.dup, source_uri)
|
154
155
|
uri = URI.parse(source_uri)
|
155
156
|
host = uri.host
|
@@ -161,7 +162,7 @@ module Readability
|
|
161
162
|
images = []
|
162
163
|
document.css("img").each do |elem|
|
163
164
|
begin
|
164
|
-
elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
165
|
+
elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
165
166
|
images << elem['src'].to_s
|
166
167
|
rescue URI::InvalidURIError => exc
|
167
168
|
elem.remove
|
@@ -264,14 +265,25 @@ module Readability
|
|
264
265
|
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
|
265
266
|
downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
|
266
267
|
output = Nokogiri::XML::Node.new('div', @html)
|
267
|
-
|
268
|
+
|
269
|
+
# If the best candidate is the only element in its parent then we will never find any siblings. Therefore,
|
270
|
+
# find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the
|
271
|
+
# related content detection, but could lead to false positives. Not supported in arc90's readability.
|
272
|
+
node =
|
273
|
+
if options[:ignore_redundant_nesting]
|
274
|
+
closest_node_with_siblings(best_candidate[:elem])
|
275
|
+
else
|
276
|
+
best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability.
|
277
|
+
end
|
278
|
+
|
279
|
+
node.parent.children.each do |sibling|
|
268
280
|
append = false
|
269
|
-
append = true if sibling == best_candidate[:elem]
|
281
|
+
append = true if sibling == node
|
270
282
|
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
|
271
283
|
|
272
284
|
if downcased_likely_siblings.include?(sibling.name.downcase)
|
273
285
|
link_density = get_link_density(sibling)
|
274
|
-
node_content = sibling.text
|
286
|
+
node_content = sibling.text.strip
|
275
287
|
node_length = node_content.length
|
276
288
|
|
277
289
|
append = if node_length > 80 && link_density < 0.25
|
@@ -291,6 +303,23 @@ module Readability
|
|
291
303
|
output
|
292
304
|
end
|
293
305
|
|
306
|
+
def closest_node_with_siblings(element)
|
307
|
+
node = element
|
308
|
+
|
309
|
+
until node.node_name == 'body'
|
310
|
+
siblings = node.parent.children
|
311
|
+
non_empty = siblings.reject { |sibling| sibling.text? && sibling.text.strip.empty? }
|
312
|
+
|
313
|
+
if non_empty.size > 1
|
314
|
+
return node
|
315
|
+
else
|
316
|
+
node = node.parent
|
317
|
+
end
|
318
|
+
end
|
319
|
+
|
320
|
+
node
|
321
|
+
end
|
322
|
+
|
294
323
|
def select_best_candidate(candidates)
|
295
324
|
sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
|
296
325
|
|
@@ -372,7 +401,11 @@ module Readability
|
|
372
401
|
end
|
373
402
|
|
374
403
|
def debug(str)
|
375
|
-
|
404
|
+
if options[:debug].respond_to?(:call)
|
405
|
+
options[:debug].call(str)
|
406
|
+
elsif options[:debug]
|
407
|
+
puts str
|
408
|
+
end
|
376
409
|
end
|
377
410
|
|
378
411
|
def remove_unlikely_candidates!
|
@@ -426,7 +459,8 @@ module Readability
|
|
426
459
|
|
427
460
|
# We'll sanitize all elements using a whitelist
|
428
461
|
base_whitelist = @options[:tags] || %w[div p]
|
429
|
-
|
462
|
+
all_tags_whitelisted = base_whitelist.include?("*")
|
463
|
+
all_attr_whitelisted = @options[:attributes] && @options[:attributes].include?("*")
|
430
464
|
|
431
465
|
# We'll add whitespace instead of block elements,
|
432
466
|
# so a<br>b will have a nice space between them
|
@@ -440,8 +474,8 @@ module Readability
|
|
440
474
|
|
441
475
|
([node] + node.css("*")).each do |el|
|
442
476
|
# If element is in whitelist, delete all its attributes
|
443
|
-
if whitelist[el.node_name]
|
444
|
-
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
|
477
|
+
if all_tags_whitelisted || whitelist[el.node_name]
|
478
|
+
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } unless all_attr_whitelisted
|
445
479
|
|
446
480
|
# Otherwise, replace the element with its contents
|
447
481
|
else
|
@@ -470,30 +504,43 @@ module Readability
|
|
470
504
|
|
471
505
|
def clean_conditionally(node, candidates, selector)
|
472
506
|
return unless @clean_conditionally
|
507
|
+
|
473
508
|
node.css(selector).each do |el|
|
474
509
|
weight = class_weight(el)
|
475
510
|
content_score = candidates[el] ? candidates[el][:content_score] : 0
|
476
511
|
name = el.name.downcase
|
477
|
-
|
512
|
+
remove = false
|
513
|
+
message = nil
|
514
|
+
|
478
515
|
if weight + content_score < 0
|
479
|
-
|
480
|
-
|
516
|
+
remove = true
|
517
|
+
message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero."
|
481
518
|
elsif el.text.count(",") < 10
|
482
519
|
counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
|
483
520
|
counts["li"] -= 100
|
484
521
|
|
485
522
|
# For every img under a noscript tag discount one from the count to avoid double counting
|
486
523
|
counts["img"] -= el.css("noscript").css("img").length
|
487
|
-
|
524
|
+
|
488
525
|
content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
|
489
526
|
link_density = get_link_density(el)
|
490
527
|
|
491
528
|
reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
|
492
529
|
if reason
|
493
|
-
|
494
|
-
|
530
|
+
message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}."
|
531
|
+
remove = true
|
495
532
|
end
|
496
533
|
end
|
534
|
+
|
535
|
+
if options[:clean_conditionally].respond_to?(:call)
|
536
|
+
context = { remove: remove, message: message, weight: weight, content_score: content_score, el: el }
|
537
|
+
remove = options[:clean_conditionally].call(context) # Allow the user to override the decision for whether to remove the element.
|
538
|
+
end
|
539
|
+
|
540
|
+
if remove
|
541
|
+
debug(message || "Conditionally cleaned by user-specified function.")
|
542
|
+
el.remove
|
543
|
+
end
|
497
544
|
end
|
498
545
|
end
|
499
546
|
|
data/ruby-readability.gemspec
CHANGED
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "ruby-readability"
|
6
|
-
s.version = '0.7.1'
|
6
|
+
s.version = '0.7.2'
|
7
7
|
s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
|
8
8
|
s.email = ["andrew@iterationlabs.com"]
|
9
9
|
s.homepage = "http://github.com/cantino/ruby-readability"
|
data/spec/readability_spec.rb
CHANGED
@@ -115,6 +115,11 @@ describe Readability do
|
|
115
115
|
expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
|
116
116
|
end
|
117
117
|
|
118
|
+
it "should be able to whitelist all attributes" do
|
119
|
+
@doc = Readability::Document.new(@nested, attributes: ["*"], tags: ["*"])
|
120
|
+
expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
|
121
|
+
end
|
122
|
+
|
118
123
|
it "should not try to download local images" do
|
119
124
|
@doc = Readability::Document.new(<<-HTML)
|
120
125
|
<html>
|
@@ -498,6 +503,39 @@ describe Readability do
|
|
498
503
|
<p>This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.</p>
|
499
504
|
<p>The likely_siblings now include the section tag so it should be included in the output.</p>
|
500
505
|
</section>
|
506
|
+
<section>
|
507
|
+
<p>too short when stripped </p>
|
508
|
+
</section>
|
509
|
+
#{'<a href="/">This link lowers the body score.</a>' * 5}
|
510
|
+
</body>
|
511
|
+
</html>
|
512
|
+
HTML
|
513
|
+
|
514
|
+
expect(@doc.content).to include("Paragraph 1")
|
515
|
+
expect(@doc.content).to include("Paragraph 2")
|
516
|
+
expect(@doc.content).to include("should be included")
|
517
|
+
expect(@doc.content).not_to include("too short when stripped")
|
518
|
+
end
|
519
|
+
|
520
|
+
it "climbs the DOM tree to the closest ancestor that has siblings when checking for related siblings" do
|
521
|
+
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"], ignore_redundant_nesting: true)
|
522
|
+
<html>
|
523
|
+
<head>
|
524
|
+
<title>title!</title>
|
525
|
+
</head>
|
526
|
+
<body>
|
527
|
+
<div> <!-- This is the closest node of the best candidate that has siblings. -->
|
528
|
+
<div>
|
529
|
+
<section>
|
530
|
+
<p>Paragraph 1</p>
|
531
|
+
#{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
|
532
|
+
</section>
|
533
|
+
</div>
|
534
|
+
</div>
|
535
|
+
<section>
|
536
|
+
<p>This paragraph is longer than 80 characters and inside a section that is a sibling of the ancestor node.</p>
|
537
|
+
<p>The likely_siblings now include the section tag so it should be included in the output.</p>
|
538
|
+
</section>
|
501
539
|
#{'<a href="/">This link lowers the body score.</a>' * 5}
|
502
540
|
</body>
|
503
541
|
</html>
|
@@ -739,11 +777,33 @@ describe Readability do
|
|
739
777
|
end
|
740
778
|
|
741
779
|
describe "clean_conditionally_reason?" do
|
742
|
-
let
|
780
|
+
let(:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
|
743
781
|
|
744
782
|
it "does not raise error" do
|
745
783
|
@doc = Readability::Document.new(list_fixture)
|
746
784
|
expect { @doc.content }.to_not raise_error
|
747
785
|
end
|
748
786
|
end
|
787
|
+
|
788
|
+
describe "clean_conditionally" do
|
789
|
+
let(:fixture) { "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>" }
|
790
|
+
|
791
|
+
it "can set a clean_conditionally function to allow overriding the default decision" do
|
792
|
+
clean_conditionally_fn = lambda { |context| !context[:remove] } # Flip the decision.
|
793
|
+
content = Readability::Document.new(fixture, clean_conditionally: clean_conditionally_fn, min_text_length: 0, retry_length: 1).content
|
794
|
+
|
795
|
+
expect(content).to include("sidebar")
|
796
|
+
expect(content).not_to include('Some content')
|
797
|
+
end
|
798
|
+
end
|
799
|
+
|
800
|
+
describe "debug" do
|
801
|
+
it "can set a debug function, e.g. to send output to Rails logger" do
|
802
|
+
output = []
|
803
|
+
debug_fn = lambda { |str| output << str }
|
804
|
+
|
805
|
+
Readability::Document.new(@simple_html_fixture, debug: debug_fn).content
|
806
|
+
expect(output).not_to be_empty
|
807
|
+
end
|
808
|
+
end
|
749
809
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.1
|
4
|
+
version: 0.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Cantino
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2024-
|
14
|
+
date: 2024-08-29 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rspec
|
@@ -134,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
134
134
|
- !ruby/object:Gem::Version
|
135
135
|
version: '0'
|
136
136
|
requirements: []
|
137
|
-
rubygems_version: 3.5.
|
137
|
+
rubygems_version: 3.5.14
|
138
138
|
signing_key:
|
139
139
|
specification_version: 4
|
140
140
|
summary: Port of arc90's readability project to ruby
|