ruby-readability 0.7.1 → 0.7.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/readability.rb +67 -20
- data/ruby-readability.gemspec +1 -1
- data/spec/readability_spec.rb +61 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f83eb55e4c0c4c30ad54e8e7104d68da8a5eb2b4d9cc76b45255055d89bf4b5c
|
4
|
+
data.tar.gz: 4d003c39b589477449bedd34634c5482dd503e94bfe24b9a5c29ea94f9b49f83
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e799e831297b18b381c3b1caad19531f99fe084f640afbddd1cf91e75fe234d3af4618f07e02a0c6214824726e3afe79accbb8ea5f0d66d9117b13112d22e8ef
|
7
|
+
data.tar.gz: 404d3a1bc702f3bd609e8c3ba8e37d6f023b2a3c126c278e7463a3dfee1cc5bf683f6c0c75cfabbb14e477f582b33cc8204d8682f33ed9a235b6fac8e90d9ad2
|
data/README.md
CHANGED
@@ -41,7 +41,7 @@ You may provide options to `Readability::Document.new`, including:
|
|
41
41
|
* `:remove_empty_nodes`: remove `<p>` tags that have no text content; also
|
42
42
|
removes `<p>` tags that contain only images;
|
43
43
|
* `:attributes`: whitelist of allowed attributes;
|
44
|
-
* `:debug`: provide debugging output, defaults false;
|
44
|
+
* `:debug`: provide debugging output, defaults false; supports setting a Proc;
|
45
45
|
* `:encoding`: if the page is of a known encoding, you can specify it; if left
|
46
46
|
unspecified, the encoding will be guessed (only in Ruby 1.9.x). If you wish
|
47
47
|
to disable guessing, supply `:do_not_guess_encoding => true`;
|
data/lib/readability.rb
CHANGED
@@ -19,9 +19,10 @@ module Readability
|
|
19
19
|
:blacklist => nil,
|
20
20
|
:whitelist => nil,
|
21
21
|
:elements_to_score => ["p", "td", "pre"],
|
22
|
-
:likely_siblings => ["p"]
|
22
|
+
:likely_siblings => ["p"],
|
23
|
+
:ignore_redundant_nesting => false
|
23
24
|
}.freeze
|
24
|
-
|
25
|
+
|
25
26
|
REGEXES = {
|
26
27
|
:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
27
28
|
:okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
|
@@ -35,7 +36,7 @@ module Readability
|
|
35
36
|
:killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
|
36
37
|
:videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
|
37
38
|
}
|
38
|
-
|
39
|
+
|
39
40
|
attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
|
40
41
|
|
41
42
|
def initialize(input, options = {})
|
@@ -50,7 +51,7 @@ module Readability
|
|
50
51
|
@input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
|
51
52
|
@remove_unlikely_candidates = @options[:remove_unlikely_candidates]
|
52
53
|
@weight_classes = @options[:weight_classes]
|
53
|
-
@clean_conditionally = @options[:clean_conditionally]
|
54
|
+
@clean_conditionally = !!@options[:clean_conditionally]
|
54
55
|
@best_candidate_has_image = true
|
55
56
|
make_html
|
56
57
|
handle_exclusions!(@options[:whitelist], @options[:blacklist])
|
@@ -145,11 +146,11 @@ module Readability
|
|
145
146
|
|
146
147
|
(list_images.empty? and content != @html) ? images(@html, true) : list_images
|
147
148
|
end
|
148
|
-
|
149
|
+
|
149
150
|
def images_with_fqdn_uris!(source_uri)
|
150
151
|
images_with_fqdn_uris(@html, source_uri)
|
151
152
|
end
|
152
|
-
|
153
|
+
|
153
154
|
def images_with_fqdn_uris(document = @html.dup, source_uri)
|
154
155
|
uri = URI.parse(source_uri)
|
155
156
|
host = uri.host
|
@@ -161,7 +162,7 @@ module Readability
|
|
161
162
|
images = []
|
162
163
|
document.css("img").each do |elem|
|
163
164
|
begin
|
164
|
-
elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
165
|
+
elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
165
166
|
images << elem['src'].to_s
|
166
167
|
rescue URI::InvalidURIError => exc
|
167
168
|
elem.remove
|
@@ -264,14 +265,25 @@ module Readability
|
|
264
265
|
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
|
265
266
|
downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
|
266
267
|
output = Nokogiri::XML::Node.new('div', @html)
|
267
|
-
|
268
|
+
|
269
|
+
# If the best candidate is the only element in its parent then we will never find any siblings. Therefore,
|
270
|
+
# find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the
|
271
|
+
# related content detection, but could lead to false positives. Not supported in arc90's readability.
|
272
|
+
node =
|
273
|
+
if options[:ignore_redundant_nesting]
|
274
|
+
closest_node_with_siblings(best_candidate[:elem])
|
275
|
+
else
|
276
|
+
best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability.
|
277
|
+
end
|
278
|
+
|
279
|
+
node.parent.children.each do |sibling|
|
268
280
|
append = false
|
269
|
-
append = true if sibling == best_candidate[:elem]
|
281
|
+
append = true if sibling == node
|
270
282
|
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
|
271
283
|
|
272
284
|
if downcased_likely_siblings.include?(sibling.name.downcase)
|
273
285
|
link_density = get_link_density(sibling)
|
274
|
-
node_content = sibling.text
|
286
|
+
node_content = sibling.text.strip
|
275
287
|
node_length = node_content.length
|
276
288
|
|
277
289
|
append = if node_length > 80 && link_density < 0.25
|
@@ -291,6 +303,23 @@ module Readability
|
|
291
303
|
output
|
292
304
|
end
|
293
305
|
|
306
|
+
def closest_node_with_siblings(element)
|
307
|
+
node = element
|
308
|
+
|
309
|
+
until node.node_name == 'body'
|
310
|
+
siblings = node.parent.children
|
311
|
+
non_empty = siblings.reject { |sibling| sibling.text? && sibling.text.strip.empty? }
|
312
|
+
|
313
|
+
if non_empty.size > 1
|
314
|
+
return node
|
315
|
+
else
|
316
|
+
node = node.parent
|
317
|
+
end
|
318
|
+
end
|
319
|
+
|
320
|
+
node
|
321
|
+
end
|
322
|
+
|
294
323
|
def select_best_candidate(candidates)
|
295
324
|
sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
|
296
325
|
|
@@ -372,7 +401,11 @@ module Readability
|
|
372
401
|
end
|
373
402
|
|
374
403
|
def debug(str)
|
375
|
-
|
404
|
+
if options[:debug].respond_to?(:call)
|
405
|
+
options[:debug].call(str)
|
406
|
+
elsif options[:debug]
|
407
|
+
puts str
|
408
|
+
end
|
376
409
|
end
|
377
410
|
|
378
411
|
def remove_unlikely_candidates!
|
@@ -426,7 +459,8 @@ module Readability
|
|
426
459
|
|
427
460
|
# We'll sanitize all elements using a whitelist
|
428
461
|
base_whitelist = @options[:tags] || %w[div p]
|
429
|
-
|
462
|
+
all_tags_whitelisted = base_whitelist.include?("*")
|
463
|
+
all_attr_whitelisted = @options[:attributes] && @options[:attributes].include?("*")
|
430
464
|
|
431
465
|
# We'll add whitespace instead of block elements,
|
432
466
|
# so a<br>b will have a nice space between them
|
@@ -440,8 +474,8 @@ module Readability
|
|
440
474
|
|
441
475
|
([node] + node.css("*")).each do |el|
|
442
476
|
# If element is in whitelist, delete all its attributes
|
443
|
-
if whitelist[el.node_name]
|
444
|
-
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
|
477
|
+
if all_tags_whitelisted || whitelist[el.node_name]
|
478
|
+
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } unless all_attr_whitelisted
|
445
479
|
|
446
480
|
# Otherwise, replace the element with its contents
|
447
481
|
else
|
@@ -470,30 +504,43 @@ module Readability
|
|
470
504
|
|
471
505
|
def clean_conditionally(node, candidates, selector)
|
472
506
|
return unless @clean_conditionally
|
507
|
+
|
473
508
|
node.css(selector).each do |el|
|
474
509
|
weight = class_weight(el)
|
475
510
|
content_score = candidates[el] ? candidates[el][:content_score] : 0
|
476
511
|
name = el.name.downcase
|
477
|
-
|
512
|
+
remove = false
|
513
|
+
message = nil
|
514
|
+
|
478
515
|
if weight + content_score < 0
|
479
|
-
|
480
|
-
|
516
|
+
remove = true
|
517
|
+
message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero."
|
481
518
|
elsif el.text.count(",") < 10
|
482
519
|
counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
|
483
520
|
counts["li"] -= 100
|
484
521
|
|
485
522
|
# For every img under a noscript tag discount one from the count to avoid double counting
|
486
523
|
counts["img"] -= el.css("noscript").css("img").length
|
487
|
-
|
524
|
+
|
488
525
|
content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
|
489
526
|
link_density = get_link_density(el)
|
490
527
|
|
491
528
|
reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
|
492
529
|
if reason
|
493
|
-
|
494
|
-
|
530
|
+
message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}."
|
531
|
+
remove = true
|
495
532
|
end
|
496
533
|
end
|
534
|
+
|
535
|
+
if options[:clean_conditionally].respond_to?(:call)
|
536
|
+
context = { remove: remove, message: message, weight: weight, content_score: content_score, el: el }
|
537
|
+
remove = options[:clean_conditionally].call(context) # Allow the user to override the decision for whether to remove the element.
|
538
|
+
end
|
539
|
+
|
540
|
+
if remove
|
541
|
+
debug(message || "Conditionally cleaned by user-specified function.")
|
542
|
+
el.remove
|
543
|
+
end
|
497
544
|
end
|
498
545
|
end
|
499
546
|
|
data/ruby-readability.gemspec
CHANGED
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "ruby-readability"
|
6
|
-
s.version = '0.7.1'
|
6
|
+
s.version = '0.7.2'
|
7
7
|
s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
|
8
8
|
s.email = ["andrew@iterationlabs.com"]
|
9
9
|
s.homepage = "http://github.com/cantino/ruby-readability"
|
data/spec/readability_spec.rb
CHANGED
@@ -115,6 +115,11 @@ describe Readability do
|
|
115
115
|
expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
|
116
116
|
end
|
117
117
|
|
118
|
+
it "should be able to whitelist all attributes" do
|
119
|
+
@doc = Readability::Document.new(@nested, attributes: ["*"], tags: ["*"])
|
120
|
+
expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
|
121
|
+
end
|
122
|
+
|
118
123
|
it "should not try to download local images" do
|
119
124
|
@doc = Readability::Document.new(<<-HTML)
|
120
125
|
<html>
|
@@ -498,6 +503,39 @@ describe Readability do
|
|
498
503
|
<p>This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.</p>
|
499
504
|
<p>The likely_siblings now include the section tag so it should be included in the output.</p>
|
500
505
|
</section>
|
506
|
+
<section>
|
507
|
+
<p>too short when stripped </p>
|
508
|
+
</section>
|
509
|
+
#{'<a href="/">This link lowers the body score.</a>' * 5}
|
510
|
+
</body>
|
511
|
+
</html>
|
512
|
+
HTML
|
513
|
+
|
514
|
+
expect(@doc.content).to include("Paragraph 1")
|
515
|
+
expect(@doc.content).to include("Paragraph 2")
|
516
|
+
expect(@doc.content).to include("should be included")
|
517
|
+
expect(@doc.content).not_to include("too short when stripped")
|
518
|
+
end
|
519
|
+
|
520
|
+
it "climbs the DOM tree to the closest ancestor that has siblings when checking for related siblings" do
|
521
|
+
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"], ignore_redundant_nesting: true)
|
522
|
+
<html>
|
523
|
+
<head>
|
524
|
+
<title>title!</title>
|
525
|
+
</head>
|
526
|
+
<body>
|
527
|
+
<div> <!-- This is the closest node of the best candidate that has siblings. -->
|
528
|
+
<div>
|
529
|
+
<section>
|
530
|
+
<p>Paragraph 1</p>
|
531
|
+
#{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
|
532
|
+
</section>
|
533
|
+
</div>
|
534
|
+
</div>
|
535
|
+
<section>
|
536
|
+
<p>This paragraph is longer than 80 characters and inside a section that is a sibling of the ancestor node.</p>
|
537
|
+
<p>The likely_siblings now include the section tag so it should be included in the output.</p>
|
538
|
+
</section>
|
501
539
|
#{'<a href="/">This link lowers the body score.</a>' * 5}
|
502
540
|
</body>
|
503
541
|
</html>
|
@@ -739,11 +777,33 @@ describe Readability do
|
|
739
777
|
end
|
740
778
|
|
741
779
|
describe "clean_conditionally_reason?" do
|
742
|
-
let
|
780
|
+
let(:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
|
743
781
|
|
744
782
|
it "does not raise error" do
|
745
783
|
@doc = Readability::Document.new(list_fixture)
|
746
784
|
expect { @doc.content }.to_not raise_error
|
747
785
|
end
|
748
786
|
end
|
787
|
+
|
788
|
+
describe "clean_conditionally" do
|
789
|
+
let(:fixture) { "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>" }
|
790
|
+
|
791
|
+
it "can set a clean_conditionally function to allow overriding the default decision" do
|
792
|
+
clean_conditionally_fn = lambda { |context| !context[:remove] } # Flip the decision.
|
793
|
+
content = Readability::Document.new(fixture, clean_conditionally: clean_conditionally_fn, min_text_length: 0, retry_length: 1).content
|
794
|
+
|
795
|
+
expect(content).to include("sidebar")
|
796
|
+
expect(content).not_to include('Some content')
|
797
|
+
end
|
798
|
+
end
|
799
|
+
|
800
|
+
describe "debug" do
|
801
|
+
it "can set a debug function, e.g. to send output to Rails logger" do
|
802
|
+
output = []
|
803
|
+
debug_fn = lambda { |str| output << str }
|
804
|
+
|
805
|
+
Readability::Document.new(@simple_html_fixture, debug: debug_fn).content
|
806
|
+
expect(output).not_to be_empty
|
807
|
+
end
|
808
|
+
end
|
749
809
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.1
|
4
|
+
version: 0.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Cantino
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2024-
|
14
|
+
date: 2024-08-29 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rspec
|
@@ -134,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
134
134
|
- !ruby/object:Gem::Version
|
135
135
|
version: '0'
|
136
136
|
requirements: []
|
137
|
-
rubygems_version: 3.5.
|
137
|
+
rubygems_version: 3.5.14
|
138
138
|
signing_key:
|
139
139
|
specification_version: 4
|
140
140
|
summary: Port of arc90's readability project to ruby
|