readability-rb 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/readability/cleaner.rb +10 -8
- data/lib/readability/document.rb +38 -7
- data/lib/readability/scoring.rb +2 -2
- data/lib/readability/utils.rb +1 -1
- data/lib/readability/version.rb +1 -1
- metadata +4 -7
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b7c8a5adc3a628af9f665a4a90129612b5be891797f0b90681ed96be5566a9fb
|
|
4
|
+
data.tar.gz: 76eb683b6a38605b637cbcbda8c25b96ef3060f7f8d277acfd3f4885537a6708
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 94f81c5b0502338b55ed611a6c289dfbd8250b1c058df5b45dff87bd126bcff2038518e85da4dfff0f6baea366f4cdb76305e258936862d063e1f964e621e1bf
|
|
7
|
+
data.tar.gz: ae056bc80680ad1416829e7454f0045c49ac7978450269ea2a5ee29955a7b84a11511100fe4ae2283abba835d4449be3fc22caf037e51fb9cfad4580cc61b5ef
|
data/lib/readability/cleaner.rb
CHANGED
|
@@ -447,11 +447,15 @@ module Readability
|
|
|
447
447
|
next true
|
|
448
448
|
end
|
|
449
449
|
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
450
|
+
inner_text = get_inner_text(node)
|
|
451
|
+
|
|
452
|
+
if inner_text.split(COMMAS).length - 1 < 10
|
|
453
|
+
tag_counts = Hash.new(0)
|
|
454
|
+
node.css("p, img, li, input").each { |n| tag_counts[n.name] += 1 }
|
|
455
|
+
p_count = tag_counts["p"]
|
|
456
|
+
img_count = tag_counts["img"]
|
|
457
|
+
li_count = tag_counts["li"] - 100
|
|
458
|
+
input_count = tag_counts["input"]
|
|
455
459
|
heading_density = get_text_density(node, ["h1", "h2", "h3", "h4", "h5", "h6"])
|
|
456
460
|
|
|
457
461
|
embed_count = 0
|
|
@@ -477,15 +481,13 @@ module Readability
|
|
|
477
481
|
end
|
|
478
482
|
next false if skip_removal
|
|
479
483
|
|
|
480
|
-
inner_text = get_inner_text(node)
|
|
481
|
-
|
|
482
484
|
# Toss any node whose inner text contains nothing but suspicious words
|
|
483
485
|
if AD_WORDS.match?(inner_text) || LOADING_WORDS.match?(inner_text)
|
|
484
486
|
next true
|
|
485
487
|
end
|
|
486
488
|
|
|
487
489
|
content_length = inner_text.length
|
|
488
|
-
link_density = get_link_density(node)
|
|
490
|
+
link_density = get_link_density(node, text_length: content_length)
|
|
489
491
|
textish_tags = %w[span li td] + DIV_TO_P_ELEMS.to_a
|
|
490
492
|
text_density = get_text_density(node, textish_tags)
|
|
491
493
|
is_figure_child = has_ancestor_tag?(node, "figure")
|
data/lib/readability/document.rb
CHANGED
|
@@ -42,9 +42,14 @@ module Readability
|
|
|
42
42
|
def parse
|
|
43
43
|
# Avoid parsing too large documents
|
|
44
44
|
if @max_elems_to_parse > 0
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
45
|
+
count = 0
|
|
46
|
+
@doc.traverse do |n|
|
|
47
|
+
if n.element?
|
|
48
|
+
count += 1
|
|
49
|
+
if count > @max_elems_to_parse
|
|
50
|
+
raise "Aborting parsing document; #{count} elements found"
|
|
51
|
+
end
|
|
52
|
+
end
|
|
48
53
|
end
|
|
49
54
|
end
|
|
50
55
|
|
|
@@ -59,6 +64,9 @@ module Readability
|
|
|
59
64
|
|
|
60
65
|
prep_document
|
|
61
66
|
|
|
67
|
+
# Cache the prepped body HTML for retry re-parsing (avoids innerHTML= cost)
|
|
68
|
+
@prepped_body_html = @doc.at_css("body")&.inner_html
|
|
69
|
+
|
|
62
70
|
metadata = get_article_metadata(json_ld)
|
|
63
71
|
@metadata = metadata
|
|
64
72
|
@article_title = metadata["title"]
|
|
@@ -109,7 +117,9 @@ module Readability
|
|
|
109
117
|
return nil
|
|
110
118
|
end
|
|
111
119
|
|
|
112
|
-
|
|
120
|
+
# Preserve the lang attribute from the HTML element before any retry re-parsing
|
|
121
|
+
preserved_article_lang = @doc.root && @doc.root["lang"]
|
|
122
|
+
preserved_article_dir = @doc.root && @doc.root["dir"]
|
|
113
123
|
|
|
114
124
|
while true
|
|
115
125
|
log("Starting grabArticle loop")
|
|
@@ -497,10 +507,10 @@ module Readability
|
|
|
497
507
|
text_length = get_inner_text(article_content, true).length
|
|
498
508
|
if text_length < @char_threshold
|
|
499
509
|
parse_successful = false
|
|
500
|
-
page.inner_html = page_cache_html
|
|
501
510
|
|
|
511
|
+
# Store serialized HTML instead of node references to avoid pinning old documents
|
|
502
512
|
@attempts << {
|
|
503
|
-
|
|
513
|
+
html: article_content.inner_html,
|
|
504
514
|
text_length: text_length
|
|
505
515
|
}
|
|
506
516
|
|
|
@@ -517,9 +527,30 @@ module Readability
|
|
|
517
527
|
# But first check if we actually have something
|
|
518
528
|
return nil if @attempts[0][:text_length] == 0
|
|
519
529
|
|
|
520
|
-
|
|
530
|
+
# Re-parse the best attempt from serialized HTML
|
|
531
|
+
best_doc = Nokogiri::HTML5("<html><body>#{@attempts[0][:html]}</body></html>")
|
|
532
|
+
best_doc.root["lang"] = preserved_article_lang if preserved_article_lang
|
|
533
|
+
best_doc.root["dir"] = preserved_article_dir if preserved_article_dir
|
|
534
|
+
article_content = best_doc.at_css("body")
|
|
535
|
+
@doc = best_doc
|
|
521
536
|
parse_successful = true
|
|
522
537
|
end
|
|
538
|
+
|
|
539
|
+
unless parse_successful
|
|
540
|
+
# Create a fresh document from the prepped body HTML, allowing the old one to be GC'd
|
|
541
|
+
@doc = Nokogiri::HTML5("<html><head></head><body>#{@prepped_body_html}</body></html>")
|
|
542
|
+
# Restore the lang attribute on the new HTML element so it's picked up during traversal
|
|
543
|
+
@doc.root["lang"] = preserved_article_lang if preserved_article_lang
|
|
544
|
+
@doc.root["dir"] = preserved_article_dir if preserved_article_dir
|
|
545
|
+
page = @doc.at_css("body")
|
|
546
|
+
|
|
547
|
+
# Clear node-referencing instance variables since they point to the old document
|
|
548
|
+
@candidates = {}
|
|
549
|
+
@data_tables = Set.new
|
|
550
|
+
@article_byline = nil
|
|
551
|
+
@article_dir = nil
|
|
552
|
+
@article_lang = preserved_article_lang
|
|
553
|
+
end
|
|
523
554
|
end
|
|
524
555
|
|
|
525
556
|
if parse_successful
|
data/lib/readability/scoring.rb
CHANGED
|
@@ -50,8 +50,8 @@ module Readability
|
|
|
50
50
|
# Port of _getLinkDensity (JS line 2143)
|
|
51
51
|
# Returns the ratio of anchor text length to total text length.
|
|
52
52
|
# Fragment-only links (#...) count at 0.3 coefficient.
|
|
53
|
-
def get_link_density(element)
|
|
54
|
-
text_length
|
|
53
|
+
def get_link_density(element, text_length: nil)
|
|
54
|
+
text_length ||= get_inner_text(element).length
|
|
55
55
|
return 0 if text_length == 0
|
|
56
56
|
|
|
57
57
|
link_length = 0.0
|
data/lib/readability/utils.rb
CHANGED
|
@@ -76,7 +76,7 @@ module Readability
|
|
|
76
76
|
js_trim(node.text).empty? &&
|
|
77
77
|
(node.element_children.empty? ||
|
|
78
78
|
node.element_children.length ==
|
|
79
|
-
node.css("br
|
|
79
|
+
node.css("br, hr").length)
|
|
80
80
|
end
|
|
81
81
|
|
|
82
82
|
# Port of _hasChildBlockElement (JS line 2044)
|
data/lib/readability/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: readability-rb
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andy Croll
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: bin
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
12
11
|
dependencies:
|
|
13
12
|
- !ruby/object:Gem::Dependency
|
|
14
13
|
name: nokogiri
|
|
@@ -52,7 +51,6 @@ metadata:
|
|
|
52
51
|
source_code_uri: https://github.com/andycroll/readability-rb
|
|
53
52
|
changelog_uri: https://github.com/andycroll/readability-rb/commits/main
|
|
54
53
|
bug_tracker_uri: https://github.com/andycroll/readability-rb/issues
|
|
55
|
-
post_install_message:
|
|
56
54
|
rdoc_options: []
|
|
57
55
|
require_paths:
|
|
58
56
|
- lib
|
|
@@ -60,15 +58,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
60
58
|
requirements:
|
|
61
59
|
- - ">="
|
|
62
60
|
- !ruby/object:Gem::Version
|
|
63
|
-
version: '3.
|
|
61
|
+
version: '3.2'
|
|
64
62
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
65
63
|
requirements:
|
|
66
64
|
- - ">="
|
|
67
65
|
- !ruby/object:Gem::Version
|
|
68
66
|
version: '0'
|
|
69
67
|
requirements: []
|
|
70
|
-
rubygems_version: 3.
|
|
71
|
-
signing_key:
|
|
68
|
+
rubygems_version: 3.6.9
|
|
72
69
|
specification_version: 4
|
|
73
70
|
summary: Extract readable article content from HTML pages
|
|
74
71
|
test_files: []
|