readability-rb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +189 -0
- data/README.md +107 -0
- data/lib/readability/cleaner.rb +742 -0
- data/lib/readability/document.rb +555 -0
- data/lib/readability/metadata.rb +299 -0
- data/lib/readability/readerable.rb +61 -0
- data/lib/readability/regexps.rb +91 -0
- data/lib/readability/result.rb +17 -0
- data/lib/readability/scoring.rb +99 -0
- data/lib/readability/utils.rb +221 -0
- data/lib/readability/version.rb +5 -0
- data/lib/readability.rb +27 -0
- metadata +74 -0
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Readability
|
|
4
|
+
module Utils
|
|
5
|
+
private
|
|
6
|
+
|
|
7
|
+
# Port of _removeNodes (JS line 304)
# Walks the node list from last to first and detaches nodes from their parent.
# When a block is given, only nodes for which the block returns a truthy value
# are detached; without a block every node is detached. Nodes that have no
# parent are skipped and are never yielded to the block.
def remove_nodes(node_list)
  detach_all = !block_given?
  node_list.to_a.reverse_each do |candidate|
    attached = candidate.parent
    candidate.unlink if attached && (detach_all || yield(candidate))
  end
end
|
|
21
|
+
|
|
22
|
+
# Port of _replaceNodeTags (JS line 327)
# Hands every node in +node_list+ to set_node_tag together with +new_tag+.
def replace_node_tags(node_list, new_tag)
  node_list.each { |target| set_node_tag(target, new_tag) }
end
|
|
29
|
+
|
|
30
|
+
# Port of _getAllNodesWithTag (JS line 397)
# Joins +tag_names+ with commas into a single selector and returns whatever
# node.css yields for it.
def get_all_nodes_with_tag(node, tag_names)
  selector = tag_names.join(",")
  node.css(selector)
end
|
|
34
|
+
|
|
35
|
+
# JS-compatible trim: removes leading and trailing whitespace using the same
# character set as JavaScript's String.prototype.trim().
#
# Ruby's String#strip and the regexp class \s only cover ASCII whitespace
# ([ \t\r\n\f\v]). JS additionally strips NBSP (U+00A0), the BOM (U+FEFF),
# every Unicode space separator (U+1680, U+2000-U+200A, U+202F, U+205F,
# U+3000) and the line/paragraph separators (U+2028, U+2029), so those are
# listed explicitly here. Whitespace inside the string is left untouched.
#
# @param str [String] text to trim
# @return [String] a new string without surrounding whitespace
def js_trim(str)
  # /x only ignores the layout whitespace between the two alternatives; the
  # character classes themselves contain no literal spaces.
  str.gsub(/
    \A[\s\u00A0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000\uFEFF]+ |
    [\s\u00A0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000\uFEFF]+\z
  /x, "")
end
|
|
41
|
+
|
|
42
|
+
# Port of _getInnerText (JS line 2084)
# Returns the element's text with surrounding whitespace trimmed via js_trim.
# Unless +normalize_spaces+ is false, every NORMALIZE match in the trimmed
# text is replaced by a single space.
def get_inner_text(element, normalize_spaces = true)
  trimmed = js_trim(element.text)
  return trimmed unless normalize_spaces

  trimmed.gsub(NORMALIZE, " ")
end
|
|
48
|
+
|
|
49
|
+
# Port of _isWhitespace (JS line 2068)
# A node counts as whitespace when it is a text node whose trimmed text is
# empty, or when it is a <br> element.
def is_whitespace?(node)
  return true if node.text? && js_trim(node.text).empty?

  node.element? && node.name == "br"
end
|
|
54
|
+
|
|
55
|
+
# Port of _isPhrasingContent (JS line 2057)
# True for text nodes, for nodes whose name is listed in PHRASING_ELEMS, and
# for <a>, <del> and <ins> nodes whose children are all phrasing content
# themselves (checked recursively).
def is_phrasing_content?(node)
  return true if node.text?

  tag = node.name
  return true if PHRASING_ELEMS.include?(tag)
  return false unless %w[a del ins].include?(tag)

  node.children.all? { |kid| is_phrasing_content?(kid) }
end
|
|
62
|
+
|
|
63
|
+
# Port of _hasSingleTagInsideElement (JS line 2013)
# True when +element+ has exactly one element child, that child is named
# +tag+, and none of the element's text-node children match HAS_CONTENT.
def has_single_tag_inside_element?(element, tag)
  kids = element.element_children
  # Exactly one element child, and it must carry the requested tag.
  return false unless kids.length == 1 && kids[0].name == tag

  # Any text node with real content disqualifies the element.
  element.children.none? { |child| child.text? && HAS_CONTENT.match?(child.text) }
end
|
|
72
|
+
|
|
73
|
+
# Port of _isElementWithoutContent (JS line 2028)
# True for an element node whose trimmed text is empty and which either has
# no element children, or has as many element children as there are <br>
# plus <hr> matches returned by node.css.
def is_element_without_content?(node)
  return false unless node.element?
  return false unless js_trim(node.text).empty?

  kids = node.element_children
  return true if kids.empty?

  kids.length == node.css("br").length + node.css("hr").length
end
|
|
81
|
+
|
|
82
|
+
# Port of _hasChildBlockElement (JS line 2044)
# True when any descendant of +element+ (searched depth-first through
# children) has a name listed in DIV_TO_P_ELEMS.
def has_child_block_element?(element)
  element.children.any? do |child|
    next true if DIV_TO_P_ELEMS.include?(child.name)

    has_child_block_element?(child)
  end
end
|
|
88
|
+
|
|
89
|
+
# Port of _textSimilarity (JS line 981)
# Lower-cases both texts, splits them on TOKENIZE and drops empty tokens.
# Returns 0 when either side has no tokens. Otherwise returns 1.0 minus the
# ratio between the space-joined length of the tokens of +text_b+ that do not
# occur in +text_a+ and the space-joined length of all tokens of +text_b+.
def text_similarity(text_a, text_b)
  words_a, words_b = [text_a, text_b].map do |text|
    text.downcase.split(TOKENIZE).reject(&:empty?)
  end
  return 0 if words_a.empty? || words_b.empty?

  # Array difference keeps repeated tokens of text_b that are absent from text_a.
  novel_words = words_b - words_a
  novel_share = novel_words.join(" ").length.to_f / words_b.join(" ").length
  1.0 - novel_share
end
|
|
99
|
+
|
|
100
|
+
# Port of _isProbablyVisible (JS line 2720)
# A node is treated as hidden when its inline style attribute contains
# display:none or visibility:hidden (case-insensitive, optional spaces
# around the colon), when it carries a hidden attribute, or when it has
# aria-hidden="true" without "fallback-image" in its class attribute.
def is_probably_visible?(node)
  inline_style = node["style"] || ""
  hiding_rules = [/display\s*:\s*none/i, /visibility\s*:\s*hidden/i]
  return false if hiding_rules.any? { |rule| inline_style.match?(rule) }
  return false if !node["hidden"].nil?
  return true unless node["aria-hidden"] == "true"

  # aria-hidden fallback images are still considered visible.
  (node["class"] || "").include?("fallback-image")
end
|
|
117
|
+
|
|
118
|
+
# Port of _nextNode (JS line 687)
# Starting at +node+ itself, steps through next_sibling while the current
# node is a non-element whose text matches WHITESPACE. Returns the first node
# that stops the walk, or nil when the siblings run out.
def next_node(node)
  cursor = node
  cursor = cursor.next_sibling while cursor && !cursor.element? && WHITESPACE.match?(cursor.text)
  cursor
end
|
|
127
|
+
|
|
128
|
+
# Port of _getNextNode (JS line 959)
# Depth-first traversal step. Returns, in order of preference: the first
# element child (unless +ignore_self_and_kids+ is set), the next element
# sibling, or the next element sibling of the closest ancestor that has one.
# Returns nil when nothing follows.
def get_next_node(node, ignore_self_and_kids = false)
  unless ignore_self_and_kids
    first_kid = node.element_children.first
    return first_kid if first_kid
  end

  sibling = node.next_element
  return sibling if sibling

  # Climb until an ancestor with a following element is found, or until the
  # current node has no (reachable) parent.
  ancestor = node
  while ancestor.respond_to?(:parent) && ancestor.parent
    ancestor = ancestor.parent
    break if ancestor.next_element
  end
  ancestor&.next_element
end
|
|
148
|
+
|
|
149
|
+
# Port of _removeAndGetNext (JS line 942)
# Looks up the traversal successor of +node+ (ignoring the node itself and
# its children) first, then unlinks the node and returns that successor.
def remove_and_get_next(node)
  get_next_node(node, true).tap { node.unlink }
end
|
|
155
|
+
|
|
156
|
+
# Port of _hasAncestorTag (JS line 2243)
# Climbs the element ancestors of +node+ looking for one named +tag_name+
# (compared lower-cased). When a block is given, a matching ancestor only
# counts if the block returns a truthy value for it.
# A positive +max_depth+ stops the search once the number of ancestors
# already examined exceeds it; a max_depth of 0 means no limit.
def has_ancestor_tag?(node, tag_name, max_depth = 3, &filter_fn)
  wanted = tag_name.downcase
  climbed = 0
  ancestor = node.parent
  while ancestor && ancestor.element?
    return false if max_depth > 0 && climbed > max_depth
    return true if ancestor.name == wanted && (filter_fn.nil? || filter_fn.call(ancestor))

    ancestor = ancestor.parent
    climbed += 1
  end
  false
end
|
|
173
|
+
|
|
174
|
+
# Port of _getNodeAncestors (JS line 1019)
# Collects the element ancestors of +node+, nearest first. A positive
# +max_depth+ caps how many ancestors are collected; 0 collects all of them.
def get_node_ancestors(node, max_depth = 0)
  lineage = []
  cursor = node
  while (up = cursor.parent) && up.element?
    lineage << up
    break if max_depth > 0 && lineage.length == max_depth

    cursor = up
  end
  lineage
end
|
|
187
|
+
|
|
188
|
+
# Port of _isSingleImage (JS line 1897)
# Follows a chain of single element children starting at +node+. Returns true
# on reaching an <img>; returns false as soon as a node on the way does not
# have exactly one element child or has non-whitespace text.
def is_single_image?(node)
  candidate = node
  while candidate
    return true if candidate.name == "img"

    kids = candidate.element_children
    return false unless kids.length == 1 && js_trim(candidate.text).empty?

    candidate = kids[0]
  end
  false
end
|
|
199
|
+
|
|
200
|
+
# Port of _isValidByline (JS line 1005)
# A node qualifies as a byline when its trimmed text is non-empty and shorter
# than 100 characters, and additionally one of these holds: rel is exactly
# "author", itemprop contains "author", or +match_string+ matches BYLINE.
def is_valid_byline?(node, match_string)
  rel_attr = node["rel"]
  itemprop_attr = node["itemprop"]
  candidate = js_trim(node.text)
  return false unless !candidate.empty? && candidate.length < 100

  return true if rel_attr == "author"
  return true if itemprop_attr && itemprop_attr.include?("author")

  BYLINE.match?(match_string)
end
|
|
212
|
+
|
|
213
|
+
# Port of _isUrl (JS line 442)
# True when +str+ parses with URI.parse into an HTTP or HTTPS URI. Strings
# that URI.parse rejects, and URIs of any other kind, yield false.
def is_url?(str)
  case URI.parse(str)
  when URI::HTTP, URI::HTTPS then true
  else false
  end
rescue URI::InvalidURIError, URI::InvalidComponentError
  false
end
|
|
220
|
+
end
|
|
221
|
+
end
|
data/lib/readability.rb
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
require "json"
|
|
5
|
+
require "uri"
|
|
6
|
+
|
|
7
|
+
require_relative "readability/version"
|
|
8
|
+
require_relative "readability/result"
|
|
9
|
+
require_relative "readability/regexps"
|
|
10
|
+
require_relative "readability/utils"
|
|
11
|
+
require_relative "readability/scoring"
|
|
12
|
+
require_relative "readability/metadata"
|
|
13
|
+
require_relative "readability/cleaner"
|
|
14
|
+
require_relative "readability/document"
|
|
15
|
+
require_relative "readability/readerable"
|
|
16
|
+
|
|
17
|
+
# Top-level entry points of the gem.
module Readability
  class << self
    # Parses +html+ with Nokogiri::HTML5, wraps the result in a Document
    # (passing +url+ and any extra options through) and returns Document#parse.
    def parse(html, url: nil, **options)
      Document.new(Nokogiri::HTML5(html), url: url, **options).parse
    end

    # Parses +html+ with Nokogiri::HTML5 and returns the result of
    # Readerable.probably_readerable? for it, forwarding +options+.
    def readerable?(html, **options)
      Readerable.probably_readerable?(Nokogiri::HTML5(html), **options)
    end
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: readability-rb
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Andy Croll
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2026-04-13 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: nokogiri
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '1.14'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '1.14'
|
|
27
|
+
description: Ruby port of Mozilla Readability.js - extracts the main content from
|
|
28
|
+
web pages, like Firefox Reader View
|
|
29
|
+
email:
|
|
30
|
+
- andy@goodscary.com
|
|
31
|
+
executables: []
|
|
32
|
+
extensions: []
|
|
33
|
+
extra_rdoc_files: []
|
|
34
|
+
files:
|
|
35
|
+
- LICENSE
|
|
36
|
+
- README.md
|
|
37
|
+
- lib/readability.rb
|
|
38
|
+
- lib/readability/cleaner.rb
|
|
39
|
+
- lib/readability/document.rb
|
|
40
|
+
- lib/readability/metadata.rb
|
|
41
|
+
- lib/readability/readerable.rb
|
|
42
|
+
- lib/readability/regexps.rb
|
|
43
|
+
- lib/readability/result.rb
|
|
44
|
+
- lib/readability/scoring.rb
|
|
45
|
+
- lib/readability/utils.rb
|
|
46
|
+
- lib/readability/version.rb
|
|
47
|
+
homepage: https://github.com/andycroll/readability-rb
|
|
48
|
+
licenses:
|
|
49
|
+
- Apache-2.0
|
|
50
|
+
metadata:
|
|
51
|
+
homepage_uri: https://github.com/andycroll/readability-rb
|
|
52
|
+
source_code_uri: https://github.com/andycroll/readability-rb
|
|
53
|
+
changelog_uri: https://github.com/andycroll/readability-rb/commits/main
|
|
54
|
+
bug_tracker_uri: https://github.com/andycroll/readability-rb/issues
|
|
55
|
+
post_install_message:
|
|
56
|
+
rdoc_options: []
|
|
57
|
+
require_paths:
|
|
58
|
+
- lib
|
|
59
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
60
|
+
requirements:
|
|
61
|
+
- - ">="
|
|
62
|
+
- !ruby/object:Gem::Version
|
|
63
|
+
version: '3.1'
|
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - ">="
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: '0'
|
|
69
|
+
requirements: []
|
|
70
|
+
rubygems_version: 3.5.3
|
|
71
|
+
signing_key:
|
|
72
|
+
specification_version: 4
|
|
73
|
+
summary: Extract readable article content from HTML pages
|
|
74
|
+
test_files: []
|