readability-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,221 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Readability
4
+ module Utils
5
+ private
6
+
7
# Port of _removeNodes (JS line 304)
# Walks +node_list+ from its last entry to its first and unlinks nodes that
# still have a parent. When a block is supplied, a node is unlinked only if
# the block returns a truthy value for it; without a block every attached
# node is unlinked.
def remove_nodes(node_list)
  filtering = block_given?

  node_list.to_a.reverse_each do |candidate|
    # Already-detached nodes are skipped and never reach the block.
    next unless candidate.parent
    next if filtering && !yield(candidate)

    candidate.unlink
  end
end
21
+
22
# Port of _replaceNodeTags (JS line 327)
# Hands every node of +node_list+, in order, to set_node_tag together with
# +new_tag+. Returns whatever +node_list.each+ returns.
def replace_node_tags(node_list, new_tag)
  node_list.each { |node| set_node_tag(node, new_tag) }
end
29
+
30
# Port of _getAllNodesWithTag (JS line 397)
# Joins +tag_names+ with commas into one selector string and returns the
# result of +node.css+ for it.
def get_all_nodes_with_tag(node, tag_names)
  selector = tag_names.join(",")
  node.css(selector)
end
34
+
35
# JS-compatible trim.
# Ruby's String#strip and the regexp class \s only cover ASCII whitespace,
# while JS's String.prototype.trim() removes every ECMAScript WhiteSpace and
# LineTerminator code point. Besides ASCII whitespace this strips:
#   * \p{Zs}  - Unicode space separators (NBSP U+00A0, U+1680,
#               U+2000..U+200A, U+202F, U+205F, U+3000)
#   * U+2028 / U+2029 - line and paragraph separators
#   * U+FEFF  - zero width no-break space / BOM
# Only leading and trailing runs are removed; interior whitespace is kept.
# Returns a new string; +str+ itself is not modified.
def js_trim(str)
  str.gsub(/\A[\s\p{Zs}\u2028\u2029\uFEFF]+|[\s\p{Zs}\u2028\u2029\uFEFF]+\z/, "")
end
41
+
42
# Port of _getInnerText (JS line 2084)
# Returns the element's text after js_trim. Unless +normalize_spaces+ is
# falsy, every match of NORMALIZE in the trimmed text is replaced by a
# single space.
def get_inner_text(element, normalize_spaces = true)
  trimmed = js_trim(element.text)
  return trimmed unless normalize_spaces

  trimmed.gsub(NORMALIZE, " ")
end
48
+
49
# Port of _isWhitespace (JS line 2068)
# True for a text node whose text is empty after js_trim, and for an element
# node named "br".
def is_whitespace?(node)
  return true if node.text? && js_trim(node.text).empty?

  node.element? && node.name == "br"
end
54
+
55
# Port of _isPhrasingContent (JS line 2057)
# A node counts as phrasing content when it is a text node, when its name is
# listed in PHRASING_ELEMS, or when it is an a/del/ins element whose children
# are all phrasing content themselves (checked recursively).
def is_phrasing_content?(node)
  return true if node.text?
  return true if PHRASING_ELEMS.include?(node.name)
  return false unless %w[a del ins].include?(node.name)

  node.children.all? { |child| is_phrasing_content?(child) }
end
62
+
63
# Port of _hasSingleTagInsideElement (JS line 2013)
# True when +element+ has exactly one element child, that child is named
# +tag+, and none of the element's direct text-node children match
# HAS_CONTENT.
def has_single_tag_inside_element?(element, tag)
  kids = element.element_children
  return false unless kids.length == 1 && kids[0].name == tag

  # Any text node with real content next to the single child disqualifies it.
  element.children.none? { |child| child.text? && HAS_CONTENT.match?(child.text) }
end
72
+
73
# Port of _isElementWithoutContent (JS line 2028)
# True for an element node whose text is empty after js_trim and which either
# has no element children or has as many element children as the combined
# number of "br" and "hr" matches returned by +node.css+.
def is_element_without_content?(node)
  return false unless node.element?
  return false unless js_trim(node.text).empty?

  child_count = node.element_children.length
  child_count.zero? || child_count == node.css("br").length + node.css("hr").length
end
81
+
82
# Port of _hasChildBlockElement (JS line 2044)
# True when any descendant of +element+ (searched depth-first through
# +children+) has a name listed in DIV_TO_P_ELEMS.
def has_child_block_element?(element)
  element.children.any? do |child|
    next true if DIV_TO_P_ELEMS.include?(child.name)

    has_child_block_element?(child)
  end
end
88
+
89
# Port of _textSimilarity (JS line 981)
# Lower-cases both texts, splits them on TOKENIZE and drops empty tokens.
# Returns 0 when either side has no tokens. Otherwise returns
# 1.0 - (length of text_b's tokens absent from text_a, space-joined) /
#       (length of all of text_b's tokens, space-joined).
def text_similarity(text_a, text_b)
  tokenize = ->(text) { text.downcase.split(TOKENIZE).reject(&:empty?) }
  words_a = tokenize.call(text_a)
  words_b = tokenize.call(text_b)
  return 0 if words_a.empty? || words_b.empty?

  # Array#- keeps order and duplicates of the tokens only text_b contains.
  novel_words = words_b - words_a
  novel_share = novel_words.join(" ").length.to_f / words_b.join(" ").length
  1.0 - novel_share
end
99
+
100
# Port of _isProbablyVisible (JS line 2720)
# Returns false when the node's inline style contains display:none or
# visibility:hidden (case-insensitive, optional spaces around the colon),
# when it carries a hidden attribute, or when aria-hidden is "true" and the
# class attribute does not contain "fallback-image". Returns true otherwise.
def is_probably_visible?(node)
  inline_style = node["style"] || ""
  hidden_by_style = inline_style.match?(/display\s*:\s*none/i) ||
                    inline_style.match?(/visibility\s*:\s*hidden/i)
  return false if hidden_by_style || !node["hidden"].nil?
  return true unless node["aria-hidden"] == "true"

  # aria-hidden nodes are kept only when they are fallback images.
  (node["class"] || "").include?("fallback-image")
end
117
+
118
# Port of _nextNode (JS line 687)
# Starting at +node+ itself, follows +next_sibling+ past every non-element
# node whose text matches WHITESPACE. Returns the first node that is an
# element or does not match, or nil when the siblings run out.
def next_node(node)
  cursor = node
  loop do
    break unless cursor
    break if cursor.element?
    break unless WHITESPACE.match?(cursor.text)

    cursor = cursor.next_sibling
  end
  cursor
end
127
+
128
# Port of _getNextNode (JS line 959)
# Depth-first step to the next element: the first element child (unless
# +ignore_self_and_kids+ is set), else the next element sibling, else the
# next element sibling of the closest ancestor that has one. Returns nil
# when no such element exists.
def get_next_node(node, ignore_self_and_kids = false)
  unless ignore_self_and_kids
    first_child = node.element_children.first
    return first_child if first_child
  end

  sibling = node.next_element
  return sibling if sibling

  # Climb until an ancestor with a following element is found or the
  # parent chain ends.
  ancestor = node
  while ancestor.respond_to?(:parent) && ancestor.parent
    ancestor = ancestor.parent
    break if ancestor.next_element
  end
  ancestor&.next_element
end
148
+
149
# Port of _removeAndGetNext (JS line 942)
# Looks up the node that follows +node+ (ignoring +node+ and its children),
# unlinks +node+, and returns the node found beforehand.
def remove_and_get_next(node)
  # The successor is resolved before unlinking, while +node+ is still attached.
  get_next_node(node, true).tap { node.unlink }
end
155
+
156
# Port of _hasAncestorTag (JS line 2243)
# Climbs the element ancestors of +node+ looking for one named +tag_name+
# (compared after downcasing +tag_name+). An optional block acts as an extra
# filter: a name match only counts when the block returns truthy for that
# ancestor. A positive +max_depth+ stops the search once more than that many
# levels have been climbed; a +max_depth+ of 0 (or below) means no limit.
def has_ancestor_tag?(node, tag_name, max_depth = 3, &filter_fn)
  wanted = tag_name.downcase
  levels_climbed = 0
  ancestor = node.parent

  while ancestor && ancestor.element?
    return false if max_depth > 0 && levels_climbed > max_depth
    return true if ancestor.name == wanted && (filter_fn.nil? || filter_fn.call(ancestor))

    ancestor = ancestor.parent
    levels_climbed += 1
  end

  false
end
173
+
174
# Port of _getNodeAncestors (JS line 1019)
# Collects the element ancestors of +node+, nearest first, stopping at the
# first parent that is missing or not an element. A positive +max_depth+
# caps the number of ancestors returned; 0 (the default) collects them all.
def get_node_ancestors(node, max_depth = 0)
  chain = []
  ancestor = node.parent

  while ancestor && ancestor.element?
    chain << ancestor
    break if max_depth > 0 && chain.length == max_depth

    ancestor = ancestor.parent
  end

  chain
end
187
+
188
# Port of _isSingleImage (JS line 1897)
# Descends from +node+ through chains of single element children. Returns
# true on reaching a node named "img"; returns false as soon as a node has a
# number of element children other than one, or has text left after js_trim.
def is_single_image?(node)
  cursor = node

  while cursor
    return true if cursor.name == "img"

    kids = cursor.element_children
    return false unless kids.length == 1 && js_trim(cursor.text).empty?

    cursor = kids[0]
  end

  false
end
199
+
200
# Port of _isValidByline (JS line 1005)
# The node's js_trim-med text must be 1 to 99 characters long. Given that,
# the node qualifies when its rel attribute is "author", when its itemprop
# attribute contains "author", or when +match_string+ matches BYLINE.
def is_valid_byline?(node, match_string)
  rel = node["rel"]
  itemprop = node["itemprop"]
  text_length = js_trim(node.text).length
  return false unless (1..99).cover?(text_length)

  return true if rel == "author"
  return true if itemprop&.include?("author")

  BYLINE.match?(match_string)
end
212
+
213
# Port of _isUrl (JS line 442)
# True when +str+ parses as an http or https URI. Strings that URI.parse
# rejects with InvalidURIError or InvalidComponentError yield false.
def is_url?(str)
  # URI::HTTPS inherits from URI::HTTP, so this single check covers both.
  URI.parse(str).is_a?(URI::HTTP)
rescue URI::InvalidURIError, URI::InvalidComponentError
  false
end
220
+ end
221
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Namespace of the readability-rb gem.
module Readability
  # Version string of this gem release.
  VERSION = "0.1.0"
end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require "json"
5
+ require "uri"
6
+
7
+ require_relative "readability/version"
8
+ require_relative "readability/result"
9
+ require_relative "readability/regexps"
10
+ require_relative "readability/utils"
11
+ require_relative "readability/scoring"
12
+ require_relative "readability/metadata"
13
+ require_relative "readability/cleaner"
14
+ require_relative "readability/document"
15
+ require_relative "readability/readerable"
16
+
17
# Module-level entry points of the gem.
module Readability
  class << self
    # Parses +html+ with Nokogiri::HTML5, wraps the resulting document in a
    # Document (passing +url+ and any further keyword options through
    # unchanged) and returns the value of Document#parse.
    def parse(html, url: nil, **options)
      parsed = Nokogiri::HTML5(html)
      Document.new(parsed, url: url, **options).parse
    end

    # Parses +html+ with Nokogiri::HTML5 and returns the result of
    # Readerable.probably_readerable? for that document, forwarding any
    # keyword options.
    def readerable?(html, **options)
      Readerable.probably_readerable?(Nokogiri::HTML5(html), **options)
    end
  end
end
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: readability-rb
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Andy Croll
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2026-04-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.14'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.14'
27
+ description: Ruby port of Mozilla Readability.js - extracts the main content from
28
+ web pages, like Firefox Reader View
29
+ email:
30
+ - andy@goodscary.com
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - LICENSE
36
+ - README.md
37
+ - lib/readability.rb
38
+ - lib/readability/cleaner.rb
39
+ - lib/readability/document.rb
40
+ - lib/readability/metadata.rb
41
+ - lib/readability/readerable.rb
42
+ - lib/readability/regexps.rb
43
+ - lib/readability/result.rb
44
+ - lib/readability/scoring.rb
45
+ - lib/readability/utils.rb
46
+ - lib/readability/version.rb
47
+ homepage: https://github.com/andycroll/readability-rb
48
+ licenses:
49
+ - Apache-2.0
50
+ metadata:
51
+ homepage_uri: https://github.com/andycroll/readability-rb
52
+ source_code_uri: https://github.com/andycroll/readability-rb
53
+ changelog_uri: https://github.com/andycroll/readability-rb/commits/main
54
+ bug_tracker_uri: https://github.com/andycroll/readability-rb/issues
55
+ post_install_message:
56
+ rdoc_options: []
57
+ require_paths:
58
+ - lib
59
+ required_ruby_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: '3.1'
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ requirements: []
70
+ rubygems_version: 3.5.3
71
+ signing_key:
72
+ specification_version: 4
73
+ summary: Extract readable article content from HTML pages
74
+ test_files: []