readability-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,299 @@
1
+ # frozen_string_literal: true
2
+
3
module Readability
  # Metadata extraction: JSON-LD <script> blocks, <meta> tags and the
  # document <title>. Ported from Mozilla Readability.js; the JS line
  # references from the original port are preserved below.
  module Metadata
    private

    # Port of _unescapeHtmlEntities (JS line 1631-1651)
    # Replaces named HTML entities and numeric character references.
    # Returns the argument unchanged when it is nil (callers pass through
    # missing metadata values).
    def unescape_html_entities(str)
      return str unless str

      str
        .gsub(/&(quot|amp|apos|lt|gt);/) { HTML_ESCAPE_MAP[$1] }
        .gsub(/&#(?:x([0-9a-f]+)|([0-9]+));/i) do
          hex, num_str = $1, $2
          num = (hex || num_str).to_i(hex ? 16 : 10)

          # Replace invalid code points (NUL, beyond U+10FFFF, surrogates)
          # with U+FFFD REPLACEMENT CHARACTER.
          if num == 0 || num > 0x10FFFF || (num >= 0xD800 && num <= 0xDFFF)
            num = 0xFFFD
          end

          [num].pack("U")
        end
    end

    # Port of _getJSONLD (JS line 1658-1773)
    # Extracts metadata from JSON-LD script tags with a schema.org context.
    # Only the first script that yields metadata is used. Returns a Hash
    # (possibly empty) with string keys: "title", "byline", "excerpt",
    # "site_name", "date_published".
    def get_json_ld(doc)
      scripts = get_all_nodes_with_tag(doc, ["script"])
      metadata = nil

      scripts.each do |json_ld_element|
        next if metadata
        next unless json_ld_element["type"] == "application/ld+json"

        begin
          # Strip CDATA markers if present
          content = json_ld_element.text.gsub(/\A\s*<!\[CDATA\[|\]\]>\s*\z/, "")
          parsed = JSON.parse(content)

          # A top-level array: pick the first schema.org article-like entry.
          if parsed.is_a?(Array)
            parsed = parsed.find do |it|
              it["@type"] && it["@type"].match?(JSON_LD_ARTICLE_TYPES)
            end
            next unless parsed
          end

          schema_dot_org_regex = /\Ahttps?:\/\/schema\.org\/?\z/

          # @context may be the bare schema.org URL or a hash with @vocab.
          matches = (parsed["@context"].is_a?(String) &&
                     parsed["@context"].match?(schema_dot_org_regex)) ||
                    (parsed["@context"].is_a?(Hash) &&
                     parsed["@context"]["@vocab"].is_a?(String) &&
                     parsed["@context"]["@vocab"].match?(schema_dot_org_regex))

          next unless matches

          # No top-level @type: look inside @graph for an article entry.
          if !parsed["@type"] && parsed["@graph"].is_a?(Array)
            parsed = parsed["@graph"].find do |it|
              (it["@type"] || "").match?(JSON_LD_ARTICLE_TYPES)
            end
          end

          next if !parsed || !parsed["@type"] || !parsed["@type"].match?(JSON_LD_ARTICLE_TYPES)

          metadata = {}

          if parsed["name"].is_a?(String) && parsed["headline"].is_a?(String) &&
             parsed["name"] != parsed["headline"]
            # Both name and headline exist and differ — compare similarity to HTML title
            title = get_article_title
            name_matches = text_similarity(parsed["name"], title) > 0.75
            headline_matches = text_similarity(parsed["headline"], title) > 0.75

            if headline_matches && !name_matches
              metadata["title"] = parsed["headline"]
            else
              metadata["title"] = parsed["name"]
            end
          elsif parsed["name"].is_a?(String)
            metadata["title"] = parsed["name"].strip
          elsif parsed["headline"].is_a?(String)
            metadata["title"] = parsed["headline"].strip
          end

          if parsed["author"]
            if parsed["author"].is_a?(Hash) && parsed["author"]["name"].is_a?(String)
              metadata["byline"] = parsed["author"]["name"].strip
            elsif parsed["author"].is_a?(Array) &&
                  parsed["author"][0] &&
                  parsed["author"][0]["name"].is_a?(String)
              metadata["byline"] = parsed["author"]
                .select { |author| author && author["name"].is_a?(String) }
                .map { |author| author["name"].strip }
                .join(", ")
            end
          end

          if parsed["description"].is_a?(String)
            metadata["excerpt"] = parsed["description"].strip
          end

          if parsed["publisher"].is_a?(Hash) && parsed["publisher"]["name"].is_a?(String)
            metadata["site_name"] = parsed["publisher"]["name"].strip
          end

          if parsed["datePublished"].is_a?(String)
            metadata["date_published"] = parsed["datePublished"].strip
          end
        rescue StandardError => e
          # The JS original wraps this whole body in a catch-all try/catch:
          # malformed JSON *and* unexpected shapes (e.g. a non-object
          # top-level value, which raises TypeError on parsed["@context"] or
          # it["@type"]) must skip this script rather than abort extraction.
          # Rescuing only JSON::ParserError missed the latter.
          log(e.message) if respond_to?(:log, true)
        end
      end

      metadata || {}
    end

    # Port of _getArticleTitle (JS line 573-661)
    # Extracts and cleans the article title from the document (@doc).
    # Splits on separator characters ("|", "-", dashes, "/", "\", ">", "»")
    # or a ": " and falls back to the original title (or a lone <h1>) when
    # the cleaned result looks too short.
    def get_article_title
      cur_title = ""
      orig_title = ""

      begin
        cur_title = orig_title = (@doc.at_css("title")&.text&.strip || "")

        # If title came back as something other than a string (shouldn't happen
        # with Nokogiri, but match JS logic)
        if !cur_title.is_a?(String)
          cur_title = orig_title = get_inner_text(@doc.css("title").first)
        end
      rescue
        # ignore exceptions setting the title (mirrors the JS try/catch)
      end

      title_had_hierarchical_separators = false
      word_count = ->(str) { str.split(/\s+/).length }

      # Title separator characters — exact JS source string from line 597.
      # Kept as a single-quoted string: the \u escapes are interpreted by the
      # regex engine when interpolated into the character classes below.
      title_separators = '\|\-\u2013\u2014\\\\\/>»'

      if cur_title.match?(/\s[#{title_separators}]\s/)
        title_had_hierarchical_separators = cur_title.match?(/\s[\\\/>\u00BB]\s/)

        # Find all separator positions and remove everything after the last one
        all_separators = orig_title.to_enum(:scan, /\s[#{title_separators}]\s/i).map { Regexp.last_match }
        cur_title = orig_title[0, all_separators.last.begin(0)]

        # If the resulting title is too short, remove the first part instead
        if word_count.call(cur_title) < 3
          cur_title = orig_title.sub(/\A[^#{title_separators}]*[#{title_separators}]/i, "")
        end
      elsif cur_title.include?(": ")
        # Check if we have a heading containing this exact string
        headings = get_all_nodes_with_tag(@doc, ["h1", "h2"])
        trimmed_title = cur_title.strip
        match = headings.any? { |heading| heading.text.strip == trimmed_title }

        # If we don't, extract the title out of the original title string
        unless match
          cur_title = orig_title[(orig_title.rindex(":") + 1)..]

          # If the title is now too short, try the first colon instead
          if word_count.call(cur_title) < 3
            cur_title = orig_title[(orig_title.index(":") + 1)..]
          # But if we have too many words before the colon there's something weird
          elsif word_count.call(orig_title[0, orig_title.index(":")]) > 5
            cur_title = orig_title
          end
        end
      elsif cur_title.length > 150 || cur_title.length < 15
        h_ones = @doc.css("h1")

        if h_ones.length == 1
          cur_title = get_inner_text(h_ones[0])
        end
      end

      cur_title = cur_title.strip.gsub(NORMALIZE, " ")

      # If we now have 4 words or fewer as our title, and either no
      # 'hierarchical' separators (\, /, > or ») were found in the original
      # title or we decreased the number of words by more than 1 word, use
      # the original title.
      cur_title_word_count = word_count.call(cur_title)
      if cur_title_word_count <= 4 &&
         (!title_had_hierarchical_separators ||
          cur_title_word_count !=
            word_count.call(orig_title.gsub(/\s[#{title_separators}]\s/, "")) - 1)
        cur_title = orig_title
      end

      cur_title
    end

    # Port of _getArticleMetadata (JS line 1783-1889)
    # Extracts metadata from <meta> tags and merges with JSON-LD data.
    # JSON-LD values (the +json_ld+ Hash from #get_json_ld) take precedence
    # over <meta>-derived values. All values are entity-unescaped at the end.
    def get_article_metadata(json_ld)
      metadata = {}
      values = {}

      meta_elements = @doc.css("meta")

      # property is a space-separated list of values
      property_pattern = /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/i

      # name is a single value
      name_pattern = /\A\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*\z/i

      meta_elements.each do |element|
        element_name = element["name"]
        element_property = element["property"]
        content = element["content"]
        next unless content

        matches = nil
        name = nil

        if element_property
          matches = element_property.match(property_pattern)
          if matches
            # Convert to lowercase, and remove any whitespace
            name = matches[0].downcase.gsub(/\s/, "")
            values[name] = content.strip
          end
        end

        # Only consult name= when property= did not already match (JS parity).
        if !matches && element_name && name_pattern.match?(element_name)
          name = element_name
          if content
            # Convert to lowercase, remove whitespace, convert dots to colons
            name = name.downcase.gsub(/\s/, "").gsub(".", ":")
            values[name] = content.strip
          end
        end
      end

      # get title
      metadata["title"] =
        json_ld["title"] ||
        values["dc:title"] ||
        values["dcterm:title"] ||
        values["og:title"] ||
        values["weibo:article:title"] ||
        values["weibo:webpage:title"] ||
        values["title"] ||
        values["twitter:title"] ||
        values["parsely-title"]

      metadata["title"] ||= get_article_title

      # article:author is only trusted when it is not a URL (JS parity).
      article_author =
        if values["article:author"].is_a?(String) && !is_url?(values["article:author"])
          values["article:author"]
        end

      # get author
      metadata["byline"] =
        json_ld["byline"] ||
        values["dc:creator"] ||
        values["dcterm:creator"] ||
        values["author"] ||
        values["parsely-author"] ||
        article_author

      # get description
      metadata["excerpt"] =
        json_ld["excerpt"] ||
        values["dc:description"] ||
        values["dcterm:description"] ||
        values["og:description"] ||
        values["weibo:article:description"] ||
        values["weibo:webpage:description"] ||
        values["description"] ||
        values["twitter:description"]

      # get site name
      metadata["siteName"] = json_ld["site_name"] || values["og:site_name"]

      # get article published time
      metadata["publishedTime"] =
        json_ld["date_published"] ||
        values["article:published_time"] ||
        values["parsely-pub-date"] ||
        nil

      # Unescape HTML entities in all metadata values
      metadata["title"] = unescape_html_entities(metadata["title"])
      metadata["byline"] = unescape_html_entities(metadata["byline"])
      metadata["excerpt"] = unescape_html_entities(metadata["excerpt"])
      metadata["siteName"] = unescape_html_entities(metadata["siteName"])
      metadata["publishedTime"] = unescape_html_entities(metadata["publishedTime"])

      metadata
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "set"
4
+
5
module Readability
  # Heuristic pre-check: is this document worth running the full parser on?
  module Readerable
    module_function

    # Scores <p>, <pre>, <article> nodes (plus <div>s that directly contain
    # a <br>) by their text length and answers whether the cumulative score
    # clears +min_score+.
    #
    # For backward compatibility with the JS API, the second positional
    # argument may be a Proc used as the visibility checker instead of an
    # options hash.
    def probably_readerable?(doc, options_or_checker = {}, **kwargs)
      # Legacy JS-style call: a bare proc in second position is the checker.
      if options_or_checker.is_a?(Proc)
        kwargs[:visibility_checker] = options_or_checker
        options_or_checker = {}
      end
      opts = options_or_checker.is_a?(Hash) ? options_or_checker.merge(kwargs) : kwargs

      min_score          = opts.fetch(:min_score, 20)
      min_content_length = opts.fetch(:min_content_length, 140)
      checker            = opts.fetch(:visibility_checker, nil) || method(:node_visible?)

      candidates = doc.css("p, pre, article")

      # Some articles use a div > br structure instead of paragraphs, so
      # fold those parent divs into the candidate pool (deduplicated).
      brs = doc.css("div > br")
      unless brs.empty?
        pool = Set.new(candidates.to_a)
        brs.each { |br| pool.add(br.parent) }
        candidates = pool.to_a
      end

      total = 0.0
      candidates.any? do |node|
        next false unless checker.call(node)

        signature = "#{node['class']} #{node['id']}"
        next false if UNLIKELY_CANDIDATES.match?(signature) && !OK_MAYBE_CANDIDATE.match?(signature)
        next false if node.matches?("li p")

        length = node.text.strip.length
        next false if length < min_content_length

        total += Math.sqrt(length - min_content_length)
        total > min_score
      end
    end

    # Default visibility check: inline display:none, the hidden attribute,
    # and aria-hidden="true" (unless the node carries the "fallback-image"
    # class). NOTE: visibility:hidden is deliberately not inspected here,
    # per the comment on the original port.
    def node_visible?(node)
      return false if node['style'] =~ /display:\s*none/i
      return false unless node['hidden'].nil?

      if node['aria-hidden'] == "true" && !(node['class'] || "").include?("fallback-image")
        return false
      end

      true
    end
  end
end
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "set"
4
+
5
module Readability
  # All regex patterns from Readability.js REGEXPS object.
  # NOTE: All tag name constants are LOWERCASE (Nokogiri convention).
  #
  # Porting note: JS regexes without the /m flag use ^/$ as whole-string
  # anchors; Ruby's ^/$ are *line* anchors and also match around embedded
  # newlines, so the correct Ruby equivalents are \A/\z (as already used by
  # WHITESPACE, HAS_CONTENT, HASH_URL, AD_WORDS, etc. below).

  # Flags toggled while grading candidates
  FLAG_STRIP_UNLIKELYS = 0x1
  FLAG_WEIGHT_CLASSES = 0x2
  FLAG_CLEAN_CONDITIONALLY = 0x4

  # Defaults (0 = no element limit)
  DEFAULT_MAX_ELEMS_TO_PARSE = 0
  DEFAULT_N_TOP_CANDIDATES = 5
  DEFAULT_CHAR_THRESHOLD = 500

  DEFAULT_TAGS_TO_SCORE = %w[section h2 h3 h4 h5 h6 p td pre].freeze

  # Regexps — ported from the JS REGEXPS object
  UNLIKELY_CANDIDATES = /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i

  OK_MAYBE_CANDIDATE = /and|article|body|column|content|main|mathjax|shadow/i

  POSITIVE = /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i

  # " hid$" / "^hid " from the JS source are string anchors, hence \z / \A
  # here (line anchors would also match before/after embedded newlines).
  NEGATIVE = /-ad-|hidden|\Ahid\z| hid\z| hid |\Ahid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget/i

  EXTRANEOUS = /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i

  BYLINE = /byline|author|dateline|writtenby|p-author/i

  REPLACE_FONTS = /<(\/?)font[^>]*>/i

  NORMALIZE = /\s{2,}/

  # "live\.bilibili" dot escaped — an unescaped "." would match any character.
  VIDEOS = /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq|bilibili|live\.bilibili)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i

  SHARE_ELEMENTS = /(\b|_)(share|sharedaddy)(\b|_)/i

  # \z replaces the JS string-end "$".
  NEXT_LINK = /(next|weiter|continue|>([^\|]|\z)|»([^\|]|\z))/i

  PREV_LINK = /(prev|earl|old|new|<|«)/i

  TOKENIZE = /\W+/

  WHITESPACE = /\A\s*\z/

  HAS_CONTENT = /\S\z/

  HASH_URL = /\A#.+/

  # One "url [density-descriptor]," entry of a srcset attribute;
  # \z replaces the JS string-end "$".
  SRCSET_URL = /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|\z))/

  B64_DATA_URL = /\Adata:\s*([^\s;,]+)\s*;\s*base64\s*,/i

  # Commas as used in Latin, Sindhi, Chinese and various other scripts.
  # see: https://en.wikipedia.org/wiki/Comma#Comma_variants
  COMMAS = /\u{002C}|\u{060C}|\u{FE50}|\u{FE10}|\u{FE11}|\u{2E41}|\u{2E34}|\u{2E32}|\u{FF0C}/

  # See: https://schema.org/Article
  JSON_LD_ARTICLE_TYPES = /\A(Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference)\z/

  AD_WORDS = /\A(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)\z/iu

  LOADING_WORDS = /\A(loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?\z/iu

  # Element/role lists — ALL LOWERCASE
  UNLIKELY_ROLES = %w[menu menubar complementary navigation alert alertdialog dialog].freeze

  DIV_TO_P_ELEMS = Set.new(%w[blockquote dl div img ol p pre table ul]).freeze

  ALTER_TO_DIV_EXCEPTIONS = %w[div article section p ol ul].freeze

  PRESENTATIONAL_ATTRIBUTES = %w[align background bgcolor border cellpadding cellspacing frame hspace rules style valign vspace].freeze

  DEPRECATED_SIZE_ATTRIBUTE_ELEMS = %w[table th td hr pre].freeze

  PHRASING_ELEMS = %w[abbr audio b bdo br button cite code data datalist dfn em embed i img input kbd label mark math meter noscript object output progress q ruby samp script select small span strong sub sup textarea time var wbr].freeze

  CLASSES_TO_PRESERVE = %w[page].freeze

  # Named entities handled by Metadata#unescape_html_entities.
  HTML_ESCAPE_MAP = {
    "lt" => "<",
    "gt" => ">",
    "amp" => "&",
    "quot" => '"',
    "apos" => "'",
  }.freeze
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module Readability
  # Plain value record for a parsed article. Field names mirror the object
  # returned by Readability.js `parse()`, converted to snake_case.
  # `keyword_init: true` means construction uses keyword arguments:
  # Result.new(title: "...", content: "...").
  Result = Struct.new(
    :title,          # article title
    :byline,         # author / byline string
    :dir,            # text direction (presumably "ltr"/"rtl" — confirm against the parser)
    :lang,           # document language
    :content,        # extracted article content (HTML)
    :text_content,   # plain-text version of the content
    :length,         # length of the text content
    :excerpt,        # short description / excerpt
    :site_name,      # name of the publishing site
    :published_time, # publication time string
    keyword_init: true
  )
end
@@ -0,0 +1,99 @@
1
+ # frozen_string_literal: true
2
+
3
module Readability
  # Candidate scoring helpers ported from Readability.js. Scores are stored
  # in the @candidates hash (node => { content_score: Numeric }) rather than
  # on the DOM nodes as the JS original does.
  module Scoring
    private

    # Port of _initializeNode (JS line 903)
    # Sets up a node in @candidates with a base score derived from its tag name,
    # then adds the class/id weight.
    def initialize_node(node)
      base_score = case node.name
                   when "div"
                     5
                   when "pre", "td", "blockquote"
                     3
                   when "address", "ol", "ul", "dl", "dd", "dt", "li", "form"
                     -3
                   when "h1", "h2", "h3", "h4", "h5", "h6", "th"
                     -5
                   else
                     0
                   end

      @candidates[node] = { content_score: base_score + get_class_weight(node) }
    end

    # Port of _getClassWeight (JS line 2168)
    # Returns a weight (+/-25 per attribute) based on the node's class and id
    # attributes matching POSITIVE or NEGATIVE regexps. Returns 0 when the
    # FLAG_WEIGHT_CLASSES flag is disabled.
    def get_class_weight(node)
      return 0 unless flag_is_active?(FLAG_WEIGHT_CLASSES)

      weight = 0

      klass = node["class"]
      if klass && !klass.empty?
        weight -= 25 if NEGATIVE.match?(klass)
        weight += 25 if POSITIVE.match?(klass)
      end

      id = node["id"]
      if id && !id.empty?
        weight -= 25 if NEGATIVE.match?(id)
        weight += 25 if POSITIVE.match?(id)
      end

      weight
    end

    # Port of _getLinkDensity (JS line 2143)
    # Returns the ratio of anchor text length to total text length.
    # Fragment-only links (#...) count at 0.3 coefficient.
    def get_link_density(element)
      text_length = get_inner_text(element).length
      return 0 if text_length == 0

      link_length = 0.0

      element.css("a").each do |link_node|
        href = link_node["href"]
        coefficient = href && HASH_URL.match?(href) ? 0.3 : 1.0
        link_length += get_inner_text(link_node).length * coefficient
      end

      link_length / text_length
    end

    # Port of _getTextDensity (JS line 2440)
    # Returns the ratio of text inside elements matching +tags+ to total text
    # in element (whitespace-normalized via get_inner_text(..., true)).
    def get_text_density(element, tags)
      text_length = get_inner_text(element, true).length
      return 0 if text_length == 0

      children_length = 0
      get_all_nodes_with_tag(element, tags).each do |child|
        children_length += get_inner_text(child, true).length
      end

      children_length.to_f / text_length
    end

    # Port of _getCharCount (JS line 2102)
    # Counts occurrences of +separator+ in the element's inner text.
    # Uses String#scan instead of the JS-style `split(s).length - 1`:
    # Ruby's String#split drops trailing empty fields ("a,,".split(",") =>
    # ["a"]) and returns [] for "", which undercounted trailing separators
    # and yielded -1 for empty text; scan counts every occurrence and
    # returns 0 for empty text, matching the JS semantics.
    def get_char_count(element, separator = ",")
      get_inner_text(element).scan(separator).length
    end

    # Returns the content score for a candidate node, defaulting to 0.
    def content_score(node)
      @candidates.dig(node, :content_score) || 0
    end

    # Sets the content score for a candidate node.
    def set_content_score(node, score)
      @candidates[node] ||= {}
      @candidates[node][:content_score] = score
    end
  end
end