readability_js 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 03cb241180cad18709eb90a638563727c1209a5debdb4030685842abab87b86d
4
- data.tar.gz: 7d1db45a012d4b45087201b621b2fad4cd8f8c85581eccea50dd1dc2cd11d671
3
+ metadata.gz: c536792a9e26ab4080c31b6065e02fab9a5ed5536993eeac15e53b0fd1c288fc
4
+ data.tar.gz: 88baef9c54969ca53e2bef99550d9da1445b72b62d8da43250d564c0ed1bc3e5
5
5
  SHA512:
6
- metadata.gz: 02211235463faf2f652d9d04667feb6a2a9bf1575966cec5b35fb8a2c11ce42972d0cf757ba637df8ffcab3bd10d647cf8181f9e2a5a3c0bdea7dae89ece14c0
7
- data.tar.gz: cb77d8c08d3f0487238eff114e3adff58fe18514f9b1027703842e5bc85c8e8212437cc001ceb5de2abbcf6d53e5a7b8f83d8daf7b96c12c4a5c3dff0480a02e
6
+ metadata.gz: 552063ebef5709acb6ca6a9744bf69e8c33499d24d0b9db7d5bcaef53a3429f0b72b7bc7f12350d2e0137ef20c5603489feb731de831044cb2f1cda742168d75
7
+ data.tar.gz: 11b0a652f07d3c722775a7f9fc401af4e20ee2abb8513e60bba44a7b5a1e0e8d9df0bf7477f8e5aa0fdfc1c147186094a610e0aea02757a70f2752845f241265
data/.gitattributes ADDED
@@ -0,0 +1,2 @@
1
+ spec/examples/* linguist-vendored
2
+ node_modules/* linguist-vendored
data/README.md CHANGED
@@ -97,7 +97,8 @@ It contains the data returned by readability, with hash keys transformed in snak
97
97
  "dir" => "ltr",
98
98
  "site_name" => "example.com",
99
99
  "lang" => "en",
100
- "published_time" => "2024-01-01T12:00:00Z"
100
+ "published_time" => "2024-01-01T12:00:00Z",
101
+ "image_url" => "https://example.com/image.jpg" # only for extended parse
101
102
  }
102
103
  ```
103
104
 
@@ -0,0 +1,262 @@
1
+
2
+ module ReadabilityJs
3
+ class Extended
4
+
5
+ SELECTOR_BLACKLIST = [
6
+ ".Article-Partner",
7
+ ".Article-Partner-Text",
8
+ ".Article-Comments-Button",
9
+ "#isl-5-AdCarousel",
10
+ "#isl-10-ArticleComments",
11
+ "*[data-element-tracking-name]",
12
+ "*[aria-label='Anzeige']",
13
+ "nav[aria-label='breadcrumb']",
14
+ # heise
15
+ "a-video",
16
+ "a-gift",
17
+ "a-collapse",
18
+ "a-opt-in",
19
+ # spiegel
20
+ "[data-area='related_articles']",
21
+ # welt
22
+ "nav[aria-label='Breadcrumb']",
23
+ ".c-inline-teaser-list",
24
+ "[width='1'][height='1']",
25
+ # golem
26
+ ".go-alink-list",
27
+ # faz
28
+ "[data-external-selector='related-articles-entries']",
29
+ ".BigBox",
30
+ # frankfurter rundschau
31
+ ".id-Breadcrumb-item",
32
+ ".id-Story-interactionBar",
33
+ "revenue-reel",
34
+ ".id-StoryElement-factBox",
35
+ # stern
36
+ ".breadcrumb",
37
+ ".teaser",
38
+ ".group-teaserblock__items",
39
+ ".title__kicker",
40
+ "ws-adtag",
41
+ # taz
42
+ "[data-for='webelement_bio']",
43
+ "[data-for='webelement_citation']",
44
+ "#articleTeaser",
45
+ ".article-produktteaser-container",
46
+ "[x-data='{}']",
47
+ "#komune",
48
+ ".community",
49
+ ]
50
+
51
+ def self.before_cleanup(html)
52
+ pre_parser html
53
+ end
54
+
55
+ def self.after_cleanup(result, html)
56
+ find_and_add_picture result, html
57
+ clean_up_and_enrich_result result
58
+ end
59
+
60
+ private
61
+
62
+ #
63
+ # Pre-parser to clean up HTML before passing it to Readability
64
+ #
65
+ # SELECTOR_BLACKLIST contains CSS selectors of elements to be removed from the HTML
66
+ # before parsing to improve content extraction.
67
+ #
68
+ # @param html [String] The HTML document as a string.
69
+ # @return [String] The cleaned HTML document as a string.
70
+ #
71
+ def self.pre_parser(html)
72
+ doc = Nokogiri::HTML(html)
73
+ # Remove blacklisted elements by selector
74
+ SELECTOR_BLACKLIST.each do |classname|
75
+ doc.css("#{classname}").remove
76
+ end
77
+ doc.to_html
78
+ end
79
+
80
+ #
81
+ # Post-parser to find and add lead image URL if missing.
82
+ #
83
+ # Will add a picture into the result hash under the key "image_url".
84
+ #
85
+ # Looks for Open Graph and Twitter Card meta tags to find a lead image URL.
86
+ # If not found, it will have a look into the markdown content for the first image.
87
+ #
88
+ # @param result [Hash] The result hash from Readability parsing.
89
+ # @param html [String] The original HTML document as a string.
90
+ # @return [Hash] The updated result hash.
91
+ #
92
+ def self.find_and_add_picture(result, html)
93
+ return result if result.key?("lead_image_url") && !result["lead_image_url"].to_s.strip.empty?
94
+ doc = Nokogiri::HTML(html)
95
+ # try to find og:image or twitter:image meta tags
96
+ meta_tags = doc.css('meta[property="og:image"], meta[name="og:image"], meta[name="twitter:image"]')
97
+ meta_tags.each do |meta_tag|
98
+ content = meta_tag['content']
99
+ if content && !content.strip.empty?
100
+ result["image_url"] = content.strip
101
+ break
102
+ end
103
+ end
104
+ # try to find first image in markdown content if no meta tag found before
105
+ if !result.key?("image_url") || result["image_url"].to_s.strip.empty?
106
+ if result.key?("markdown_content")
107
+ md_content = result["markdown_content"]
108
+ md_content.scan(/!\[.*?\]\((.*?)\)/).each do |match|
109
+ img_url = match[0]
110
+ if img_url && !img_url.strip.empty?
111
+ # check if img ends with common image file extensions
112
+ if img_url =~ /\.(jpg|jpeg|png|gif|webp|svg|tif|avif)(\?.*)?$/i
113
+ result["image_url"] = img_url.strip
114
+ break
115
+ end
116
+ end
117
+ end
118
+ end
119
+ end
120
+ result
121
+ end
122
+
123
+ #
124
+ # Post-parser to clean up extracted content after Readability processing
125
+ #
126
+ # Cleans up comment artifacts and beautifies HTML and adds beautified Markdown content.
127
+ #
128
+ # @param result [Hash] The result hash from Readability parsing.
129
+ # @return [Hash] The cleaned result hash.
130
+ #
131
+ def self.clean_up_and_enrich_result(result)
132
+ result["content"] = clean_up_comments(result["content"]) if result.key?("content")
133
+ result["text_content"] = clean_up_comments(result["text_content"]) if result.key?("text_content")
134
+ result["excerpt"] = clean_up_comments(result["excerpt"]) if result.key?("excerpt")
135
+ result["byline"] = clean_up_comments(result["byline"]) if result.key?("byline")
136
+ if result.key?("content")
137
+ result = beautify_html_and_text(result)
138
+ result["markdown_content"] = ReverseMarkdown.convert(result["content"]) if result.key?("content")
139
+ result = beautify_markdown(result)
140
+ end
141
+ result
142
+ end
143
+
144
+ #
145
+ # Remove/replace comment / artifact noise like <!--[--&gt;, <!----&gt; etc.
146
+ #
147
+ # @param html [String] The HTML content as a string.
148
+ # @return [String] The cleaned HTML content as a string.
149
+ #
150
+ def self.clean_up_comments(html)
151
+ copy = html.dup
152
+ # Turn \x3C before comment start into '<'
153
+ copy.gsub!(/\\x3C(?=!--)/, '<')
154
+ # Decode encoded comment end --&gt; to -->
155
+ copy.gsub!(/--&gt;/, '-->')
156
+ # Remove fully empty or artifact comments ([], only whitespace)
157
+ copy.gsub!(/<!--\s*(?:\[|\]|)*\s*-->/, '')
158
+ # Collapse multiple dummy comment chains
159
+ copy.gsub!(/(?:<!--\s*-->\s*)+/, '')
160
+ # Remove remaining comment artifacts like <!--[-->, <!--]-->
161
+ copy.gsub!(/<!--\[\]-->|<!--\[\s*-->|<!--\]\s*-->/, '')
162
+ # Remove any remaining regular comments
163
+ copy.gsub!(/<!--.*?-->/m, '')
164
+ # Reduce excessive whitespace / blank lines (real newlines)
165
+ copy.gsub!(/\n[ \t]+\n/, "\n")
166
+ copy.gsub!(/\n{3,}/, "\n\n")
167
+ # Remove any remaining script tags (including encoded variants)
168
+ copy.gsub!(/(?:\\x3C|<)script\b[^>]*?(?:>|\\x3E|&gt;).*?(?:\\x3C|<)\/script(?:>|\\x3E|&gt;)/im, '')
169
+ # Preserve blocks where whitespace/newlines matter
170
+ preserve_tags = %w[pre code textarea]
171
+ preserved = {}
172
+ preserve_tags.each_with_index do |tag, idx|
173
+ copy.scan(/<#{tag}[^>]*?>.*?<\/#{tag}>/mi).each do |block|
174
+ key = "__PRESERVE_BLOCK_#{tag.upcase}_#{idx}_#{preserved.size}__"
175
+ preserved[key] = block
176
+ copy.sub!(block, key)
177
+ end
178
+ end
179
+ # Remove literal backslash+n sequences (if they exist as textual artifacts) outside preserved blocks
180
+ copy.gsub!(/\\n\s*/, ' ')
181
+ # Collapse whitespace between tags to a single space or nothing
182
+ # Remove whitespace-only text nodes represented by spaces/newlines between tags
183
+ copy.gsub!(/>\s+</, '><')
184
+ # Normalize multiple spaces to a single space
185
+ copy.gsub!(/ {2,}/, ' ')
186
+ # Trim spaces directly inside tags (e.g., <p> text </p>)
187
+ copy.gsub!(/>\s+([^<])/) { ">#{$1}" }
188
+ # Restore preserved blocks
189
+ preserved.each { |k, v| copy.sub!(k, v) }
190
+ copy.strip
191
+ end
192
+
193
+ #
194
+ # Beautify Markdown content by adding title if not present and fixing link spacing
195
+ #
196
+ # @param result [Hash] The result hash from Readability parsing.
197
+ # @return [Hash] The beautified result hash.
198
+ #
199
+ def self.beautify_markdown(result)
200
+ mark_down = result["markdown_content"]
201
+ # add title to markdown if not present
202
+ if !mark_down.start_with?("# ") && result.key?("title") && !result["title"].to_s.strip.empty? && !mark_down.include?(result["title"])
203
+ mark_down = "# #{result['title']}\n\n" + mark_down
204
+ end
205
+ # Check for image and if none is found, add after title if available
206
+ if result.key?("image_url") && !result["image_url"].to_s.strip.empty?
207
+ has_image = mark_down.match(/!\[.*?\]\(.*?\)/)
208
+ if !has_image
209
+ img_md = "![Lead Image](#{result['image_url']})\n\n"
210
+ mark_down = mark_down.sub(/^# .+?\n/, "\\0" + img_md)
211
+ end
212
+ end
213
+ # Add a space after markdown links if immediately followed by an alphanumeric char (missing separation).
214
+ mark_down.gsub!(/(\[[^\]]+\]\((?:[^\)"']+|"[^"]*"|'[^']*')*\))(?=[A-Za-z0-9ÄÖÜäöüß])/, '\1 ')
215
+ result["markdown_content"] = mark_down
216
+ result
217
+ end
218
+
219
+ #
220
+ # Beautify HTML content by adding title if not present and fixing link spacing
221
+ #
222
+ # @param result [Hash] The result hash from Readability parsing.
223
+ # @return [String] The beautified HTML content as a string.
224
+ #
225
+ def self.beautify_html_and_text(result)
226
+ html = result["content"]
227
+ text = result["text_content"]
228
+ # Add title to html and text if not present
229
+ if (html.index(/h[1-2]/) && html.index(/h[1-2]/).to_i > 128 && result.key?("title") && !result["title"].to_s.strip.empty? && !html.include?(result["title"])) || html.index(/h[1-2]/).nil?
230
+ title_tag = "<h1>#{result['title']}</h1>\n"
231
+ html = title_tag + html
232
+ text = result['title'] + "\n\n" + text
233
+ end
234
+ # Check for image and if none is found, add after title if available
235
+ if result.key?("image_url") && !result["image_url"].to_s.strip.empty?
236
+ doc = Nokogiri::HTML(html)
237
+ # check for img tags but also for picture tags
238
+ has_image = !doc.css('img, picture').empty?
239
+ if !has_image
240
+ img_tag = "<p><img src=\"#{result['image_url']}\" alt=\"Lead Image\"></p>\n"
241
+ h1 = doc.at_css('h1')
242
+ if h1
243
+ h1.add_next_sibling(Nokogiri::HTML::DocumentFragment.parse(img_tag))
244
+ html = doc.to_html
245
+ text = result['image_url'] + "\n\n" + text
246
+ end
247
+ end
248
+ end
249
+ # Add a space after links if immediately followed by an alphanumeric char (missing separation).
250
+ doc = Nokogiri::HTML(html)
251
+ doc.css('a').each do |link|
252
+ next if link.next_sibling.nil?
253
+ if link.next_sibling.text? && link.next_sibling.content =~ /\A[A-Za-z0-9ÄÖÜäöüß]/
254
+ link.add_next_sibling(Nokogiri::XML::Text.new(' ', doc))
255
+ end
256
+ end
257
+ result["content"] = doc.to_html
258
+ result["text_content"] = text
259
+ result
260
+ end
261
+ end
262
+ end
@@ -1,4 +1,4 @@
1
- const { Readability } = require('@mozilla/readability');
1
+ const { Readability, isProbablyReaderable } = require('@mozilla/readability');
2
2
  const { JSDOM } = require('jsdom');
3
3
 
4
4
  const doc = new JSDOM("<body>Look at this cat: <img src='./cat.jpg'></body>", {
@@ -7,4 +7,10 @@ const doc = new JSDOM("<body>Look at this cat: <img src='./cat.jpg'></body>", {
7
7
  let reader = new Readability(doc.window.document);
8
8
  let article = reader.parse();
9
9
 
10
- console.log(article);
10
+ console.log(article);
11
+
12
+ if(isProbablyReaderable(doc.window.document)) {
13
+ console.log("This document is probably readerable.");
14
+ } else {
15
+ console.log("This document is probably not readerable.");
16
+ }
@@ -14,15 +14,22 @@ module ReadabilityJs
14
14
  #
15
15
  def self.parse(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
16
16
  begin
17
- self.new.parse html
17
+ # remove style tags from html, so jsdom does not need to process css and its warnings are not shown
18
+ html = html.gsub(/<style[^>]*>.*?<\/style>/m, '')
19
+ self.new.parse html, url, debug, max_elems_to_parse, nb_top_candidates, char_threshold, classes_to_preserve, keep_classes, disable_json_ld, serializer, allow_video_regex, link_density_modifier
18
20
  rescue ::Nodo::JavaScriptError => e
19
21
  raise ReadabilityJs::Error.new "#{e.message}"
20
22
  end
21
23
  end
22
24
 
23
- def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: 'isNodeVisible')
25
+ #
26
+ # instance wrapper method, as nodo does not support class methods
27
+ #
28
+ def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: nil)
24
29
  begin
25
- self.new.is_probably_readerable html
30
+ # remove style tags from html, so jsdom does not need to process css and its warnings are not shown
31
+ html = html.gsub(/<style[^>]*>.*?<\/style>/m, '')
32
+ self.new.is_probably_readerable html, min_content_length, min_score, visibility_checker
26
33
  rescue ::Nodo::JavaScriptError => e
27
34
  raise ReadabilityJs::Error.new "#{e.message}"
28
35
  end
@@ -57,9 +64,16 @@ module ReadabilityJs
57
64
  JS
58
65
 
59
66
  function :is_probably_readerable, <<~JS
60
- async (html) => {
67
+ async (html, minContentLength, minScore, visibilityChecker) => {
61
68
  const doc = new jsdom.JSDOM(html);
62
- return readability.Readability.isProbablyReaderable(doc);
69
+
70
+ let readability_options = {};
71
+ if(minContentLength !== undefined && minContentLength !== null) readability_options.minContentLength = minContentLength;
72
+ if(minScore !== undefined && minScore !== null) readability_options.minScore = minScore;
73
+ if(visibilityChecker !== undefined && visibilityChecker !== null) {
74
+ readability_options.visibilityChecker = eval(visibilityChecker);
75
+ }
76
+ return readability.isProbablyReaderable(doc.window.document, readability_options);
63
77
  }
64
78
  JS
65
79
 
@@ -1,3 +1,3 @@
1
1
  module ReadabilityJs
2
- VERSION = '0.0.2'.freeze
2
+ VERSION = '0.0.3'.freeze
3
3
  end
@@ -6,6 +6,7 @@ require 'nokogiri'
6
6
 
7
7
  require_relative 'readability_js/version'
8
8
  require_relative 'readability_js/nodo'
9
+ require_relative 'readability_js/extended'
9
10
 
10
11
  require_relative 'custom_errors/error'
11
12
 
@@ -15,56 +16,27 @@ require_relative 'custom_errors/error'
15
16
 
16
17
  module ReadabilityJs
17
18
 
18
- SELECTOR_BLACKLIST = [
19
- ".Article-Partner",
20
- ".Article-Partner-Text",
21
- ".Article-Comments-Button",
22
- "#isl-5-AdCarousel",
23
- "#isl-10-ArticleComments",
24
- "*[data-element-tracking-name]",
25
- "*[aria-label='Anzeige']",
26
- "nav[aria-label='breadcrumb']",
27
- # heise
28
- "a-video",
29
- "a-gift",
30
- "a-collapse",
31
- "a-opt-in",
32
- # spiegel
33
- "[data-area='related_articles']",
34
- # welt
35
- "nav[aria-label='Breadcrumb']",
36
- ".c-inline-teaser-list",
37
- # golem
38
- ".go-alink-list",
39
- # faz
40
- "[data-external-selector='related-articles-entries']",
41
- ".BigBox",
42
- # frankfurter rundschau
43
- ".id-Breadcrumb-item",
44
- ".id-Story-interactionBar",
45
- "revenue-reel",
46
- ".id-StoryElement-factBox",
47
- # stern
48
- ".breadcrumb",
49
- ".teaser",
50
- ".group-teaserblock__items",
51
- ".title__kicker",
52
- # taz
53
- "[data-for='webelement_bio']",
54
- "[data-for='webelement_citation']",
55
- "#articleTeaser",
56
- ".article-produktteaser-container",
57
- "[x-data='{}']",
58
- "#komune",
59
- ".community",
60
- ]
61
-
62
19
  #
63
20
  # Parse a HTML document and extract its main content using Mozilla's Readability library.
64
- # Raises ReadabilityJs::Error on failure.
65
21
  #
66
22
  # 'html' is a required parameter, all others are optional.
67
23
  #
24
+ # @param html [String] The HTML document as a string.
25
+ # @param url [String, nil] The URL of the document (optional, used for resolving relative links).
26
+ # @param debug [Boolean] Enable debug mode (default: false).
27
+ # @param max_elems_to_parse [Integer] Maximum number of elements to parse (default: 0, meaning no limit).
28
+ # @param nb_top_candidates [Integer] Number of top candidates to consider (default: 5).
29
+ # @param char_threshold [Integer] Minimum number of characters for an element to be considered (default: 500).
30
+ # @param classes_to_preserve [Array<String>] List of CSS classes to preserve in the output (default: []).
31
+ # @param keep_classes [Boolean] Whether to keep the original classes in the output (default: false).
32
+ # @param disable_json_ld [Boolean] Disable JSON-LD parsing (default: false).
33
+ # @param serializer [String, nil] Serializer to use for output (optional).
34
+ # @param allow_video_regex [String, nil] Regular expression to allow video URLs (optional).
35
+ # @param link_density_modifier [Float] Modifier for link density calculation (default: 0).
36
+ # @return [Hash] A hash containing the extracted content and metadata.
37
+ #
38
+ # @raise [ReadabilityJs::Error] if an error occurs during execution
39
+ #
68
40
  def self.parse(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
69
41
  begin
70
42
  result = ReadabilityJs::Nodo.parse(html, url: url, debug: debug, max_elems_to_parse: max_elems_to_parse, nb_top_candidates: nb_top_candidates, char_threshold: char_threshold, classes_to_preserve: classes_to_preserve, keep_classes: keep_classes, disable_json_ld: disable_json_ld, serializer: serializer, allow_video_regex: allow_video_regex, link_density_modifier: link_density_modifier)
@@ -74,13 +46,60 @@ module ReadabilityJs
74
46
  end
75
47
  end
76
48
 
49
+ #
50
+ # Like #parse but with additional pre- and post-processing to enhance content extraction.
51
+ #
52
+ # 'html' is a required parameter, all others are optional.
53
+ #
54
+ # @param html [String] The HTML document as a string.
55
+ # @param url [String, nil] The URL of the document (optional, used for resolving relative links).
56
+ # @param debug [Boolean] Enable debug mode (default: false).
57
+ # @param max_elems_to_parse [Integer] Maximum number of elements to parse (default: 0, meaning no limit).
58
+ # @param nb_top_candidates [Integer] Number of top candidates to consider (default: 5).
59
+ # @param char_threshold [Integer] Minimum number of characters for an element to be considered (default: 500).
60
+ # @param classes_to_preserve [Array<String>] List of CSS classes to preserve in the output (default: []).
61
+ # @param keep_classes [Boolean] Whether to keep the original classes in the output (default: false).
62
+ # @param disable_json_ld [Boolean] Disable JSON-LD parsing (default: false).
63
+ # @param serializer [String, nil] Serializer to use for output (optional).
64
+ # @param allow_video_regex [String, nil] Regular expression to allow video URLs (optional).
65
+ # @param link_density_modifier [Float] Modifier for link density calculation (default: 0).
66
+ # @return [Hash] A hash containing the extracted content and metadata.
67
+ #
68
+ # @raise [ReadabilityJs::Error] if an error occurs during execution
69
+ #
77
70
  def self.parse_extended(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
78
- result = pre_parser html
71
+ result = Extended::before_cleanup html
79
72
  result = parse result, url: url, debug: debug, max_elems_to_parse: max_elems_to_parse, nb_top_candidates: nb_top_candidates, char_threshold: char_threshold, classes_to_preserve: classes_to_preserve, keep_classes: keep_classes, disable_json_ld: disable_json_ld, serializer: serializer, allow_video_regex: allow_video_regex, link_density_modifier: link_density_modifier
80
- clean_up_result result
73
+ Extended::after_cleanup result, html
81
74
  end
82
75
 
83
- def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: 'isNodeVisible')
76
+ #
77
+ # Decides whether a document is probably readerable without parsing the whole document.
78
+ #
79
+ # Only 'html' is a required parameter, all others are optional.
80
+ #
81
+ # @param html [String] The HTML document as a string.
82
+ # @param min_content_length [Integer] Minimum content length to consider the document readerable
83
+ # @param min_score [Integer] Minimum score to consider the document readerable
84
+ # @param visibility_checker [String] anonymous JavaScript function definition to check node visibility as string. Uses default visibility checker if not provided.
85
+ # @return [Boolean] true if the document is probably readerable, false otherwise.
86
+ #
87
+ # @raise [ReadabilityJs::Error] if an error occurs during execution
88
+ #
89
+ # @example
90
+ #
91
+ # html = "<html>...</html>"
92
+ #
93
+ # visibility_checker = <<~JS
94
+ # (node) => {
95
+ # const style = node.ownerDocument.defaultView.getComputedStyle(node);
96
+ # return (style && style.display !== 'none' && style.visibility !== 'hidden' && parseFloat(style.opacity) > 0);
97
+ # }
98
+ # JS
99
+ #
100
+ # ReadabilityJs.is_probably_readerable(html, min_content_length: 200, min_score: 25, visibility_checker: visibility_checker)
101
+ #
102
+ def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: nil)
84
103
  begin
85
104
  ReadabilityJs::Nodo.is_probably_readerable(html, min_content_length: min_content_length, min_score: min_score, visibility_checker: visibility_checker)
86
105
  rescue => e
@@ -88,12 +107,45 @@ module ReadabilityJs
88
107
  end
89
108
  end
90
109
 
91
- def self.probably_readerable?(html)
92
- self.is_probably_readerable(html)
110
+
111
+ #
112
+ # Decides whether a document is probably readerable without parsing the whole document.
113
+ #
114
+ # Only 'html' is a required parameter, all others are optional.
115
+ #
116
+ # @param html [String] The HTML document as a string.
117
+ # @param min_content_length [Integer] Minimum content length to consider the document readerable
118
+ # @param min_score [Integer] Minimum score to consider the document readerable
119
+ # @param visibility_checker [String] anonymous JavaScript function definition to check node visibility as string. Uses default visibility checker if not provided.
120
+ # @return [Boolean] true if the document is probably readerable, false otherwise.
121
+ #
122
+ # @raise [ReadabilityJs::Error] if an error occurs during execution
123
+ #
124
+ # @example
125
+ #
126
+ # html = "<html>...</html>"
127
+ #
128
+ # visibility_checker = <<~JS
129
+ # (node) => {
130
+ # const style = node.ownerDocument.defaultView.getComputedStyle(node);
131
+ # return (style && style.display !== 'none' && style.visibility !== 'hidden' && parseFloat(style.opacity) > 0);
132
+ # }
133
+ # JS
134
+ #
135
+ # ReadabilityJs.probably_readerable?(html, min_content_length: 200, min_score: 25, visibility_checker: visibility_checker)
136
+ #
137
+ def self.probably_readerable?(html, min_content_length: 140, min_score: 20, visibility_checker: nil)
138
+ self.is_probably_readerable(html, min_content_length: min_content_length, min_score: min_score, visibility_checker: visibility_checker)
93
139
  end
94
140
 
95
141
  private
96
142
 
143
+ #
144
+ # Normalize result keys to snake_case for ruby style
145
+ #
146
+ # @param result [Hash] The result hash from Readability
147
+ # @return [Hash] The normalized result hash
148
+ #
97
149
  def self.normalize_result(result)
98
150
  result["text_content"] = result.delete("textContent") if result.key?("textContent")
99
151
  result["site_name"] = result.delete("siteName") if result.key?("siteName")
@@ -101,108 +153,5 @@ module ReadabilityJs
101
153
  result
102
154
  end
103
155
 
104
- def self.clean_up_result(result)
105
- result["content"] = clean_up_comments(result["content"]) if result.key?("content")
106
- result["text_content"] = clean_up_comments(result["text_content"]) if result.key?("text_content")
107
- result["excerpt"] = clean_up_comments(result["excerpt"]) if result.key?("excerpt")
108
- result["byline"] = clean_up_comments(result["byline"]) if result.key?("byline")
109
- if result.key?("content")
110
- result["content"] = beautify_html(result["content"])
111
- result["markdown_content"] = ReverseMarkdown.convert(result["content"]) if result.key?("content")
112
- result = beautify_markdown(result)
113
- end
114
- result
115
- end
116
-
117
- # Replaces comment / artifact noise like <!--[--&gt;, <!----&gt; etc.
118
- def self.clean_up_comments(html)
119
- copy = html.dup
120
-
121
- # Turn \x3C before comment start into '<'
122
- copy.gsub!(/\\x3C(?=!--)/, '<')
123
-
124
- # Decode encoded comment end --&gt; to -->
125
- copy.gsub!(/--&gt;/, '-->')
126
-
127
- # Remove fully empty or artifact comments ([], only whitespace)
128
- copy.gsub!(/<!--\s*(?:\[|\]|)*\s*-->/, '')
129
-
130
- # Collapse multiple dummy comment chains
131
- copy.gsub!(/(?:<!--\s*-->\s*)+/, '')
132
-
133
- # Remove remaining comment artifacts like <!--[-->, <!--]-->
134
- copy.gsub!(/<!--\[\]-->|<!--\[\s*-->|<!--\]\s*-->/, '')
135
-
136
- # Remove any remaining regular comments
137
- copy.gsub!(/<!--.*?-->/m, '')
138
-
139
- # Reduce excessive whitespace / blank lines (real newlines)
140
- copy.gsub!(/\n[ \t]+\n/, "\n")
141
- copy.gsub!(/\n{3,}/, "\n\n")
142
-
143
- # Remove any remaining script tags (including encoded variants)
144
- copy.gsub!(/(?:\\x3C|<)script\b[^>]*?(?:>|\\x3E|&gt;).*?(?:\\x3C|<)\/script(?:>|\\x3E|&gt;)/im, '')
145
-
146
- # Preserve blocks where whitespace/newlines matter
147
- preserve_tags = %w[pre code textarea]
148
- preserved = {}
149
- preserve_tags.each_with_index do |tag, idx|
150
- copy.scan(/<#{tag}[^>]*?>.*?<\/#{tag}>/mi).each do |block|
151
- key = "__PRESERVE_BLOCK_#{tag.upcase}_#{idx}_#{preserved.size}__"
152
- preserved[key] = block
153
- copy.sub!(block, key)
154
- end
155
- end
156
-
157
- # Remove literal backslash+n sequences (if they exist as textual artifacts) outside preserved blocks
158
- copy.gsub!(/\\n\s*/, ' ')
159
-
160
- # Collapse whitespace between tags to a single space or nothing
161
- # Remove whitespace-only text nodes represented by spaces/newlines between tags
162
- copy.gsub!(/>\s+</, '><')
163
-
164
- # Normalize multiple spaces to a single space
165
- copy.gsub!(/ {2,}/, ' ')
166
-
167
- # Trim spaces directly inside tags (e.g., <p> text </p>)
168
- copy.gsub!(/>\s+([^<])/) { ">#{$1}" }
169
-
170
- # Restore preserved blocks
171
- preserved.each { |k, v| copy.sub!(k, v) }
172
- copy.strip
173
- end
174
-
175
- def self.beautify_markdown(result)
176
- mark_down = result["markdown_content"]
177
- # add title to markdown if not present
178
- if !mark_down.start_with?("# ") && result.key?("title") && !result["title"].to_s.strip.empty? && !mark_down.include?(result["title"])
179
- mark_down = "# #{result['title']}\n\n" + mark_down
180
- end
181
- # Add a space after markdown links if immediately followed by an alphanumeric char (missing separation).
182
- mark_down.gsub!(/(\[[^\]]+\]\((?:[^\)"']+|"[^"]*"|'[^']*')*\))(?=[A-Za-z0-9ÄÖÜäöüß])/, '\1 ')
183
- result["markdown_content"] = mark_down
184
- result
185
- end
186
-
187
- def self.beautify_html(html)
188
- doc = Nokogiri::HTML(html)
189
- # Add a space after a links if immediately followed by an alphanumeric char (missing separation).
190
- doc.css('a').each do |link|
191
- next if link.next_sibling.nil?
192
- if link.next_sibling.text? && link.next_sibling.content =~ /\A[A-Za-z0-9ÄÖÜäöüß]/
193
- link.add_next_sibling(Nokogiri::XML::Text.new(' ', doc))
194
- end
195
- end
196
- doc.to_html
197
- end
198
-
199
- def self.pre_parser(html)
200
- doc = Nokogiri::HTML(html)
201
- # Remove blacklisted classes
202
- SELECTOR_BLACKLIST.each do |classname|
203
- doc.css("#{classname}").remove
204
- end
205
- doc.to_html
206
- end
207
156
 
208
157
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: readability_js
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthäus Beyrle
@@ -145,6 +145,7 @@ executables: []
145
145
  extensions: []
146
146
  extra_rdoc_files: []
147
147
  files:
148
+ - ".gitattributes"
148
149
  - ".gitignore"
149
150
  - ".rspec"
150
151
  - CHANGELOG.md
@@ -158,6 +159,7 @@ files:
158
159
  - cli/pry.rb
159
160
  - lib/custom_errors/error.rb
160
161
  - lib/readability_js.rb
162
+ - lib/readability_js/extended.rb
161
163
  - lib/readability_js/node/node_modules/.bin/tldts
162
164
  - lib/readability_js/node/node_modules/.yarn-integrity
163
165
  - lib/readability_js/node/node_modules/@asamuzakjp/css-color/LICENSE