readability_js 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 54d488477df33ef1471e6bf880cd39ba345debf2afa6d8cdc55dd2e058187373
4
- data.tar.gz: 1345de9c85515fb049c3dfe4ae97349256a8fa12711265a7b55ddde394340c7b
3
+ metadata.gz: c536792a9e26ab4080c31b6065e02fab9a5ed5536993eeac15e53b0fd1c288fc
4
+ data.tar.gz: 88baef9c54969ca53e2bef99550d9da1445b72b62d8da43250d564c0ed1bc3e5
5
5
  SHA512:
6
- metadata.gz: c3b20a5240e202ce12c7af97418c420dc81ff9de8cf592c415bffeb7da5eeaaa65f1204f443056aece6cf6fa709f87189355340edea0066a194b769b6e1c8d18
7
- data.tar.gz: 50e9905ba059fff544b5dc3b9bc8fb21ba1f9e139c5c9b257b0de3d29b74043ce8ec8ec7424885babe994a5a1bba944c9a3641a48ee67fbfce329943895a759a
6
+ metadata.gz: 552063ebef5709acb6ca6a9744bf69e8c33499d24d0b9db7d5bcaef53a3429f0b72b7bc7f12350d2e0137ef20c5603489feb731de831044cb2f1cda742168d75
7
+ data.tar.gz: 11b0a652f07d3c722775a7f9fc401af4e20ee2abb8513e60bba44a7b5a1e0e8d9df0bf7477f8e5aa0fdfc1c147186094a610e0aea02757a70f2752845f241265
data/.gitattributes ADDED
@@ -0,0 +1,2 @@
1
+ spec/examples/* linguist-vendored
2
+ node_modules/* linguist-vendored
data/README.md CHANGED
@@ -97,7 +97,8 @@ It contains the data returned by readability, with hash keys transformed in snak
97
97
  "dir" => "ltr",
98
98
  "site_name" => "example.com",
99
99
  "lang" => "en",
100
- "published_time" => "2024-01-01T12:00:00Z"
100
+ "published_time" => "2024-01-01T12:00:00Z",
101
+ "image_url" => "https://example.com/image.jpg" # only for extended parse
101
102
  }
102
103
  ```
103
104
 
@@ -0,0 +1,262 @@
1
+
2
+ module ReadabilityJs
3
+ class Extended
4
+
5
+ SELECTOR_BLACKLIST = [
6
+ ".Article-Partner",
7
+ ".Article-Partner-Text",
8
+ ".Article-Comments-Button",
9
+ "#isl-5-AdCarousel",
10
+ "#isl-10-ArticleComments",
11
+ "*[data-element-tracking-name]",
12
+ "*[aria-label='Anzeige']",
13
+ "nav[aria-label='breadcrumb']",
14
+ # heise
15
+ "a-video",
16
+ "a-gift",
17
+ "a-collapse",
18
+ "a-opt-in",
19
+ # spiegel
20
+ "[data-area='related_articles']",
21
+ # welt
22
+ "nav[aria-label='Breadcrumb']",
23
+ ".c-inline-teaser-list",
24
+ "[width='1'][height='1']",
25
+ # golem
26
+ ".go-alink-list",
27
+ # faz
28
+ "[data-external-selector='related-articles-entries']",
29
+ ".BigBox",
30
+ # frankfurter rundschau
31
+ ".id-Breadcrumb-item",
32
+ ".id-Story-interactionBar",
33
+ "revenue-reel",
34
+ ".id-StoryElement-factBox",
35
+ # stern
36
+ ".breadcrumb",
37
+ ".teaser",
38
+ ".group-teaserblock__items",
39
+ ".title__kicker",
40
+ "ws-adtag",
41
+ # taz
42
+ "[data-for='webelement_bio']",
43
+ "[data-for='webelement_citation']",
44
+ "#articleTeaser",
45
+ ".article-produktteaser-container",
46
+ "[x-data='{}']",
47
+ "#komune",
48
+ ".community",
49
+ ]
50
+
51
+ def self.before_cleanup(html)
52
+ pre_parser html
53
+ end
54
+
55
+ def self.after_cleanup(result, html)
56
+ find_and_add_picture result, html
57
+ clean_up_and_enrich_result result
58
+ end
59
+
60
+ private
61
+
62
+ #
63
+ # Pre-parser to clean up HTML before passing it to Readability
64
+ #
65
+ # SELECTOR_BLACKLIST contains CSS selectors of elements to be removed from the HTML
66
+ # before parsing to improve content extraction.
67
+ #
68
+ # @param html [String] The HTML document as a string.
69
+ # @return [String] The cleaned HTML document as a string.
70
+ #
71
+ def self.pre_parser(html)
72
+ doc = Nokogiri::HTML(html)
73
+ # Remove blacklisted elements by selector
74
+ SELECTOR_BLACKLIST.each do |classname|
75
+ doc.css("#{classname}").remove
76
+ end
77
+ doc.to_html
78
+ end
79
+
80
+ #
81
+ # Post-parser to find and add lead image URL if missing.
82
+ #
83
+ # Will add a picture into the result hash under the key "image_url".
84
+ #
85
+ # Looks for Open Graph and Twitter Card meta tags to find a lead image URL.
86
+ # If none is found, it falls back to the first image in the markdown content.
87
+ #
88
+ # @param result [Hash] The result hash from Readability parsing.
89
+ # @param html [String] The original HTML document as a string.
90
+ # @return [Hash] The updated result hash.
91
+ #
92
+ def self.find_and_add_picture(result, html)
93
+ return result if result.key?("lead_image_url") && !result["lead_image_url"].to_s.strip.empty?
94
+ doc = Nokogiri::HTML(html)
95
+ # try to find og:image or twitter:image meta tags
96
+ meta_tags = doc.css('meta[property="og:image"], meta[name="og:image"], meta[name="twitter:image"]')
97
+ meta_tags.each do |meta_tag|
98
+ content = meta_tag['content']
99
+ if content && !content.strip.empty?
100
+ result["image_url"] = content.strip
101
+ break
102
+ end
103
+ end
104
+ # try to find first image in markdown content if no meta tag found before
105
+ if !result.key?("image_url") || result["image_url"].to_s.strip.empty?
106
+ if result.key?("markdown_content")
107
+ md_content = result["markdown_content"]
108
+ md_content.scan(/!\[.*?\]\((.*?)\)/).each do |match|
109
+ img_url = match[0]
110
+ if img_url && !img_url.strip.empty?
111
+ # check if img ends with common image file extensions
112
+ if img_url =~ /\.(jpg|jpeg|png|gif|webp|svg|tif|avif)(\?.*)?$/i
113
+ result["image_url"] = img_url.strip
114
+ break
115
+ end
116
+ end
117
+ end
118
+ end
119
+ end
120
+ result
121
+ end
122
+
123
+ #
124
+ # Post-parser to clean up extracted content after Readability processing
125
+ #
126
+ # Cleans up comment artifacts, beautifies the HTML, and adds beautified Markdown content.
127
+ #
128
+ # @param result [Hash] The result hash from Readability parsing.
129
+ # @return [Hash] The cleaned result hash.
130
+ #
131
+ def self.clean_up_and_enrich_result(result)
132
+ result["content"] = clean_up_comments(result["content"]) if result.key?("content")
133
+ result["text_content"] = clean_up_comments(result["text_content"]) if result.key?("text_content")
134
+ result["excerpt"] = clean_up_comments(result["excerpt"]) if result.key?("excerpt")
135
+ result["byline"] = clean_up_comments(result["byline"]) if result.key?("byline")
136
+ if result.key?("content")
137
+ result = beautify_html_and_text(result)
138
+ result["markdown_content"] = ReverseMarkdown.convert(result["content"]) if result.key?("content")
139
+ result = beautify_markdown(result)
140
+ end
141
+ result
142
+ end
143
+
144
+ #
145
+ # Remove/replace comment / artifact noise like <!--[--&gt;, <!----&gt; etc.
146
+ #
147
+ # @param html [String] The HTML content as a string.
148
+ # @return [String] The cleaned HTML content as a string.
149
+ #
150
+ def self.clean_up_comments(html)
151
+ copy = html.dup
152
+ # Turn \x3C before comment start into '<'
153
+ copy.gsub!(/\\x3C(?=!--)/, '<')
154
+ # Decode encoded comment end --&gt; to -->
155
+ copy.gsub!(/--&gt;/, '-->')
156
+ # Remove fully empty or artifact comments ([], only whitespace)
157
+ copy.gsub!(/<!--\s*(?:\[|\]|)*\s*-->/, '')
158
+ # Collapse multiple dummy comment chains
159
+ copy.gsub!(/(?:<!--\s*-->\s*)+/, '')
160
+ # Remove remaining comment artifacts like <!--[-->, <!--]-->
161
+ copy.gsub!(/<!--\[\]-->|<!--\[\s*-->|<!--\]\s*-->/, '')
162
+ # Remove any remaining regular comments
163
+ copy.gsub!(/<!--.*?-->/m, '')
164
+ # Reduce excessive whitespace / blank lines (real newlines)
165
+ copy.gsub!(/\n[ \t]+\n/, "\n")
166
+ copy.gsub!(/\n{3,}/, "\n\n")
167
+ # Remove any remaining script tags (including encoded variants)
168
+ copy.gsub!(/(?:\\x3C|<)script\b[^>]*?(?:>|\\x3E|&gt;).*?(?:\\x3C|<)\/script(?:>|\\x3E|&gt;)/im, '')
169
+ # Preserve blocks where whitespace/newlines matter
170
+ preserve_tags = %w[pre code textarea]
171
+ preserved = {}
172
+ preserve_tags.each_with_index do |tag, idx|
173
+ copy.scan(/<#{tag}[^>]*?>.*?<\/#{tag}>/mi).each do |block|
174
+ key = "__PRESERVE_BLOCK_#{tag.upcase}_#{idx}_#{preserved.size}__"
175
+ preserved[key] = block
176
+ copy.sub!(block, key)
177
+ end
178
+ end
179
+ # Remove literal backslash+n sequences (if they exist as textual artifacts) outside preserved blocks
180
+ copy.gsub!(/\\n\s*/, ' ')
181
+ # Collapse whitespace between adjacent tags to nothing
182
+ # Remove whitespace-only text nodes represented by spaces/newlines between tags
183
+ copy.gsub!(/>\s+</, '><')
184
+ # Normalize multiple spaces to a single space
185
+ copy.gsub!(/ {2,}/, ' ')
186
+ # Trim spaces directly inside tags (e.g., <p> text </p>)
187
+ copy.gsub!(/>\s+([^<])/) { ">#{$1}" }
188
+ # Restore preserved blocks
189
+ preserved.each { |k, v| copy.sub!(k, v) }
190
+ copy.strip
191
+ end
192
+
193
+ #
194
+ # Beautify Markdown content by adding the title and lead image if not present and fixing link spacing
195
+ #
196
+ # @param result [Hash] The result hash from Readability parsing.
197
+ # @return [Hash] The beautified result hash.
198
+ #
199
+ def self.beautify_markdown(result)
200
+ mark_down = result["markdown_content"]
201
+ # add title to markdown if not present
202
+ if !mark_down.start_with?("# ") && result.key?("title") && !result["title"].to_s.strip.empty? && !mark_down.include?(result["title"])
203
+ mark_down = "# #{result['title']}\n\n" + mark_down
204
+ end
205
+ # Check for image and if none is found, add after title if available
206
+ if result.key?("image_url") && !result["image_url"].to_s.strip.empty?
207
+ has_image = mark_down.match(/!\[.*?\]\(.*?\)/)
208
+ if !has_image
209
+ img_md = "![Lead Image](#{result['image_url']})\n\n"
210
+ mark_down = mark_down.sub(/^# .+?\n/, "\\0" + img_md)
211
+ end
212
+ end
213
+ # Add a space after markdown links if immediately followed by an alphanumeric char (missing separation).
214
+ mark_down.gsub!(/(\[[^\]]+\]\((?:[^\)"']+|"[^"]*"|'[^']*')*\))(?=[A-Za-z0-9ÄÖÜäöüß])/, '\1 ')
215
+ result["markdown_content"] = mark_down
216
+ result
217
+ end
218
+
219
+ #
220
+ # Beautify HTML and text content by adding the title and lead image if not present and fixing link spacing
221
+ #
222
+ # @param result [Hash] The result hash from Readability parsing.
223
+ # @return [Hash] The result hash with beautified "content" and "text_content".
224
+ #
225
+ def self.beautify_html_and_text(result)
226
+ html = result["content"]
227
+ text = result["text_content"]
228
+ # Add title to html and text if not present
229
+ if (html.index(/h[1-2]/) && html.index(/h[1-2]/).to_i > 128 && result.key?("title") && !result["title"].to_s.strip.empty? && !html.include?(result["title"])) || html.index(/h[1-2]/).nil?
230
+ title_tag = "<h1>#{result['title']}</h1>\n"
231
+ html = title_tag + html
232
+ text = result['title'] + "\n\n" + text
233
+ end
234
+ # Check for image and if none is found, add after title if available
235
+ if result.key?("image_url") && !result["image_url"].to_s.strip.empty?
236
+ doc = Nokogiri::HTML(html)
237
+ # check for img tags but also for picture tags
238
+ has_image = !doc.css('img, picture').empty?
239
+ if !has_image
240
+ img_tag = "<p><img src=\"#{result['image_url']}\" alt=\"Lead Image\"></p>\n"
241
+ h1 = doc.at_css('h1')
242
+ if h1
243
+ h1.add_next_sibling(Nokogiri::HTML::DocumentFragment.parse(img_tag))
244
+ html = doc.to_html
245
+ text = result['image_url'] + "\n\n" + text
246
+ end
247
+ end
248
+ end
249
+ # Add a space after links (<a> elements) that are immediately followed by an alphanumeric char (missing separation).
250
+ doc = Nokogiri::HTML(html)
251
+ doc.css('a').each do |link|
252
+ next if link.next_sibling.nil?
253
+ if link.next_sibling.text? && link.next_sibling.content =~ /\A[A-Za-z0-9ÄÖÜäöüß]/
254
+ link.add_next_sibling(Nokogiri::XML::Text.new(' ', doc))
255
+ end
256
+ end
257
+ result["content"] = doc.to_html
258
+ result["text_content"] = text
259
+ result
260
+ end
261
+ end
262
+ end
@@ -1,4 +1,4 @@
1
- const { Readability } = require('@mozilla/readability');
1
+ const { Readability, isProbablyReaderable } = require('@mozilla/readability');
2
2
  const { JSDOM } = require('jsdom');
3
3
 
4
4
  const doc = new JSDOM("<body>Look at this cat: <img src='./cat.jpg'></body>", {
@@ -7,4 +7,10 @@ const doc = new JSDOM("<body>Look at this cat: <img src='./cat.jpg'></body>", {
7
7
  let reader = new Readability(doc.window.document);
8
8
  let article = reader.parse();
9
9
 
10
- console.log(article);
10
+ console.log(article);
11
+
12
+ if(isProbablyReaderable(doc.window.document)) {
13
+ console.log("This document is probably readerable.");
14
+ } else {
15
+ console.log("This document is probably not readerable.");
16
+ }
@@ -14,15 +14,22 @@ module ReadabilityJs
14
14
  #
15
15
  def self.parse(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
16
16
  begin
17
- self.new.parse html
17
+ # remove style tags from html, so jsdom does not need to process css and its warnings are not shown
18
+ html = html.gsub(/<style[^>]*>.*?<\/style>/m, '')
19
+ self.new.parse html, url, debug, max_elems_to_parse, nb_top_candidates, char_threshold, classes_to_preserve, keep_classes, disable_json_ld, serializer, allow_video_regex, link_density_modifier
18
20
  rescue ::Nodo::JavaScriptError => e
19
21
  raise ReadabilityJs::Error.new "#{e.message}"
20
22
  end
21
23
  end
22
24
 
23
- def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: 'isNodeVisible')
25
+ #
26
+ # class-level wrapper around the instance method, as nodo does not support class methods
27
+ #
28
+ def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: nil)
24
29
  begin
25
- self.new.is_probably_readerable html
30
+ # remove style tags from html, so jsdom does not need to process css and its warnings are not shown
31
+ html = html.gsub(/<style[^>]*>.*?<\/style>/m, '')
32
+ self.new.is_probably_readerable html, min_content_length, min_score, visibility_checker
26
33
  rescue ::Nodo::JavaScriptError => e
27
34
  raise ReadabilityJs::Error.new "#{e.message}"
28
35
  end
@@ -57,9 +64,16 @@ module ReadabilityJs
57
64
  JS
58
65
 
59
66
  function :is_probably_readerable, <<~JS
60
- async (html) => {
67
+ async (html, minContentLength, minScore, visibilityChecker) => {
61
68
  const doc = new jsdom.JSDOM(html);
62
- return readability.Readability.isProbablyReaderable(doc);
69
+
70
+ let readability_options = {};
71
+ if(minContentLength !== undefined && minContentLength !== null) readability_options.minContentLength = minContentLength;
72
+ if(minScore !== undefined && minScore !== null) readability_options.minScore = minScore;
73
+ if(visibilityChecker !== undefined && visibilityChecker !== null) {
74
+ readability_options.visibilityChecker = eval(visibilityChecker);
75
+ }
76
+ return readability.isProbablyReaderable(doc.window.document, readability_options);
63
77
  }
64
78
  JS
65
79
 
@@ -1,3 +1,3 @@
1
1
  module ReadabilityJs
2
- VERSION = '0.0.1'.freeze
2
+ VERSION = '0.0.3'.freeze
3
3
  end
@@ -6,6 +6,7 @@ require 'nokogiri'
6
6
 
7
7
  require_relative 'readability_js/version'
8
8
  require_relative 'readability_js/nodo'
9
+ require_relative 'readability_js/extended'
9
10
 
10
11
  require_relative 'custom_errors/error'
11
12
 
@@ -15,55 +16,27 @@ require_relative 'custom_errors/error'
15
16
 
16
17
  module ReadabilityJs
17
18
 
18
- SELECTOR_BLACKLIST = [
19
- ".Article-Partner",
20
- ".Article-Partner-Text",
21
- ".Article-Comments-Button",
22
- "#isl-5-AdCarousel",
23
- "#isl-10-ArticleComments",
24
- "*[data-element-tracking-name]",
25
- "*[aria-label='Anzeige']",
26
- "nav[aria-label='breadcrumb']",
27
- # heise
28
- "a-video",
29
- "a-gift",
30
- "a-collapse",
31
- # spiegel
32
- "[data-area='related_articles']",
33
- # welt
34
- "nav[aria-label='Breadcrumb']",
35
- ".c-inline-teaser-list",
36
- # golem
37
- ".go-alink-list",
38
- # faz
39
- "[data-external-selector='related-articles-entries']",
40
- ".BigBox",
41
- # frankfurter rundschau
42
- ".id-Breadcrumb-item",
43
- ".id-Story-interactionBar",
44
- "revenue-reel",
45
- ".id-StoryElement-factBox",
46
- # stern
47
- ".breadcrumb",
48
- ".teaser",
49
- ".group-teaserblock__items",
50
- ".title__kicker",
51
- # taz
52
- "[data-for='webelement_bio']",
53
- "[data-for='webelement_citation']",
54
- "#articleTeaser",
55
- ".article-produktteaser-container",
56
- "[x-data='{}']",
57
- "#komune",
58
- ".community",
59
- ]
60
-
61
19
  #
62
20
  # Parse an HTML document and extract its main content using Mozilla's Readability library.
63
- # Raises ReadabilityJs::Error on failure.
64
21
  #
65
22
  # Only 'html' is a required parameter, all others are optional.
66
23
  #
24
+ # @param html [String] The HTML document as a string.
25
+ # @param url [String, nil] The URL of the document (optional, used for resolving relative links).
26
+ # @param debug [Boolean] Enable debug mode (default: false).
27
+ # @param max_elems_to_parse [Integer] Maximum number of elements to parse (default: 0, meaning no limit).
28
+ # @param nb_top_candidates [Integer] Number of top candidates to consider (default: 5).
29
+ # @param char_threshold [Integer] Minimum number of characters for an element to be considered (default: 500).
30
+ # @param classes_to_preserve [Array<String>] List of CSS classes to preserve in the output (default: []).
31
+ # @param keep_classes [Boolean] Whether to keep the original classes in the output (default: false).
32
+ # @param disable_json_ld [Boolean] Disable JSON-LD parsing (default: false).
33
+ # @param serializer [String, nil] Serializer to use for output (optional).
34
+ # @param allow_video_regex [String, nil] Regular expression to allow video URLs (optional).
35
+ # @param link_density_modifier [Float] Modifier for link density calculation (default: 0).
36
+ # @return [Hash] A hash containing the extracted content and metadata.
37
+ #
38
+ # @raise [ReadabilityJs::Error] if an error occurs during execution
39
+ #
67
40
  def self.parse(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
68
41
  begin
69
42
  result = ReadabilityJs::Nodo.parse(html, url: url, debug: debug, max_elems_to_parse: max_elems_to_parse, nb_top_candidates: nb_top_candidates, char_threshold: char_threshold, classes_to_preserve: classes_to_preserve, keep_classes: keep_classes, disable_json_ld: disable_json_ld, serializer: serializer, allow_video_regex: allow_video_regex, link_density_modifier: link_density_modifier)
@@ -73,13 +46,60 @@ module ReadabilityJs
73
46
  end
74
47
  end
75
48
 
49
+ #
50
+ # Like #parse but with additional pre- and post-processing to enhance content extraction.
51
+ #
52
+ # Only 'html' is a required parameter, all others are optional.
53
+ #
54
+ # @param html [String] The HTML document as a string.
55
+ # @param url [String, nil] The URL of the document (optional, used for resolving relative links).
56
+ # @param debug [Boolean] Enable debug mode (default: false).
57
+ # @param max_elems_to_parse [Integer] Maximum number of elements to parse (default: 0, meaning no limit).
58
+ # @param nb_top_candidates [Integer] Number of top candidates to consider (default: 5).
59
+ # @param char_threshold [Integer] Minimum number of characters for an element to be considered (default: 500).
60
+ # @param classes_to_preserve [Array<String>] List of CSS classes to preserve in the output (default: []).
61
+ # @param keep_classes [Boolean] Whether to keep the original classes in the output (default: false).
62
+ # @param disable_json_ld [Boolean] Disable JSON-LD parsing (default: false).
63
+ # @param serializer [String, nil] Serializer to use for output (optional).
64
+ # @param allow_video_regex [String, nil] Regular expression to allow video URLs (optional).
65
+ # @param link_density_modifier [Float] Modifier for link density calculation (default: 0).
66
+ # @return [Hash] A hash containing the extracted content and metadata.
67
+ #
68
+ # @raise [ReadabilityJs::Error] if an error occurs during execution
69
+ #
76
70
  def self.parse_extended(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
77
- result = pre_parser html
71
+ result = Extended::before_cleanup html
78
72
  result = parse result, url: url, debug: debug, max_elems_to_parse: max_elems_to_parse, nb_top_candidates: nb_top_candidates, char_threshold: char_threshold, classes_to_preserve: classes_to_preserve, keep_classes: keep_classes, disable_json_ld: disable_json_ld, serializer: serializer, allow_video_regex: allow_video_regex, link_density_modifier: link_density_modifier
79
- clean_up_result result
73
+ Extended::after_cleanup result, html
80
74
  end
81
75
 
82
- def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: 'isNodeVisible')
76
+ #
77
+ # Decides whether a document is probably readerable without parsing the whole document.
78
+ #
79
+ # Only 'html' is a required parameter, all others are optional.
80
+ #
81
+ # @param html [String] The HTML document as a string.
82
+ # @param min_content_length [Integer] Minimum content length to consider the document readerable
83
+ # @param min_score [Integer] Minimum score to consider the document readerable
84
+ # @param visibility_checker [String, nil] Anonymous JavaScript function definition, passed as a string, used to check node visibility. The default visibility checker is used if nil or not provided.
85
+ # @return [Boolean] true if the document is probably readerable, false otherwise.
86
+ #
87
+ # @raise [ReadabilityJs::Error] if an error occurs during execution
88
+ #
89
+ # @example
90
+ #
91
+ # html = "<html>...</html>"
92
+ #
93
+ # visibility_checker = <<~JS
94
+ # (node) => {
95
+ # const style = node.ownerDocument.defaultView.getComputedStyle(node);
96
+ # return (style && style.display !== 'none' && style.visibility !== 'hidden' && parseFloat(style.opacity) > 0);
97
+ # }
98
+ # JS
99
+ #
100
+ # ReadabilityJs.is_probably_readerable(html, min_content_length: 200, min_score: 25, visibility_checker: visibility_checker)
101
+ #
102
+ def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: nil)
83
103
  begin
84
104
  ReadabilityJs::Nodo.is_probably_readerable(html, min_content_length: min_content_length, min_score: min_score, visibility_checker: visibility_checker)
85
105
  rescue => e
@@ -87,12 +107,45 @@ module ReadabilityJs
87
107
  end
88
108
  end
89
109
 
90
- def self.probably_readerable?(html)
91
- self.is_probably_readerable(html)
110
+
111
+ #
112
+ # Decides whether a document is probably readerable without parsing the whole document.
113
+ #
114
+ # Only 'html' is a required parameter, all others are optional.
115
+ #
116
+ # @param html [String] The HTML document as a string.
117
+ # @param min_content_length [Integer] Minimum content length to consider the document readerable
118
+ # @param min_score [Integer] Minimum score to consider the document readerable
119
+ # @param visibility_checker [String, nil] Anonymous JavaScript function definition, passed as a string, used to check node visibility. The default visibility checker is used if nil or not provided.
120
+ # @return [Boolean] true if the document is probably readerable, false otherwise.
121
+ #
122
+ # @raise [ReadabilityJs::Error] if an error occurs during execution
123
+ #
124
+ # @example
125
+ #
126
+ # html = "<html>...</html>"
127
+ #
128
+ # visibility_checker = <<~JS
129
+ # (node) => {
130
+ # const style = node.ownerDocument.defaultView.getComputedStyle(node);
131
+ # return (style && style.display !== 'none' && style.visibility !== 'hidden' && parseFloat(style.opacity) > 0);
132
+ # }
133
+ # JS
134
+ #
135
+ # ReadabilityJs.probably_readerable?(html, min_content_length: 200, min_score: 25, visibility_checker: visibility_checker)
136
+ #
137
+ def self.probably_readerable?(html, min_content_length: 140, min_score: 20, visibility_checker: nil)
138
+ self.is_probably_readerable(html, min_content_length: min_content_length, min_score: min_score, visibility_checker: visibility_checker)
92
139
  end
93
140
 
94
141
  private
95
142
 
143
+ #
144
+ # Normalize result keys to snake_case for ruby style
145
+ #
146
+ # @param result [Hash] The result hash from Readability
147
+ # @return [Hash] The normalized result hash
148
+ #
96
149
  def self.normalize_result(result)
97
150
  result["text_content"] = result.delete("textContent") if result.key?("textContent")
98
151
  result["site_name"] = result.delete("siteName") if result.key?("siteName")
@@ -100,108 +153,5 @@ module ReadabilityJs
100
153
  result
101
154
  end
102
155
 
103
- def self.clean_up_result(result)
104
- result["content"] = clean_up_comments(result["content"]) if result.key?("content")
105
- result["text_content"] = clean_up_comments(result["text_content"]) if result.key?("text_content")
106
- result["excerpt"] = clean_up_comments(result["excerpt"]) if result.key?("excerpt")
107
- result["byline"] = clean_up_comments(result["byline"]) if result.key?("byline")
108
- if result.key?("content")
109
- result["content"] = beautify_html(result["content"])
110
- result["markdown_content"] = ReverseMarkdown.convert(result["content"]) if result.key?("content")
111
- result = beautify_markdown(result)
112
- end
113
- result
114
- end
115
-
116
- # Replaces comment / artifact noise like <!--[--&gt;, <!----&gt; etc.
117
- def self.clean_up_comments(html)
118
- copy = html.dup
119
-
120
- # Turn \x3C before comment start into '<'
121
- copy.gsub!(/\\x3C(?=!--)/, '<')
122
-
123
- # Decode encoded comment end --&gt; to -->
124
- copy.gsub!(/--&gt;/, '-->')
125
-
126
- # Remove fully empty or artifact comments ([], only whitespace)
127
- copy.gsub!(/<!--\s*(?:\[|\]|)*\s*-->/, '')
128
-
129
- # Collapse multiple dummy comment chains
130
- copy.gsub!(/(?:<!--\s*-->\s*)+/, '')
131
-
132
- # Remove remaining comment artifacts like <!--[-->, <!--]-->
133
- copy.gsub!(/<!--\[\]-->|<!--\[\s*-->|<!--\]\s*-->/, '')
134
-
135
- # Remove any remaining regular comments
136
- copy.gsub!(/<!--.*?-->/m, '')
137
-
138
- # Reduce excessive whitespace / blank lines (real newlines)
139
- copy.gsub!(/\n[ \t]+\n/, "\n")
140
- copy.gsub!(/\n{3,}/, "\n\n")
141
-
142
- # Remove any remaining script tags (including encoded variants)
143
- copy.gsub!(/(?:\\x3C|<)script\b[^>]*?(?:>|\\x3E|&gt;).*?(?:\\x3C|<)\/script(?:>|\\x3E|&gt;)/im, '')
144
-
145
- # Preserve blocks where whitespace/newlines matter
146
- preserve_tags = %w[pre code textarea]
147
- preserved = {}
148
- preserve_tags.each_with_index do |tag, idx|
149
- copy.scan(/<#{tag}[^>]*?>.*?<\/#{tag}>/mi).each do |block|
150
- key = "__PRESERVE_BLOCK_#{tag.upcase}_#{idx}_#{preserved.size}__"
151
- preserved[key] = block
152
- copy.sub!(block, key)
153
- end
154
- end
155
-
156
- # Remove literal backslash+n sequences (if they exist as textual artifacts) outside preserved blocks
157
- copy.gsub!(/\\n\s*/, ' ')
158
-
159
- # Collapse whitespace between tags to a single space or nothing
160
- # Remove whitespace-only text nodes represented by spaces/newlines between tags
161
- copy.gsub!(/>\s+</, '><')
162
-
163
- # Normalize multiple spaces to a single space
164
- copy.gsub!(/ {2,}/, ' ')
165
-
166
- # Trim spaces directly inside tags (e.g., <p> text </p>)
167
- copy.gsub!(/>\s+([^<])/) { ">#{$1}" }
168
-
169
- # Restore preserved blocks
170
- preserved.each { |k, v| copy.sub!(k, v) }
171
- copy.strip
172
- end
173
-
174
- def self.beautify_markdown(result)
175
- mark_down = result["markdown_content"]
176
- # add title to markdown if not present
177
- if !mark_down.start_with?("# ") && result.key?("title") && !result["title"].to_s.strip.empty? && !mark_down.include?(result["title"])
178
- mark_down = "# #{result['title']}\n\n" + mark_down
179
- end
180
- # Add a space after markdown links if immediately followed by an alphanumeric char (missing separation).
181
- mark_down.gsub!(/(\[[^\]]+\]\((?:[^\)"']+|"[^"]*"|'[^']*')*\))(?=[A-Za-z0-9ÄÖÜäöüß])/, '\1 ')
182
- result["markdown_content"] = mark_down
183
- result
184
- end
185
-
186
- def self.beautify_html(html)
187
- doc = Nokogiri::HTML(html)
188
- # Add a space after a links if immediately followed by an alphanumeric char (missing separation).
189
- doc.css('a').each do |link|
190
- next if link.next_sibling.nil?
191
- if link.next_sibling.text? && link.next_sibling.content =~ /\A[A-Za-z0-9ÄÖÜäöüß]/
192
- link.add_next_sibling(Nokogiri::XML::Text.new(' ', doc))
193
- end
194
- end
195
- doc.to_html
196
- end
197
-
198
- def self.pre_parser(html)
199
- doc = Nokogiri::HTML(html)
200
- # Remove blacklisted classes
201
- SELECTOR_BLACKLIST.each do |classname|
202
- doc.css("#{classname}").remove
203
- end
204
- doc.to_html
205
- end
206
156
 
207
157
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: readability_js
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthäus Beyrle
@@ -145,6 +145,7 @@ executables: []
145
145
  extensions: []
146
146
  extra_rdoc_files: []
147
147
  files:
148
+ - ".gitattributes"
148
149
  - ".gitignore"
149
150
  - ".rspec"
150
151
  - CHANGELOG.md
@@ -158,6 +159,7 @@ files:
158
159
  - cli/pry.rb
159
160
  - lib/custom_errors/error.rb
160
161
  - lib/readability_js.rb
162
+ - lib/readability_js/extended.rb
161
163
  - lib/readability_js/node/node_modules/.bin/tldts
162
164
  - lib/readability_js/node/node_modules/.yarn-integrity
163
165
  - lib/readability_js/node/node_modules/@asamuzakjp/css-color/LICENSE