readability_js 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +2 -0
- data/CHANGELOG.md +6 -0
- data/README.md +4 -3
- data/lib/readability_js/extended.rb +261 -0
- data/lib/readability_js/node/readability-example.js +8 -2
- data/lib/readability_js/nodo.rb +19 -5
- data/lib/readability_js/version.rb +1 -1
- data/lib/readability_js.rb +102 -153
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 8ce8fc0727f6b8ce1bfc45cf586b58c0e895deb95ba8f5f341a0d41ed6e9a9ba
|
|
4
|
+
data.tar.gz: 5baf642f9053d3c0adb9b81e04f9ff0d4b0252725274827143340fcb6f4067cf
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3ee241f68497574ea9477afa66e1b605007446f36ea29fcd4760581891e7482b167249e2dd502254e4407bee1cb22501eea40b74be920a4097de656f808690ac
|
|
7
|
+
data.tar.gz: 8d8d6a3d85108e590762ed17f70f12142d8aca5787abaf8cb3fc5160b47aa5e7397605bf1bcbedb17fd3761335862223c45426e3be13d3bdfe1c288084396690
|
data/.gitattributes
ADDED
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
|
@@ -69,7 +69,7 @@ and includes a beautified markdown version of the content.
|
|
|
69
69
|
```
|
|
70
70
|
|
|
71
71
|
### Query parameters
|
|
72
|
-
You can pass all parameters supported by readability, checkout the [rubydoc for more details](https://
|
|
72
|
+
You can pass all parameters supported by readability; check out the [rubydoc for more details](https://rubydoc.info/github/magynhard/ruby-readability_js/ReadabilityJs).
|
|
73
73
|
|
|
74
74
|
Here is an example with all parameters; the camelCase parameters are converted to snake_case in Ruby:
|
|
75
75
|
|
|
@@ -97,14 +97,15 @@ It contains the data returned by readability, with hash keys transformed in snak
|
|
|
97
97
|
"dir" => "ltr",
|
|
98
98
|
"site_name" => "example.com",
|
|
99
99
|
"lang" => "en",
|
|
100
|
-
"published_time" => "2024-01-01T12:00:00Z"
|
|
100
|
+
"published_time" => "2024-01-01T12:00:00Z",
|
|
101
|
+
"image_url" => "https://example.com/image.jpg" # only for extended parse
|
|
101
102
|
}
|
|
102
103
|
```
|
|
103
104
|
|
|
104
105
|
<a name="documentation"></a>
|
|
105
106
|
## Documentation
|
|
106
107
|
Check out the doc at RubyDoc:<br>
|
|
107
|
-
https://
|
|
108
|
+
https://rubydoc.info/github/magynhard/ruby-readability_js
|
|
108
109
|
|
|
109
110
|
|
|
110
111
|
As this library is only a wrapper, check out the original readability documentation:<br>
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
|
|
2
|
+
module ReadabilityJs
|
|
3
|
+
class Extended
|
|
4
|
+
|
|
5
|
+
# CSS selectors of elements that pre_parser removes from the HTML before it
# is handed to Readability. Entries are grouped by the site they were added for.
#
# Frozen so the shared list cannot be mutated by accident at runtime;
# dup it first if a customised list is needed.
SELECTOR_BLACKLIST = [
  ".Article-Partner",
  ".Article-Partner-Text",
  ".Article-Comments-Button",
  "#isl-5-AdCarousel",
  "#isl-10-ArticleComments",
  "*[data-element-tracking-name]",
  "*[aria-label='Anzeige']",
  "nav[aria-label='breadcrumb']",
  # heise
  "a-video",
  "a-gift",
  "a-collapse",
  "a-opt-in",
  # spiegel
  "[data-area='related_articles']",
  # welt
  "nav[aria-label='Breadcrumb']",
  ".c-inline-teaser-list",
  "[width='1'][height='1']",
  # golem
  ".go-alink-list",
  # faz
  "[data-external-selector='related-articles-entries']",
  ".BigBox",
  # frankfurter rundschau
  ".id-Breadcrumb-item",
  ".id-Story-interactionBar",
  "revenue-reel",
  ".id-StoryElement-factBox",
  # stern
  ".breadcrumb",
  ".teaser",
  ".group-teaserblock__items",
  ".title__kicker",
  "ws-adtag",
  # taz
  "[data-for='webelement_bio']",
  "[data-for='webelement_citation']",
  "#articleTeaser",
  ".article-produktteaser-container",
  "[x-data='{}']",
  "#komune",
  ".community",
].freeze
|
|
50
|
+
|
|
51
|
+
#
# Pre-processing hook: delegates to pre_parser and returns its result.
#
# @param html [String] The HTML document as a string.
# @return [String] The HTML returned by pre_parser.
#
def self.before_cleanup(html)
  pre_parser(html)
end
|
|
54
|
+
|
|
55
|
+
#
# Post-processing hook: first tries to attach an image URL to the result,
# then cleans up and enriches the result.
#
# NOTE(review): find_and_add_picture runs before clean_up_and_enrich_result
# writes "markdown_content", so its markdown fallback only sees markdown that
# is already present in result at this point - confirm this order is intended.
#
# @param result [Hash] The result hash from Readability parsing.
# @param html [String] The HTML document used to look up image meta tags.
# @return [Hash] The value returned by clean_up_and_enrich_result.
#
def self.after_cleanup(result, html)
  # return value intentionally unused: result is updated in place
  find_and_add_picture(result, html)
  clean_up_and_enrich_result(result)
end
|
|
59
|
+
|
|
60
|
+
private
|
|
61
|
+
|
|
62
|
+
#
# Strip blacklisted elements from the HTML before Readability sees it.
#
# Every selector in SELECTOR_BLACKLIST is queried on its own and all matching
# nodes are removed from the parsed document, which is then serialized again.
#
# NOTE(review): the bare `private` above does not change the visibility of
# methods defined with `def self.`; private_class_method would be needed if
# these helpers are meant to be private.
#
# @param html [String] The HTML document as a string.
# @return [String] The cleaned HTML document as a string.
#
def self.pre_parser(html)
  document = Nokogiri::HTML(html)
  SELECTOR_BLACKLIST.each { |selector| document.css(selector).remove }
  document.to_html
end
|
|
79
|
+
|
|
80
|
+
#
# Try to determine an image URL and store it under result["image_url"].
#
# Lookup order:
# 1. The first og:image / twitter:image meta tag with a non-blank content attribute.
# 2. Otherwise the first markdown image in result["markdown_content"] whose URL
#    ends in a common image file extension (optionally followed by a query string).
#
# Nothing is done when result already carries a non-blank "lead_image_url".
# NOTE(review): the guard reads "lead_image_url" while the method writes
# "image_url" - confirm the differing keys are intended.
#
# @param result [Hash] The result hash from Readability parsing (updated in place).
# @param html [String] The original HTML document as a string.
# @return [Hash] The updated result hash.
#
def self.find_and_add_picture(result, html)
  return result if result.key?("lead_image_url") && !result["lead_image_url"].to_s.strip.empty?

  # 1) Open Graph / Twitter Card meta tags
  meta_nodes = Nokogiri::HTML(html).css('meta[property="og:image"], meta[name="og:image"], meta[name="twitter:image"]')
  meta_image = meta_nodes.map { |node| node['content'] }
                         .find { |value| value && !value.strip.empty? }
  result["image_url"] = meta_image.strip if meta_image

  # 2) first suitable image inside the markdown content
  image_missing = !result.key?("image_url") || result["image_url"].to_s.strip.empty?
  if image_missing && result.key?("markdown_content")
    candidates = result["markdown_content"].scan(/!\[.*?\]\((.*?)\)/).map { |captures| captures[0] }
    md_image = candidates.find do |img_url|
      # only accept URLs that end with a common image file extension
      img_url && !img_url.strip.empty? && img_url =~ /\.(jpg|jpeg|png|gif|webp|svg|tif|avif)(\?.*)?$/i
    end
    result["image_url"] = md_image.strip if md_image
  end

  result
end
|
|
122
|
+
|
|
123
|
+
#
# Post-parser to clean up extracted content after Readability processing.
#
# Runs clean_up_comments over the "content", "text_content", "excerpt" and
# "byline" entries that exist in the hash. When "content" exists, the HTML and
# text are beautified, a markdown version is generated with ReverseMarkdown
# and that markdown is beautified as well.
#
# @param result [Hash] The result hash from Readability parsing.
# @return [Hash] The cleaned result hash.
#
def self.clean_up_and_enrich_result(result)
  %w[content text_content excerpt byline].each do |field|
    result[field] = clean_up_comments(result[field]) if result.key?(field)
  end
  return result unless result.key?("content")

  result = beautify_html_and_text(result)
  result["markdown_content"] = ReverseMarkdown.convert(result["content"]) if result.key?("content")
  beautify_markdown(result)
end
|
|
143
|
+
|
|
144
|
+
#
# Remove/replace comment / artifact noise like <!--[-->, <!----> etc.
#
# Also removes script elements and collapses whitespace between and inside
# tags. The contents of <pre>, <code> and <textarea> blocks are left untouched
# by the whitespace normalization.
#
# @param html [String, nil] The HTML content as a string.
# @return [String, nil] The cleaned HTML content as a string, or nil if nil was given.
#
def self.clean_up_comments(html)
  # Callers pass hash values that may be nil; nil cannot be cleaned, so pass it through.
  return html if html.nil?

  copy = html.dup
  # Turn \x3C before comment start into '<'
  copy.gsub!(/\\x3C(?=!--)/, '<')
  # Decode entity-encoded comment end (--&gt;) to -->
  # NOTE(review): the rendered diff showed a no-op `-->` => `-->` here; the
  # pattern was presumably entity-decoded in the rendering - confirm against the gem source.
  copy.gsub!(/--&gt;/, '-->')
  # Remove fully empty or artifact comments ([], only whitespace)
  copy.gsub!(/<!--\s*[\[\]]*\s*-->/, '')
  # Collapse multiple dummy comment chains
  copy.gsub!(/(?:<!--\s*-->\s*)+/, '')
  # Remove remaining comment artifacts like <!--[-->, <!--]-->
  copy.gsub!(/<!--\[\]-->|<!--\[\s*-->|<!--\]\s*-->/, '')
  # Remove any remaining regular comments
  copy.gsub!(/<!--.*?-->/m, '')
  # Reduce excessive whitespace / blank lines (real newlines)
  copy.gsub!(/\n[ \t]+\n/, "\n")
  copy.gsub!(/\n{3,}/, "\n\n")
  # Remove any remaining script tags (including encoded variants)
  # NOTE(review): the &gt; alternatives are restored on the same assumption as above.
  copy.gsub!(/(?:\\x3C|<)script\b[^>]*?(?:>|\\x3E|&gt;).*?(?:\\x3C|<)\/script(?:>|\\x3E|&gt;)/im, '')

  # Preserve blocks where whitespace/newlines matter by swapping them for placeholders
  preserved = {}
  %w[pre code textarea].each_with_index do |tag, idx|
    copy.gsub!(/<#{tag}[^>]*?>.*?<\/#{tag}>/mi) do |block|
      key = "__PRESERVE_BLOCK_#{tag.upcase}_#{idx}_#{preserved.size}__"
      preserved[key] = block
      key
    end
  end

  # Remove literal backslash+n sequences (if they exist as textual artifacts) outside preserved blocks
  copy.gsub!(/\\n\s*/, ' ')
  # Remove whitespace-only text between tags
  copy.gsub!(/>\s+</, '><')
  # Normalize multiple spaces to a single space
  copy.gsub!(/ {2,}/, ' ')
  # Trim whitespace directly after a tag (e.g., <p> text)
  copy.gsub!(/>\s+([^<])/) { ">#{Regexp.last_match(1)}" }

  # Restore preserved blocks.
  # - Reverse order: a later placeholder may wrap an earlier one (e.g. <code><pre>..</pre></code>).
  # - Block form: backslash sequences inside the block must not be read as replacement escapes.
  preserved.reverse_each { |key, block| copy.sub!(key) { block } }
  copy.strip
end
|
|
192
|
+
|
|
193
|
+
#
# Beautify Markdown content: add the title as a heading if not present, add the
# lead image below the heading if the markdown has no image, and fix link spacing.
#
# The markdown string passed in is not mutated; the beautified copy is written
# back to result["markdown_content"]. A nil markdown leaves result untouched.
#
# @param result [Hash] The result hash from Readability parsing.
# @return [Hash] The beautified result hash.
#
def self.beautify_markdown(result)
  mark_down = result["markdown_content"]
  return result if mark_down.nil?

  # add title to markdown if not present
  title = result["title"].to_s
  if !mark_down.start_with?("# ") && !title.strip.empty? && !mark_down.include?(title)
    mark_down = "# #{title}\n\n#{mark_down}"
  end

  # Check for image and if none is found, add after title if available
  image_url = result["image_url"].to_s.strip
  if !image_url.empty? && !mark_down.match?(/!\[.*?\]\(.*?\)/)
    # NOTE(review): the image literal was missing from the rendered diff (only "\n\n"
    # was left); the alt text mirrors the one used for the HTML output - confirm.
    img_md = "![Lead Image](#{image_url})\n\n"
    # Block form so backslashes in the URL are not treated as replacement escapes.
    # Without a "# " heading line nothing is inserted.
    mark_down = mark_down.sub(/^# .+?\n/) { |heading| heading + img_md }
  end

  # Add a space after markdown links if immediately followed by an alphanumeric char (missing separation).
  # Non-bang gsub: never mutates the caller's (possibly frozen) string.
  mark_down = mark_down.gsub(/(\[[^\]]+\]\((?:[^\)"']+|"[^"]*"|'[^']*')*\))(?=[A-Za-z0-9ÄÖÜäöüß])/, '\1 ')
  result["markdown_content"] = mark_down
  result
end
|
|
218
|
+
|
|
219
|
+
#
# Beautify HTML and text content: add the title if it seems to be missing, add
# the lead image after the first h1 if the HTML has no image, and fix link spacing.
#
# The title is prepended (as <h1> to the HTML and as first line to the text)
# only when result has a non-blank "title" and either
# - the HTML contains no "h1"/"h2" at all, or
# - the first "h1"/"h2" occurs after position 128 and the HTML does not contain the title.
#
# NOTE(review): /h[1-2]/ matches the bare substring "h1"/"h2", not only heading tags.
# NOTE(review): title and image_url are interpolated into markup unescaped;
# consider escaping them since they originate from the parsed page.
#
# @param result [Hash] The result hash from Readability parsing; must contain "content".
# @return [Hash] The result hash with updated "content" and "text_content".
#
def self.beautify_html_and_text(result)
  html = result["content"]
  text = result["text_content"]
  title = result["title"].to_s

  # Add title to html and text if not present. A blank/nil title is never added:
  # it would only produce an empty <h1> (or fail on nil + String).
  heading_idx = html.index(/h[1-2]/)
  title_missing = heading_idx.nil? || (heading_idx > 128 && !html.include?(title))
  if !title.strip.empty? && title_missing
    html = "<h1>#{title}</h1>\n#{html}"
    text = "#{title}\n\n#{text}"
  end

  # Check for image and if none is found, add after title if available
  image_url = result["image_url"].to_s
  unless image_url.strip.empty?
    doc = Nokogiri::HTML(html)
    # check for img tags but also for picture tags
    if doc.css('img, picture').empty?
      h1 = doc.at_css('h1')
      if h1
        img_tag = "<p><img src=\"#{image_url}\" alt=\"Lead Image\"></p>\n"
        h1.add_next_sibling(Nokogiri::HTML::DocumentFragment.parse(img_tag))
        html = doc.to_html
      end
    end
  end

  # Add a space after links if immediately followed by an alphanumeric char (missing separation).
  doc = Nokogiri::HTML(html)
  doc.css('a').each do |link|
    follower = link.next_sibling
    next if follower.nil?

    if follower.text? && follower.content.match?(/\A[A-Za-z0-9ÄÖÜäöüß]/)
      link.add_next_sibling(Nokogiri::XML::Text.new(' ', doc))
    end
  end

  result["content"] = doc.to_html
  result["text_content"] = text
  result
end
|
|
260
|
+
end
|
|
261
|
+
end
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
const { Readability } = require('@mozilla/readability');
|
|
1
|
+
const { Readability, isProbablyReaderable } = require('@mozilla/readability');
|
|
2
2
|
const { JSDOM } = require('jsdom');
|
|
3
3
|
|
|
4
4
|
const doc = new JSDOM("<body>Look at this cat: <img src='./cat.jpg'></body>", {
|
|
@@ -7,4 +7,10 @@ const doc = new JSDOM("<body>Look at this cat: <img src='./cat.jpg'></body>", {
|
|
|
7
7
|
let reader = new Readability(doc.window.document);
|
|
8
8
|
let article = reader.parse();
|
|
9
9
|
|
|
10
|
-
console.log(article);
|
|
10
|
+
console.log(article);
|
|
11
|
+
|
|
12
|
+
if(isProbablyReaderable(doc.window.document)) {
|
|
13
|
+
console.log("This document is probably readerable.");
|
|
14
|
+
} else {
|
|
15
|
+
console.log("This document is probably not readerable.");
|
|
16
|
+
}
|
data/lib/readability_js/nodo.rb
CHANGED
|
@@ -14,15 +14,22 @@ module ReadabilityJs
|
|
|
14
14
|
#
|
|
15
15
|
def self.parse(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
|
|
16
16
|
begin
|
|
17
|
-
|
|
17
|
+
# remove style tags from html, so jsdom does not need to process css and its warnings are not shown
|
|
18
|
+
html = html.gsub(/<style[^>]*>.*?<\/style>/m, '')
|
|
19
|
+
self.new.parse html, url, debug, max_elems_to_parse, nb_top_candidates, char_threshold, classes_to_preserve, keep_classes, disable_json_ld, serializer, allow_video_regex, link_density_modifier
|
|
18
20
|
rescue ::Nodo::JavaScriptError => e
|
|
19
21
|
raise ReadabilityJs::Error.new "#{e.message}"
|
|
20
22
|
end
|
|
21
23
|
end
|
|
22
24
|
|
|
23
|
-
|
|
25
|
+
#
|
|
26
|
+
# instance wrapper method, as nodo does not support class methods
|
|
27
|
+
#
|
|
28
|
+
def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: nil)
|
|
24
29
|
begin
|
|
25
|
-
|
|
30
|
+
# remove style tags from html, so jsdom does not need to process css and its warnings are not shown
|
|
31
|
+
html = html.gsub(/<style[^>]*>.*?<\/style>/m, '')
|
|
32
|
+
self.new.is_probably_readerable html, min_content_length, min_score, visibility_checker
|
|
26
33
|
rescue ::Nodo::JavaScriptError => e
|
|
27
34
|
raise ReadabilityJs::Error.new "#{e.message}"
|
|
28
35
|
end
|
|
@@ -57,9 +64,16 @@ module ReadabilityJs
|
|
|
57
64
|
JS
|
|
58
65
|
|
|
59
66
|
function :is_probably_readerable, <<~JS
|
|
60
|
-
async (html) => {
|
|
67
|
+
async (html, minContentLength, minScore, visibilityChecker) => {
|
|
61
68
|
const doc = new jsdom.JSDOM(html);
|
|
62
|
-
|
|
69
|
+
|
|
70
|
+
let readability_options = {};
|
|
71
|
+
if(minContentLength !== undefined && minContentLength !== null) readability_options.minContentLength = minContentLength;
|
|
72
|
+
if(minScore !== undefined && minScore !== null) readability_options.minScore = minScore;
|
|
73
|
+
if(visibilityChecker !== undefined && visibilityChecker !== null) {
|
|
74
|
+
readability_options.visibilityChecker = eval(visibilityChecker);
|
|
75
|
+
}
|
|
76
|
+
return readability.isProbablyReaderable(doc.window.document, readability_options);
|
|
63
77
|
}
|
|
64
78
|
JS
|
|
65
79
|
|
data/lib/readability_js.rb
CHANGED
|
@@ -6,6 +6,7 @@ require 'nokogiri'
|
|
|
6
6
|
|
|
7
7
|
require_relative 'readability_js/version'
|
|
8
8
|
require_relative 'readability_js/nodo'
|
|
9
|
+
require_relative 'readability_js/extended'
|
|
9
10
|
|
|
10
11
|
require_relative 'custom_errors/error'
|
|
11
12
|
|
|
@@ -15,56 +16,27 @@ require_relative 'custom_errors/error'
|
|
|
15
16
|
|
|
16
17
|
module ReadabilityJs
|
|
17
18
|
|
|
18
|
-
SELECTOR_BLACKLIST = [
|
|
19
|
-
".Article-Partner",
|
|
20
|
-
".Article-Partner-Text",
|
|
21
|
-
".Article-Comments-Button",
|
|
22
|
-
"#isl-5-AdCarousel",
|
|
23
|
-
"#isl-10-ArticleComments",
|
|
24
|
-
"*[data-element-tracking-name]",
|
|
25
|
-
"*[aria-label='Anzeige']",
|
|
26
|
-
"nav[aria-label='breadcrumb']",
|
|
27
|
-
# heise
|
|
28
|
-
"a-video",
|
|
29
|
-
"a-gift",
|
|
30
|
-
"a-collapse",
|
|
31
|
-
"a-opt-in",
|
|
32
|
-
# spiegel
|
|
33
|
-
"[data-area='related_articles']",
|
|
34
|
-
# welt
|
|
35
|
-
"nav[aria-label='Breadcrumb']",
|
|
36
|
-
".c-inline-teaser-list",
|
|
37
|
-
# golem
|
|
38
|
-
".go-alink-list",
|
|
39
|
-
# faz
|
|
40
|
-
"[data-external-selector='related-articles-entries']",
|
|
41
|
-
".BigBox",
|
|
42
|
-
# frankfurter rundschau
|
|
43
|
-
".id-Breadcrumb-item",
|
|
44
|
-
".id-Story-interactionBar",
|
|
45
|
-
"revenue-reel",
|
|
46
|
-
".id-StoryElement-factBox",
|
|
47
|
-
# stern
|
|
48
|
-
".breadcrumb",
|
|
49
|
-
".teaser",
|
|
50
|
-
".group-teaserblock__items",
|
|
51
|
-
".title__kicker",
|
|
52
|
-
# taz
|
|
53
|
-
"[data-for='webelement_bio']",
|
|
54
|
-
"[data-for='webelement_citation']",
|
|
55
|
-
"#articleTeaser",
|
|
56
|
-
".article-produktteaser-container",
|
|
57
|
-
"[x-data='{}']",
|
|
58
|
-
"#komune",
|
|
59
|
-
".community",
|
|
60
|
-
]
|
|
61
|
-
|
|
62
19
|
#
|
|
63
20
|
# Parse a HTML document and extract its main content using Mozilla's Readability library.
|
|
64
|
-
# Raises ReadabilityJs::Error on failure.
|
|
65
21
|
#
|
|
66
22
|
# 'html' is a required parameter, all others are optional.
|
|
67
23
|
#
|
|
24
|
+
# @param html [String] The HTML document as a string.
|
|
25
|
+
# @param url [String, nil] The URL of the document (optional, used for resolving relative links).
|
|
26
|
+
# @param debug [Boolean] Enable debug mode (default: false).
|
|
27
|
+
# @param max_elems_to_parse [Integer] Maximum number of elements to parse (default: 0, meaning no limit).
|
|
28
|
+
# @param nb_top_candidates [Integer] Number of top candidates to consider (default: 5).
|
|
29
|
+
# @param char_threshold [Integer] Minimum number of characters for an element to be considered (default: 500).
|
|
30
|
+
# @param classes_to_preserve [Array<String>] List of CSS classes to preserve in the output (default: []).
|
|
31
|
+
# @param keep_classes [Boolean] Whether to keep the original classes in the output (default: false).
|
|
32
|
+
# @param disable_json_ld [Boolean] Disable JSON-LD parsing (default: false).
|
|
33
|
+
# @param serializer [String, nil] Serializer to use for output (optional).
|
|
34
|
+
# @param allow_video_regex [String, nil] Regular expression to allow video URLs (optional).
|
|
35
|
+
# @param link_density_modifier [Float] Modifier for link density calculation (default: 0).
|
|
36
|
+
# @return [Hash] A hash containing the extracted content and metadata.
|
|
37
|
+
#
|
|
38
|
+
# @raise [ReadabilityJs::Error] if an error occurs during execution
|
|
39
|
+
#
|
|
68
40
|
def self.parse(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
|
|
69
41
|
begin
|
|
70
42
|
result = ReadabilityJs::Nodo.parse(html, url: url, debug: debug, max_elems_to_parse: max_elems_to_parse, nb_top_candidates: nb_top_candidates, char_threshold: char_threshold, classes_to_preserve: classes_to_preserve, keep_classes: keep_classes, disable_json_ld: disable_json_ld, serializer: serializer, allow_video_regex: allow_video_regex, link_density_modifier: link_density_modifier)
|
|
@@ -74,13 +46,60 @@ module ReadabilityJs
|
|
|
74
46
|
end
|
|
75
47
|
end
|
|
76
48
|
|
|
49
|
+
#
|
|
50
|
+
# Like #parse but with additional pre- and post-processing to enhance content extraction.
|
|
51
|
+
#
|
|
52
|
+
# 'html' is a required parameter, all others are optional.
|
|
53
|
+
#
|
|
54
|
+
# @param html [String] The HTML document as a string.
|
|
55
|
+
# @param url [String, nil] The URL of the document (optional, used for resolving relative links).
|
|
56
|
+
# @param debug [Boolean] Enable debug mode (default: false).
|
|
57
|
+
# @param max_elems_to_parse [Integer] Maximum number of elements to parse (default: 0, meaning no limit).
|
|
58
|
+
# @param nb_top_candidates [Integer] Number of top candidates to consider (default: 5).
|
|
59
|
+
# @param char_threshold [Integer] Minimum number of characters for an element to be considered (default: 500).
|
|
60
|
+
# @param classes_to_preserve [Array<String>] List of CSS classes to preserve in the output (default: []).
|
|
61
|
+
# @param keep_classes [Boolean] Whether to keep the original classes in the output (default: false).
|
|
62
|
+
# @param disable_json_ld [Boolean] Disable JSON-LD parsing (default: false).
|
|
63
|
+
# @param serializer [String, nil] Serializer to use for output (optional).
|
|
64
|
+
# @param allow_video_regex [String, nil] Regular expression to allow video URLs (optional).
|
|
65
|
+
# @param link_density_modifier [Float] Modifier for link density calculation (default: 0).
|
|
66
|
+
# @return [Hash] A hash containing the extracted content and metadata.
|
|
67
|
+
#
|
|
68
|
+
# @raise [ReadabilityJs::Error] if an error occurs during execution
|
|
69
|
+
#
|
|
77
70
|
def self.parse_extended(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
|
|
78
|
-
result =
|
|
71
|
+
result = Extended::before_cleanup html
|
|
79
72
|
result = parse result, url: url, debug: debug, max_elems_to_parse: max_elems_to_parse, nb_top_candidates: nb_top_candidates, char_threshold: char_threshold, classes_to_preserve: classes_to_preserve, keep_classes: keep_classes, disable_json_ld: disable_json_ld, serializer: serializer, allow_video_regex: allow_video_regex, link_density_modifier: link_density_modifier
|
|
80
|
-
|
|
73
|
+
Extended::after_cleanup result, html
|
|
81
74
|
end
|
|
82
75
|
|
|
83
|
-
|
|
76
|
+
#
|
|
77
|
+
# Decides whether a document is probably readerable without parsing the whole document.
|
|
78
|
+
#
|
|
79
|
+
# Only 'html' is a required parameter, all others are optional.
|
|
80
|
+
#
|
|
81
|
+
# @param html [String] The HTML document as a string.
|
|
82
|
+
# @param min_content_length [Integer] Minimum content length to consider the document readerable
|
|
83
|
+
# @param min_score [Integer] Minimum score to consider the document readerable
|
|
84
|
+
# @param visibility_checker [String] anonymous JavaScript function definition to check node visibility as string. Uses default visibility checker if not provided.
|
|
85
|
+
# @return [Boolean] true if the document is probably readerable, false otherwise.
|
|
86
|
+
#
|
|
87
|
+
# @raise [ReadabilityJs::Error] if an error occurs during execution
|
|
88
|
+
#
|
|
89
|
+
# @example
|
|
90
|
+
#
|
|
91
|
+
# html = "<html>...</html>"
|
|
92
|
+
#
|
|
93
|
+
# visibility_checker = <<~JS
|
|
94
|
+
# (node) => {
|
|
95
|
+
# const style = node.ownerDocument.defaultView.getComputedStyle(node);
|
|
96
|
+
# return (style && style.display !== 'none' && style.visibility !== 'hidden' && parseFloat(style.opacity) > 0);
|
|
97
|
+
# }
|
|
98
|
+
# JS
|
|
99
|
+
#
|
|
100
|
+
# ReadabilityJs.is_probably_readerable(html, min_content_length: 200, min_score: 25, visibility_checker: visibility_checker)
|
|
101
|
+
#
|
|
102
|
+
def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: nil)
|
|
84
103
|
begin
|
|
85
104
|
ReadabilityJs::Nodo.is_probably_readerable(html, min_content_length: min_content_length, min_score: min_score, visibility_checker: visibility_checker)
|
|
86
105
|
rescue => e
|
|
@@ -88,12 +107,45 @@ module ReadabilityJs
|
|
|
88
107
|
end
|
|
89
108
|
end
|
|
90
109
|
|
|
91
|
-
|
|
92
|
-
|
|
110
|
+
|
|
111
|
+
#
|
|
112
|
+
# Decides whether a document is probably readerable without parsing the whole document.
|
|
113
|
+
#
|
|
114
|
+
# Only 'html' is a required parameter, all others are optional.
|
|
115
|
+
#
|
|
116
|
+
# @param html [String] The HTML document as a string.
|
|
117
|
+
# @param min_content_length [Integer] Minimum content length to consider the document readerable
|
|
118
|
+
# @param min_score [Integer] Minimum score to consider the document readerable
|
|
119
|
+
# @param visibility_checker [String] anonymous JavaScript function definition to check node visibility as string. Uses default visibility checker if not provided.
|
|
120
|
+
# @return [Boolean] true if the document is probably readerable, false otherwise.
|
|
121
|
+
#
|
|
122
|
+
# @raise [ReadabilityJs::Error] if an error occurs during execution
|
|
123
|
+
#
|
|
124
|
+
# @example
|
|
125
|
+
#
|
|
126
|
+
# html = "<html>...</html>"
|
|
127
|
+
#
|
|
128
|
+
# visibility_checker = <<~JS
|
|
129
|
+
# (node) => {
|
|
130
|
+
# const style = node.ownerDocument.defaultView.getComputedStyle(node);
|
|
131
|
+
# return (style && style.display !== 'none' && style.visibility !== 'hidden' && parseFloat(style.opacity) > 0);
|
|
132
|
+
# }
|
|
133
|
+
# JS
|
|
134
|
+
#
|
|
135
|
+
# ReadabilityJs.probably_readerable?(html, min_content_length: 200, min_score: 25, visibility_checker: visibility_checker)
|
|
136
|
+
#
|
|
137
|
+
def self.probably_readerable?(html, min_content_length: 140, min_score: 20, visibility_checker: nil)
|
|
138
|
+
self.is_probably_readerable(html, min_content_length: min_content_length, min_score: min_score, visibility_checker: visibility_checker)
|
|
93
139
|
end
|
|
94
140
|
|
|
95
141
|
private
|
|
96
142
|
|
|
143
|
+
#
|
|
144
|
+
# Normalize result keys to snake_case for ruby style
|
|
145
|
+
#
|
|
146
|
+
# @param result [Hash] The result hash from Readability
|
|
147
|
+
# @return [Hash] The normalized result hash
|
|
148
|
+
#
|
|
97
149
|
def self.normalize_result(result)
|
|
98
150
|
result["text_content"] = result.delete("textContent") if result.key?("textContent")
|
|
99
151
|
result["site_name"] = result.delete("siteName") if result.key?("siteName")
|
|
@@ -101,108 +153,5 @@ module ReadabilityJs
|
|
|
101
153
|
result
|
|
102
154
|
end
|
|
103
155
|
|
|
104
|
-
def self.clean_up_result(result)
|
|
105
|
-
result["content"] = clean_up_comments(result["content"]) if result.key?("content")
|
|
106
|
-
result["text_content"] = clean_up_comments(result["text_content"]) if result.key?("text_content")
|
|
107
|
-
result["excerpt"] = clean_up_comments(result["excerpt"]) if result.key?("excerpt")
|
|
108
|
-
result["byline"] = clean_up_comments(result["byline"]) if result.key?("byline")
|
|
109
|
-
if result.key?("content")
|
|
110
|
-
result["content"] = beautify_html(result["content"])
|
|
111
|
-
result["markdown_content"] = ReverseMarkdown.convert(result["content"]) if result.key?("content")
|
|
112
|
-
result = beautify_markdown(result)
|
|
113
|
-
end
|
|
114
|
-
result
|
|
115
|
-
end
|
|
116
|
-
|
|
117
|
-
# Replaces comment / artifact noise like <!--[-->, <!----> etc.
|
|
118
|
-
def self.clean_up_comments(html)
|
|
119
|
-
copy = html.dup
|
|
120
|
-
|
|
121
|
-
# Turn \x3C before comment start into '<'
|
|
122
|
-
copy.gsub!(/\\x3C(?=!--)/, '<')
|
|
123
|
-
|
|
124
|
-
# Decode encoded comment end --> to -->
|
|
125
|
-
copy.gsub!(/-->/, '-->')
|
|
126
|
-
|
|
127
|
-
# Remove fully empty or artifact comments ([], only whitespace)
|
|
128
|
-
copy.gsub!(/<!--\s*(?:\[|\]|)*\s*-->/, '')
|
|
129
|
-
|
|
130
|
-
# Collapse multiple dummy comment chains
|
|
131
|
-
copy.gsub!(/(?:<!--\s*-->\s*)+/, '')
|
|
132
|
-
|
|
133
|
-
# Remove remaining comment artifacts like <!--[-->, <!--]-->
|
|
134
|
-
copy.gsub!(/<!--\[\]-->|<!--\[\s*-->|<!--\]\s*-->/, '')
|
|
135
|
-
|
|
136
|
-
# Remove any remaining regular comments
|
|
137
|
-
copy.gsub!(/<!--.*?-->/m, '')
|
|
138
|
-
|
|
139
|
-
# Reduce excessive whitespace / blank lines (real newlines)
|
|
140
|
-
copy.gsub!(/\n[ \t]+\n/, "\n")
|
|
141
|
-
copy.gsub!(/\n{3,}/, "\n\n")
|
|
142
|
-
|
|
143
|
-
# Remove any remaining script tags (including encoded variants)
|
|
144
|
-
copy.gsub!(/(?:\\x3C|<)script\b[^>]*?(?:>|\\x3E|>).*?(?:\\x3C|<)\/script(?:>|\\x3E|>)/im, '')
|
|
145
|
-
|
|
146
|
-
# Preserve blocks where whitespace/newlines matter
|
|
147
|
-
preserve_tags = %w[pre code textarea]
|
|
148
|
-
preserved = {}
|
|
149
|
-
preserve_tags.each_with_index do |tag, idx|
|
|
150
|
-
copy.scan(/<#{tag}[^>]*?>.*?<\/#{tag}>/mi).each do |block|
|
|
151
|
-
key = "__PRESERVE_BLOCK_#{tag.upcase}_#{idx}_#{preserved.size}__"
|
|
152
|
-
preserved[key] = block
|
|
153
|
-
copy.sub!(block, key)
|
|
154
|
-
end
|
|
155
|
-
end
|
|
156
|
-
|
|
157
|
-
# Remove literal backslash+n sequences (if they exist as textual artifacts) outside preserved blocks
|
|
158
|
-
copy.gsub!(/\\n\s*/, ' ')
|
|
159
|
-
|
|
160
|
-
# Collapse whitespace between tags to a single space or nothing
|
|
161
|
-
# Remove whitespace-only text nodes represented by spaces/newlines between tags
|
|
162
|
-
copy.gsub!(/>\s+</, '><')
|
|
163
|
-
|
|
164
|
-
# Normalize multiple spaces to a single space
|
|
165
|
-
copy.gsub!(/ {2,}/, ' ')
|
|
166
|
-
|
|
167
|
-
# Trim spaces directly inside tags (e.g., <p> text </p>)
|
|
168
|
-
copy.gsub!(/>\s+([^<])/) { ">#{$1}" }
|
|
169
|
-
|
|
170
|
-
# Restore preserved blocks
|
|
171
|
-
preserved.each { |k, v| copy.sub!(k, v) }
|
|
172
|
-
copy.strip
|
|
173
|
-
end
|
|
174
|
-
|
|
175
|
-
def self.beautify_markdown(result)
|
|
176
|
-
mark_down = result["markdown_content"]
|
|
177
|
-
# add title to markdown if not present
|
|
178
|
-
if !mark_down.start_with?("# ") && result.key?("title") && !result["title"].to_s.strip.empty? && !mark_down.include?(result["title"])
|
|
179
|
-
mark_down = "# #{result['title']}\n\n" + mark_down
|
|
180
|
-
end
|
|
181
|
-
# Add a space after markdown links if immediately followed by an alphanumeric char (missing separation).
|
|
182
|
-
mark_down.gsub!(/(\[[^\]]+\]\((?:[^\)"']+|"[^"]*"|'[^']*')*\))(?=[A-Za-z0-9ÄÖÜäöüß])/, '\1 ')
|
|
183
|
-
result["markdown_content"] = mark_down
|
|
184
|
-
result
|
|
185
|
-
end
|
|
186
|
-
|
|
187
|
-
def self.beautify_html(html)
|
|
188
|
-
doc = Nokogiri::HTML(html)
|
|
189
|
-
# Add a space after a links if immediately followed by an alphanumeric char (missing separation).
|
|
190
|
-
doc.css('a').each do |link|
|
|
191
|
-
next if link.next_sibling.nil?
|
|
192
|
-
if link.next_sibling.text? && link.next_sibling.content =~ /\A[A-Za-z0-9ÄÖÜäöüß]/
|
|
193
|
-
link.add_next_sibling(Nokogiri::XML::Text.new(' ', doc))
|
|
194
|
-
end
|
|
195
|
-
end
|
|
196
|
-
doc.to_html
|
|
197
|
-
end
|
|
198
|
-
|
|
199
|
-
def self.pre_parser(html)
|
|
200
|
-
doc = Nokogiri::HTML(html)
|
|
201
|
-
# Remove blacklisted classes
|
|
202
|
-
SELECTOR_BLACKLIST.each do |classname|
|
|
203
|
-
doc.css("#{classname}").remove
|
|
204
|
-
end
|
|
205
|
-
doc.to_html
|
|
206
|
-
end
|
|
207
156
|
|
|
208
157
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: readability_js
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Matthäus Beyrle
|
|
@@ -145,6 +145,7 @@ executables: []
|
|
|
145
145
|
extensions: []
|
|
146
146
|
extra_rdoc_files: []
|
|
147
147
|
files:
|
|
148
|
+
- ".gitattributes"
|
|
148
149
|
- ".gitignore"
|
|
149
150
|
- ".rspec"
|
|
150
151
|
- CHANGELOG.md
|
|
@@ -158,6 +159,7 @@ files:
|
|
|
158
159
|
- cli/pry.rb
|
|
159
160
|
- lib/custom_errors/error.rb
|
|
160
161
|
- lib/readability_js.rb
|
|
162
|
+
- lib/readability_js/extended.rb
|
|
161
163
|
- lib/readability_js/node/node_modules/.bin/tldts
|
|
162
164
|
- lib/readability_js/node/node_modules/.yarn-integrity
|
|
163
165
|
- lib/readability_js/node/node_modules/@asamuzakjp/css-color/LICENSE
|