readability_js 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -2
- data/lib/readability_js/extended.rb +11 -15
- data/lib/readability_js/version.rb +1 -1
- data/lib/readability_js.rb +3 -2
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ff2c212926fdd4859f0a70a538ad28f3bc4b760b0d022b0e77f61bc21b40e859
|
|
4
|
+
data.tar.gz: 99ea17969d31e4de8d2c1718693b1c43c254ec8099a4ec85c5ba80342c80fda1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7050f66c9af71d4b420a49a322b9be44afa9777ea500c59f1bda10f350ca39806c572de0645e563d8f1d18d4ad06955c43f94c6907e8af79045184f3e5bd6112
|
|
7
|
+
data.tar.gz: 32520125bda871a44601ee83eaa653e62ef9c4104c8ccc3d1dbc0a2624c09304e0a4e63bf5e29672fcb8c6f8035081696640a3b7bac020ad627049d37c772dad
|
data/README.md
CHANGED
|
@@ -64,7 +64,9 @@ and includes a beautified markdown version of the content.
|
|
|
64
64
|
```ruby
|
|
65
65
|
require 'readability_js'
|
|
66
66
|
html = File.read("my_article.html")
|
|
67
|
-
|
|
67
|
+
# parse_extended ships with a DEFAULT_SELECTOR_BLACKLIST, and you can add your own selectors to it as well,
|
|
68
|
+
# which are used to remove unwanted elements from the content before any parsing takes place.
|
|
69
|
+
result = ReadabilityJs.parse_extended(html, blacklist_selectors: [".advertisement", "#sponsored"])
|
|
68
70
|
p result
|
|
69
71
|
```
|
|
70
72
|
|
|
@@ -81,7 +83,7 @@ data = ReadabilityJs.parse(
|
|
|
81
83
|
# => Hash
|
|
82
84
|
```
|
|
83
85
|
|
|
84
|
-
###
|
|
86
|
+
### Parse response
|
|
85
87
|
The response object is of type `Hash`.
|
|
86
88
|
It contains the data returned by readability, with hash keys transformed in snake_case.
|
|
87
89
|
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
|
|
2
1
|
module ReadabilityJs
|
|
3
2
|
class Extended
|
|
4
3
|
|
|
5
|
-
|
|
4
|
+
DEFAULT_SELECTOR_BLACKLIST = [
|
|
6
5
|
".Article-Partner",
|
|
7
6
|
".Article-Partner-Text",
|
|
8
7
|
".Article-Comments-Button",
|
|
@@ -11,34 +10,26 @@ module ReadabilityJs
|
|
|
11
10
|
"*[data-element-tracking-name]",
|
|
12
11
|
"*[aria-label='Anzeige']",
|
|
13
12
|
"nav[aria-label='breadcrumb']",
|
|
14
|
-
# heise
|
|
15
13
|
"a-video",
|
|
16
14
|
"a-gift",
|
|
17
15
|
"a-collapse",
|
|
18
16
|
"a-opt-in",
|
|
19
|
-
# spiegel
|
|
20
17
|
"[data-area='related_articles']",
|
|
21
|
-
# welt
|
|
22
18
|
"nav[aria-label='Breadcrumb']",
|
|
23
19
|
".c-inline-teaser-list",
|
|
24
20
|
"[width='1'][height='1']",
|
|
25
|
-
# golem
|
|
26
21
|
".go-alink-list",
|
|
27
|
-
# faz
|
|
28
22
|
"[data-external-selector='related-articles-entries']",
|
|
29
23
|
".BigBox",
|
|
30
|
-
# frankfurter rundschau
|
|
31
24
|
".id-Breadcrumb-item",
|
|
32
25
|
".id-Story-interactionBar",
|
|
33
26
|
"revenue-reel",
|
|
34
27
|
".id-StoryElement-factBox",
|
|
35
|
-
# stern
|
|
36
28
|
".breadcrumb",
|
|
37
29
|
".teaser",
|
|
38
30
|
".group-teaserblock__items",
|
|
39
31
|
".title__kicker",
|
|
40
32
|
"ws-adtag",
|
|
41
|
-
# taz
|
|
42
33
|
"[data-for='webelement_bio']",
|
|
43
34
|
"[data-for='webelement_citation']",
|
|
44
35
|
"#articleTeaser",
|
|
@@ -46,10 +37,13 @@ module ReadabilityJs
|
|
|
46
37
|
"[x-data='{}']",
|
|
47
38
|
"#komune",
|
|
48
39
|
".community",
|
|
40
|
+
".article-head__topline",
|
|
41
|
+
".article__audioicon",
|
|
42
|
+
".auplayer",
|
|
49
43
|
]
|
|
50
44
|
|
|
51
|
-
def self.before_cleanup(html)
|
|
52
|
-
pre_parser html
|
|
45
|
+
def self.before_cleanup(html, blacklist_selectors: [])
|
|
46
|
+
pre_parser html, blacklist_selectors: blacklist_selectors
|
|
53
47
|
end
|
|
54
48
|
|
|
55
49
|
def self.after_cleanup(result, html)
|
|
@@ -62,16 +56,18 @@ module ReadabilityJs
|
|
|
62
56
|
#
|
|
63
57
|
# Pre-parser to clean up HTML before passing it to Readability
|
|
64
58
|
#
|
|
65
|
-
#
|
|
59
|
+
# DEFAULT_SELECTOR_BLACKLIST and the given blacklist_selectors contain CSS selectors of elements to be removed from the HTML
|
|
66
60
|
# before parsing to improve content extraction.
|
|
67
61
|
#
|
|
68
62
|
# @param html [String] The HTML document as a string.
|
|
69
63
|
# @return [String] The cleaned HTML document as a string.
|
|
70
64
|
#
|
|
71
|
-
def self.pre_parser(html)
|
|
65
|
+
def self.pre_parser(html, blacklist_selectors: [])
|
|
66
|
+
final_blacklist = DEFAULT_SELECTOR_BLACKLIST
|
|
67
|
+
final_blacklist += blacklist_selectors if blacklist_selectors.is_a?(Array) && !blacklist_selectors.empty?
|
|
72
68
|
doc = Nokogiri::HTML(html)
|
|
73
69
|
# Remove blacklisted elements by selector
|
|
74
|
-
|
|
70
|
+
final_blacklist.each do |classname|
|
|
75
71
|
doc.css("#{classname}").remove
|
|
76
72
|
end
|
|
77
73
|
doc.to_html
|
data/lib/readability_js.rb
CHANGED
|
@@ -63,12 +63,13 @@ module ReadabilityJs
|
|
|
63
63
|
# @param serializer [String, nil] Serializer to use for output (optional).
|
|
64
64
|
# @param allow_video_regex [String, nil] Regular expression to allow video URLs (optional).
|
|
65
65
|
# @param link_density_modifier [Float] Modifier for link density calculation (default: 0).
|
|
66
|
+
# @param blacklist_selectors [Array<String>] Additional CSS selectors; matching elements are removed from the HTML before parsing (default: []).
|
|
66
67
|
# @return [Hash] A hash containing the extracted content and metadata.
|
|
67
68
|
#
|
|
68
69
|
# @raise [ReadabilityJs::Error] if an error occurs during execution
|
|
69
70
|
#
|
|
70
|
-
def self.parse_extended(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
|
|
71
|
-
result = Extended::before_cleanup html
|
|
71
|
+
def self.parse_extended(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0, blacklist_selectors: [])
|
|
72
|
+
result = Extended::before_cleanup html, blacklist_selectors: blacklist_selectors
|
|
72
73
|
result = parse result, url: url, debug: debug, max_elems_to_parse: max_elems_to_parse, nb_top_candidates: nb_top_candidates, char_threshold: char_threshold, classes_to_preserve: classes_to_preserve, keep_classes: keep_classes, disable_json_ld: disable_json_ld, serializer: serializer, allow_video_regex: allow_video_regex, link_density_modifier: link_density_modifier
|
|
73
74
|
Extended::after_cleanup result, html
|
|
74
75
|
end
|