readability_js 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9a5faf9c156d80158ba2ee471bb3d7ec102396c3c03ca5472162211706ac74d9
4
- data.tar.gz: 16a0a329c137ecfbae089501b24f4dea0cc00395d62d5bedc123c40757018e27
3
+ metadata.gz: ff2c212926fdd4859f0a70a538ad28f3bc4b760b0d022b0e77f61bc21b40e859
4
+ data.tar.gz: 99ea17969d31e4de8d2c1718693b1c43c254ec8099a4ec85c5ba80342c80fda1
5
5
  SHA512:
6
- metadata.gz: 0f3b5aa2bb14ee235f3aad9b884cdeeb801e28ea18048d33d7a2c50921e89b33210df375e7d67607f86c33c23a0c1cfa3b82fd8420951ea5008534164eede0a8
7
- data.tar.gz: ae1fe7c1decc638e7da359f16c8178a6ce8f02a2632bef7b5e3e4542bc7de17b52bba16bce4dbb70692a4db87c46728c1943f13b6134c5a0b101d458fc7baf78
6
+ metadata.gz: 7050f66c9af71d4b420a49a322b9be44afa9777ea500c59f1bda10f350ca39806c572de0645e563d8f1d18d4ad06955c43f94c6907e8af79045184f3e5bd6112
7
+ data.tar.gz: 32520125bda871a44601ee83eaa653e62ef9c4104c8ccc3d1dbc0a2624c09304e0a4e63bf5e29672fcb8c6f8035081696640a3b7bac020ad627049d37c772dad
data/README.md CHANGED
@@ -64,7 +64,9 @@ and includes a beautified markdown version of the content.
64
64
  ```ruby
65
65
  require 'readability_js'
66
66
  html = File.read("my_article.html")
67
- result = ReadabilityJs.parse_extended(html)
67
+ # parse_extended applies a built-in DEFAULT_SELECTOR_BLACKLIST; you can pass additional CSS selectors via blacklist_selectors,
68
+ # and all elements matching either list are removed from the HTML before it is parsed.
69
+ result = ReadabilityJs.parse_extended(html, blacklist_selectors: [".advertisement", "#sponsored"])
68
70
  p result
69
71
  ```
70
72
 
@@ -81,7 +83,7 @@ data = ReadabilityJs.parse(
81
83
  # => Hash
82
84
  ```
83
85
 
84
- ### Query response
86
+ ### Parse response
85
87
  The response object is of type `Hash`.
86
88
  It contains the data returned by readability, with hash keys transformed in snake_case.
87
89
 
@@ -1,8 +1,7 @@
1
-
2
1
  module ReadabilityJs
3
2
  class Extended
4
3
 
5
- SELECTOR_BLACKLIST = [
4
+ DEFAULT_SELECTOR_BLACKLIST = [
6
5
  ".Article-Partner",
7
6
  ".Article-Partner-Text",
8
7
  ".Article-Comments-Button",
@@ -11,34 +10,26 @@ module ReadabilityJs
11
10
  "*[data-element-tracking-name]",
12
11
  "*[aria-label='Anzeige']",
13
12
  "nav[aria-label='breadcrumb']",
14
- # heise
15
13
  "a-video",
16
14
  "a-gift",
17
15
  "a-collapse",
18
16
  "a-opt-in",
19
- # spiegel
20
17
  "[data-area='related_articles']",
21
- # welt
22
18
  "nav[aria-label='Breadcrumb']",
23
19
  ".c-inline-teaser-list",
24
20
  "[width='1'][height='1']",
25
- # golem
26
21
  ".go-alink-list",
27
- # faz
28
22
  "[data-external-selector='related-articles-entries']",
29
23
  ".BigBox",
30
- # frankfurter rundschau
31
24
  ".id-Breadcrumb-item",
32
25
  ".id-Story-interactionBar",
33
26
  "revenue-reel",
34
27
  ".id-StoryElement-factBox",
35
- # stern
36
28
  ".breadcrumb",
37
29
  ".teaser",
38
30
  ".group-teaserblock__items",
39
31
  ".title__kicker",
40
32
  "ws-adtag",
41
- # taz
42
33
  "[data-for='webelement_bio']",
43
34
  "[data-for='webelement_citation']",
44
35
  "#articleTeaser",
@@ -46,10 +37,13 @@ module ReadabilityJs
46
37
  "[x-data='{}']",
47
38
  "#komune",
48
39
  ".community",
40
+ ".article-head__topline",
41
+ ".article__audioicon",
42
+ ".auplayer",
49
43
  ]
50
44
 
51
- def self.before_cleanup(html)
52
- pre_parser html
45
+ def self.before_cleanup(html, blacklist_selectors: [])
46
+ pre_parser html, blacklist_selectors: blacklist_selectors
53
47
  end
54
48
 
55
49
  def self.after_cleanup(result, html)
@@ -62,16 +56,18 @@ module ReadabilityJs
62
56
  #
63
57
  # Pre-parser to clean up HTML before passing it to Readability
64
58
  #
65
- # SELECTOR_BLACKLIST contains CSS selectors of elements to be removed from the HTML
59
+ # DEFAULT_SELECTOR_BLACKLIST and the given blacklist_selectors contain CSS selectors of elements to be removed from the HTML
66
60
  # before parsing to improve content extraction.
67
61
  #
68
62
  # @param html [String] The HTML document as a string.
69
63
  # @return [String] The cleaned HTML document as a string.
70
64
  #
71
- def self.pre_parser(html)
65
+ def self.pre_parser(html, blacklist_selectors: [])
66
+ final_blacklist = DEFAULT_SELECTOR_BLACKLIST
67
+ final_blacklist += blacklist_selectors if blacklist_selectors.is_a?(Array) && !blacklist_selectors.empty?
72
68
  doc = Nokogiri::HTML(html)
73
69
  # Remove blacklisted elements by selector
74
- SELECTOR_BLACKLIST.each do |classname|
70
+ final_blacklist.each do |classname|
75
71
  doc.css("#{classname}").remove
76
72
  end
77
73
  doc.to_html
@@ -148,7 +144,7 @@ module ReadabilityJs
148
144
  # @return [String] The cleaned HTML content as a string.
149
145
  #
150
146
  def self.clean_up_comments(html)
151
- copy = html.dup.to_s
147
+ copy = html.dup || ""
152
148
  # Turn \x3C before comment start into '<'
153
149
  copy.gsub!(/\\x3C(?=!--)/, '<')
154
150
  # Decode encoded comment end --&gt; to -->
@@ -204,9 +200,9 @@ module ReadabilityJs
204
200
  end
205
201
  # Check for image and if none is found, add after title if available
206
202
  if result.key?("image_url") && !result["image_url"].to_s.strip.empty?
207
- has_image = mark_down.match(/!\[.*?\]\(.*?\)/)
203
+ has_image = mark_down.match(/!\[.*?\]\(.*?\)/) || mark_down.match(/<img\b[^>]*>/) || mark_down.match(/<picture\b[^>]*>.*?<\/picture>/m)
208
204
  if !has_image
209
- img_md = "![Lead Image](#{result['image_url']})\n\n"
205
+ img_md = "![image](#{result['image_url']})\n\n"
210
206
  mark_down = mark_down.sub(/^# .+?\n/, "\\0" + img_md)
211
207
  end
212
208
  end
@@ -237,7 +233,7 @@ module ReadabilityJs
237
233
  # check for img tags but also for picture tags
238
234
  has_image = !doc.css('img, picture').empty?
239
235
  if !has_image
240
- img_tag = "<p><img src=\"#{result['image_url']}\" alt=\"Lead Image\"></p>\n"
236
+ img_tag = "<p><img src=\"#{result['image_url']}\"></p>\n"
241
237
  h1 = doc.at_css('h1')
242
238
  if h1
243
239
  h1.add_next_sibling(Nokogiri::HTML::DocumentFragment.parse(img_tag))
@@ -1,3 +1,3 @@
1
1
  module ReadabilityJs
2
- VERSION = '0.0.5'.freeze
2
+ VERSION = '0.0.7'.freeze
3
3
  end
@@ -63,12 +63,13 @@ module ReadabilityJs
63
63
  # @param serializer [String, nil] Serializer to use for output (optional).
64
64
  # @param allow_video_regex [String, nil] Regular expression to allow video URLs (optional).
65
65
  # @param link_density_modifier [Float] Modifier for link density calculation (default: 0).
66
+ # @param blacklist_selectors [Array<String>] List of CSS selectors to remove from the HTML before parsing (default: []).
66
67
  # @return [Hash] A hash containing the extracted content and metadata.
67
68
  #
68
69
  # @raise [ReadabilityJs::Error] if an error occurs during execution
69
70
  #
70
- def self.parse_extended(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
71
- result = Extended::before_cleanup html
71
+ def self.parse_extended(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0, blacklist_selectors: [])
72
+ result = Extended::before_cleanup html, blacklist_selectors: blacklist_selectors
72
73
  result = parse result, url: url, debug: debug, max_elems_to_parse: max_elems_to_parse, nb_top_candidates: nb_top_candidates, char_threshold: char_threshold, classes_to_preserve: classes_to_preserve, keep_classes: keep_classes, disable_json_ld: disable_json_ld, serializer: serializer, allow_video_regex: allow_video_regex, link_density_modifier: link_density_modifier
73
74
  Extended::after_cleanup result, html
74
75
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: readability_js
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthäus Beyrle