readability_js 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e656198387a824f6e6bdb7a3e4ccf3f5173100782c94dc9a0462dd8dc6423ef4
4
- data.tar.gz: b31a6cf4c54563a55844c4fa852ca27804f8ae9d9acb31dfdbcc66d5d58be14c
3
+ metadata.gz: ff2c212926fdd4859f0a70a538ad28f3bc4b760b0d022b0e77f61bc21b40e859
4
+ data.tar.gz: 99ea17969d31e4de8d2c1718693b1c43c254ec8099a4ec85c5ba80342c80fda1
5
5
  SHA512:
6
- metadata.gz: faedb480cfb28b6b73c6d6b6779979758427ba202177abe1cd0947073fb94296521ad42b58c9dab867d0b546187a9844892b7b6a4d0d2d2960a7f77b9c96b6e5
7
- data.tar.gz: b227cfc77004de40ea9e12fa16a3838cb59871687c7986e7efdd4f28c0a7f078a7ab07184b0ead21511f237857a7a2ec41042f85a5b20f2abf19acfb1e6d69f5
6
+ metadata.gz: 7050f66c9af71d4b420a49a322b9be44afa9777ea500c59f1bda10f350ca39806c572de0645e563d8f1d18d4ad06955c43f94c6907e8af79045184f3e5bd6112
7
+ data.tar.gz: 32520125bda871a44601ee83eaa653e62ef9c4104c8ccc3d1dbc0a2624c09304e0a4e63bf5e29672fcb8c6f8035081696640a3b7bac020ad627049d37c772dad
data/README.md CHANGED
@@ -64,7 +64,9 @@ and includes a beautified markdown version of the content.
64
64
  ```ruby
65
65
  require 'readability_js'
66
66
  html = File.read("my_article.html")
67
- result = ReadabilityJs.parse_extended(html)
67
+ # parse_extended applies a built-in DEFAULT_SELECTOR_BLACKLIST; you can pass additional selectors via blacklist_selectors.
68
+ # Elements matching any of these selectors are removed from the HTML before it is parsed.
69
+ result = ReadabilityJs.parse_extended(html, blacklist_selectors: [".advertisement", "#sponsored"])
68
70
  p result
69
71
  ```
70
72
 
@@ -81,7 +83,7 @@ data = ReadabilityJs.parse(
81
83
  # => Hash
82
84
  ```
83
85
 
84
- ### Query response
86
+ ### Parse response
85
87
  The response object is of type `Hash`.
86
88
  It contains the data returned by readability, with hash keys transformed in snake_case.
87
89
 
@@ -1,8 +1,7 @@
1
-
2
1
  module ReadabilityJs
3
2
  class Extended
4
3
 
5
- SELECTOR_BLACKLIST = [
4
+ DEFAULT_SELECTOR_BLACKLIST = [
6
5
  ".Article-Partner",
7
6
  ".Article-Partner-Text",
8
7
  ".Article-Comments-Button",
@@ -11,34 +10,26 @@ module ReadabilityJs
11
10
  "*[data-element-tracking-name]",
12
11
  "*[aria-label='Anzeige']",
13
12
  "nav[aria-label='breadcrumb']",
14
- # heise
15
13
  "a-video",
16
14
  "a-gift",
17
15
  "a-collapse",
18
16
  "a-opt-in",
19
- # spiegel
20
17
  "[data-area='related_articles']",
21
- # welt
22
18
  "nav[aria-label='Breadcrumb']",
23
19
  ".c-inline-teaser-list",
24
20
  "[width='1'][height='1']",
25
- # golem
26
21
  ".go-alink-list",
27
- # faz
28
22
  "[data-external-selector='related-articles-entries']",
29
23
  ".BigBox",
30
- # frankfurter rundschau
31
24
  ".id-Breadcrumb-item",
32
25
  ".id-Story-interactionBar",
33
26
  "revenue-reel",
34
27
  ".id-StoryElement-factBox",
35
- # stern
36
28
  ".breadcrumb",
37
29
  ".teaser",
38
30
  ".group-teaserblock__items",
39
31
  ".title__kicker",
40
32
  "ws-adtag",
41
- # taz
42
33
  "[data-for='webelement_bio']",
43
34
  "[data-for='webelement_citation']",
44
35
  "#articleTeaser",
@@ -46,10 +37,13 @@ module ReadabilityJs
46
37
  "[x-data='{}']",
47
38
  "#komune",
48
39
  ".community",
40
+ ".article-head__topline",
41
+ ".article__audioicon",
42
+ ".auplayer",
49
43
  ]
50
44
 
51
- def self.before_cleanup(html)
52
- pre_parser html
45
+ def self.before_cleanup(html, blacklist_selectors: [])
46
+ pre_parser html, blacklist_selectors: blacklist_selectors
53
47
  end
54
48
 
55
49
  def self.after_cleanup(result, html)
@@ -62,16 +56,18 @@ module ReadabilityJs
62
56
  #
63
57
  # Pre-parser to clean up HTML before passing it to Readability
64
58
  #
65
- # SELECTOR_BLACKLIST contains CSS selectors of elements to be removed from the HTML
59
+ # DEFAULT_SELECTOR_BLACKLIST and the given blacklist_selectors contain CSS selectors of elements to be removed from the HTML
66
60
  # before parsing to improve content extraction.
67
61
  #
68
62
  # @param html [String] The HTML document as a string.
69
63
  # @return [String] The cleaned HTML document as a string.
70
64
  #
71
- def self.pre_parser(html)
65
+ def self.pre_parser(html, blacklist_selectors: [])
66
+ final_blacklist = DEFAULT_SELECTOR_BLACKLIST
67
+ final_blacklist += blacklist_selectors if blacklist_selectors.is_a?(Array) && !blacklist_selectors.empty?
72
68
  doc = Nokogiri::HTML(html)
73
69
  # Remove blacklisted elements by selector
74
- SELECTOR_BLACKLIST.each do |classname|
70
+ final_blacklist.each do |classname|
75
71
  doc.css("#{classname}").remove
76
72
  end
77
73
  doc.to_html
@@ -1,3 +1,3 @@
1
1
  module ReadabilityJs
2
- VERSION = '0.0.6'.freeze
2
+ VERSION = '0.0.7'.freeze
3
3
  end
@@ -63,12 +63,13 @@ module ReadabilityJs
63
63
  # @param serializer [String, nil] Serializer to use for output (optional).
64
64
  # @param allow_video_regex [String, nil] Regular expression to allow video URLs (optional).
65
65
  # @param link_density_modifier [Float] Modifier for link density calculation (default: 0).
66
+ # @param blacklist_selectors [Array<String>] List of CSS selectors to remove from the HTML before parsing (default: []).
66
67
  # @return [Hash] A hash containing the extracted content and metadata.
67
68
  #
68
69
  # @raise [ReadabilityJs::Error] if an error occurs during execution
69
70
  #
70
- def self.parse_extended(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
71
- result = Extended::before_cleanup html
71
+ def self.parse_extended(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0, blacklist_selectors: [])
72
+ result = Extended::before_cleanup html, blacklist_selectors: blacklist_selectors
72
73
  result = parse result, url: url, debug: debug, max_elems_to_parse: max_elems_to_parse, nb_top_candidates: nb_top_candidates, char_threshold: char_threshold, classes_to_preserve: classes_to_preserve, keep_classes: keep_classes, disable_json_ld: disable_json_ld, serializer: serializer, allow_video_regex: allow_video_regex, link_density_modifier: link_density_modifier
73
74
  Extended::after_cleanup result, html
74
75
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: readability_js
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthäus Beyrle