html2text 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +26 -4
  3. data/README.md +5 -5
  4. data/lib/html2text/version.rb +3 -1
  5. data/lib/html2text.rb +108 -106
  6. metadata +78 -109
  7. data/spec/examples/anchors.html +0 -12
  8. data/spec/examples/anchors.txt +0 -5
  9. data/spec/examples/basic.html +0 -21
  10. data/spec/examples/basic.txt +0 -15
  11. data/spec/examples/dom-processing.html +0 -8
  12. data/spec/examples/dom-processing.txt +0 -1
  13. data/spec/examples/empty.html +0 -0
  14. data/spec/examples/empty.txt +0 -0
  15. data/spec/examples/full_email.html +0 -220
  16. data/spec/examples/full_email.txt +0 -54
  17. data/spec/examples/huge-msoffice.html +0 -1
  18. data/spec/examples/huge-msoffice.txt +0 -25872
  19. data/spec/examples/images.html +0 -54
  20. data/spec/examples/images.txt +0 -27
  21. data/spec/examples/invalid.html +0 -4
  22. data/spec/examples/invalid.txt +0 -1
  23. data/spec/examples/lists.html +0 -24
  24. data/spec/examples/lists.txt +0 -17
  25. data/spec/examples/more-anchors.html +0 -14
  26. data/spec/examples/more-anchors.txt +0 -7
  27. data/spec/examples/msoffice.html +0 -1
  28. data/spec/examples/msoffice.txt +0 -12
  29. data/spec/examples/nbsp.html +0 -1
  30. data/spec/examples/nbsp.txt +0 -1
  31. data/spec/examples/nested-divs.html +0 -17
  32. data/spec/examples/nested-divs.txt +0 -12
  33. data/spec/examples/newlines.html +0 -50
  34. data/spec/examples/newlines.txt +0 -35
  35. data/spec/examples/non-breaking-spaces.html +0 -1
  36. data/spec/examples/non-breaking-spaces.txt +0 -1
  37. data/spec/examples/pre.html +0 -10
  38. data/spec/examples/pre.txt +0 -8
  39. data/spec/examples/table.html +0 -53
  40. data/spec/examples/table.txt +0 -7
  41. data/spec/examples/test3.html +0 -1
  42. data/spec/examples/test3.txt +0 -2
  43. data/spec/examples/test4.html +0 -1
  44. data/spec/examples/test4.txt +0 -5
  45. data/spec/examples/utf8-example.html +0 -4
  46. data/spec/examples/utf8-example.txt +0 -2
  47. data/spec/examples/windows-1252-example.html +0 -4
  48. data/spec/examples/windows-1252-example.txt +0 -2
  49. data/spec/examples/zero-width-non-joiners.html +0 -1
  50. data/spec/examples/zero-width-non-joiners.txt +0 -1
  51. data/spec/examples_spec.rb +0 -41
  52. data/spec/html2text_spec.rb +0 -58
  53. data/spec/spec_helper.rb +0 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d633d005beae05080d43ccb792e1542a4304af22f1361bb45a7c535e85457aab
4
- data.tar.gz: e842dd18378b0db33338c8d398552423f325d63ad8edab84cae46e29816aecfe
3
+ metadata.gz: 32afc21e326c44b7881358081161b9581c396b167fad44614a96cc0b6df91f23
4
+ data.tar.gz: fe03a0811cbff965e6b720ad1fdfdd55c0aa1e03165c16c84de7ecac39d65c9d
5
5
  SHA512:
6
- metadata.gz: 75e2591f8f66a9a0f6789c12410bdd5c8cd19dbf816be0e5632d348fd84bb4dae52a20937c8887d6a7772158bbbce6b49aeec2bd89f91e6ede080012fd22ab14
7
- data.tar.gz: 7846aeb8039076478113de5abea3518809045a7108dfcc0ef7b2cb97b09e29f5be9d36d22a5cf3173cb5a17aef228e6edf84f9968783bc67ce4ab07f433ced0c
6
+ metadata.gz: e26ef2f826da8958c56a390bd4242461abdd26a110216ee3903c902e007b5fb26b38a8f358b420abe6eac480b4a452fedba90b75a36eb7f3f0c2bec4dad040a7
7
+ data.tar.gz: 31515d14c3ca612f2eb9faaf52655639c3c0b72687fed89c41d83fad816af852854057e3ff0803e9f16e5e1d2b62657699cedef5b9240ab114826d936d0ed3c0
data/CHANGELOG.md CHANGED
@@ -5,8 +5,28 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
5
5
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
6
 
7
7
  ## [Unreleased]
8
+
9
+ ## [0.4.0] - 2024-06-08
10
+ ### Added
11
+ - Switch from Travis to Github Actions for Build and Test
12
+ - Add rubocop for linting and cleanup existing violations ([#36](https://github.com/soundasleep/html2text_ruby/pull/36))
13
+
14
+ ### Changed
15
+ - Add support for Ruby 3.x, removed support for Ruby < 3.0 since it is EOL
16
+ - Allow subclassing of `Html2Text` to override the default behaviour ([#30](https://github.com/soundasleep/html2text_ruby/pull/30))
17
+
18
+ ### Fixed
19
+ - Loosen nokogiri dependency to allow for nokogiri < 2.0 ([#17](https://github.com/soundasleep/html2text_ruby/pull/17))
20
+ - Fix `NoMethodError` when parsing nodes with no name ([#15](https://github.com/soundasleep/html2text_ruby/pull/15))
21
+
22
+ ## [0.3.1] - 2019-06-12
23
+ ### Security
24
+ - Bumped nokogiri requirement to ~> 1.10.3, resolving [CVE-2019-11068](https://nvd.nist.gov/vuln/detail/CVE-2019-11068)
25
+ ([#8](https://github.com/soundasleep/html2text_ruby/issues/8))
26
+
27
+ ## [0.3.0] - 2019-02-15
8
28
  ### Added
9
- - Zero-width non-joiners are now stripped [#5](https://github.com/soundasleep/html2text_ruby/pull/5)
29
+ - Zero-width non-joiners are now stripped ([#5](https://github.com/soundasleep/html2text_ruby/pull/5))
10
30
  - Support both UTF-8 and Windows-1252 encoded files
11
31
  - Support converting `<pre>` blocks, including whitespace within these blocks
12
32
  - MS Office (MsoNormal) documents are now rendered closer to actual render output
@@ -22,7 +42,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
22
42
 
23
43
  ## [0.2.1] - 2017-09-27
24
44
  ### Fixed
25
- - Convert non-string input into strings [#3](https://github.com/soundasleep/html2text_ruby/pull/3)
45
+ - Convert non-string input into strings ([#3](https://github.com/soundasleep/html2text_ruby/pull/3))
26
46
 
27
- [Unreleased]: https://github.com/soundasleep/html2text/compare/0.2.1...HEAD
28
- [0.2.1]: https://github.com/soundasleep/html2text/compare/0.2.1...0.2.1
47
+ [Unreleased]: https://github.com/soundasleep/html2text_ruby/compare/0.3.1...HEAD
48
+ [0.3.1]: https://github.com/soundasleep/html2text_ruby/compare/0.3.0...0.3.1
49
+ [0.3.0]: https://github.com/soundasleep/html2text_ruby/compare/0.2.1...0.3.0
50
+ [0.2.1]: https://github.com/soundasleep/html2text_ruby/compare/0.2.1...0.2.1
data/README.md CHANGED
@@ -1,5 +1,5 @@
1
- html2text [![Build Status](https://travis-ci.org/soundasleep/html2text_ruby.svg?branch=master)](https://travis-ci.org/soundasleep/html2text_ruby) [![Total Downloads](https://ruby-gem-downloads-badge.herokuapp.com/html2text?type=total&metric=true)]
2
- ==============
1
+ html2text ![Build](https://github.com/soundasleep/html2text_ruby/actions/workflows/build.yml/badge.svg) [![Gem Version](https://badge.fury.io/rb/html2text.svg)](https://rubygems.org/gems/html2text)
2
+ ---
3
3
 
4
4
  `html2text` is a very simple gem that uses DOM methods to convert HTML into a format similar to what would be
5
5
  rendered by a browser - perfect for places where you need a quick text representation. For example:
@@ -20,7 +20,7 @@ rendered by a browser - perfect for places where you need a quick text represent
20
20
  <div>Another div</div>
21
21
  <div>A div<div>within a div</div></div>
22
22
 
23
- <a href="http://foo.com">A link</a>
23
+ <a href="https://foo.com">A link</a>
24
24
 
25
25
  </body>
26
26
  </html>
@@ -40,7 +40,7 @@ Another div
40
40
  A div
41
41
  within a div
42
42
 
43
- [A link](http://foo.com)
43
+ [A link](https://foo.com)
44
44
  ```
45
45
 
46
46
  See the [original blog post](http://journals.jevon.org/users/jevon-phd/entry/19818) or the related [StackOverflow answer](http://stackoverflow.com/a/2564472/39531).
@@ -63,7 +63,7 @@ text = Html2Text.convert(html)
63
63
 
64
64
  ## Tests
65
65
 
66
- See all of the test cases defined in [spec/examples/](spec/examples/). These can be run with `bundle && rspec`.
66
+ See all of the test cases defined in [spec/examples/](spec/examples/). These can be run with `bundle exec rake`.
67
67
 
68
68
  ## License
69
69
 
data/lib/html2text/version.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Html2Text
2
- VERSION = "0.3.0"
4
+ VERSION = '0.4.0'
3
5
  end
data/lib/html2text.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'nokogiri'
2
4
 
3
5
  class Html2Text
@@ -10,14 +12,14 @@ class Html2Text
10
12
  def self.convert(html)
11
13
  html = html.to_s
12
14
 
13
- if is_office_document?(html)
15
+ if office_document?(html)
14
16
  # Emulate the CSS rendering of Office documents
15
- html = html.gsub("<p class=MsoNormal>", "<br>")
16
- .gsub("<o:p>&nbsp;</o:p>", "<br>")
17
- .gsub("<o:p></o:p>", "")
17
+ html = html.gsub('<p class=MsoNormal>', '<br>')
18
+ .gsub('<o:p>&nbsp;</o:p>', '<br>')
19
+ .gsub('<o:p></o:p>', '')
18
20
  end
19
21
 
20
- if !html.include?("<html")
22
+ unless html.include?('<html')
21
23
  # Stop Nokogiri from inserting in <p> tags
22
24
  html = "<div>#{html}</div>"
23
25
  end
@@ -25,25 +27,29 @@ class Html2Text
25
27
  html = fix_newlines(replace_entities(html))
26
28
  doc = Nokogiri::HTML(html)
27
29
 
28
- Html2Text.new(doc).convert
30
+ new(doc).convert
29
31
  end
30
32
 
31
33
  def self.fix_newlines(text)
34
+ # rubocop:disable Performance/StringReplacement
32
35
  text.gsub("\r\n", "\n").gsub("\r", "\n")
36
+ # rubocop:enable Performance/StringReplacement
33
37
  end
34
38
 
35
39
  def self.replace_entities(text)
36
- text.gsub("&nbsp;", " ").gsub("\u00a0", " ").gsub("&zwnj;", "")
40
+ # rubocop:disable Performance/StringReplacement
41
+ text.gsub('&nbsp;', ' ').gsub("\u00a0", ' ').gsub('&zwnj;', '')
42
+ # rubocop:enable Performance/StringReplacement
37
43
  end
38
44
 
39
45
  def convert
40
46
  output = iterate_over(doc)
41
47
  output = remove_leading_and_trailing_whitespace(output)
42
48
  output = remove_unnecessary_empty_lines(output)
43
- return output.strip
49
+ output.strip
44
50
  end
45
51
 
46
- DO_NOT_TOUCH_WHITESPACE = "<do-not-touch-whitespace>"
52
+ DO_NOT_TOUCH_WHITESPACE = '<do-not-touch-whitespace>'
47
53
 
48
54
  def remove_leading_and_trailing_whitespace(text)
49
55
  # ignore any <pre> blocks, which we don't want to interact with
@@ -51,22 +57,22 @@ class Html2Text
51
57
 
52
58
  output = []
53
59
  pre_blocks.each.with_index do |block, index|
54
- if index % 2 == 0
55
- output << block.gsub(/[ \t]*\n[ \t]*/im, "\n").gsub(/ *\t */im, "\t")
56
- else
57
- output << block
58
- end
60
+ output << if index.even?
61
+ block.gsub(/[ \t]*\n[ \t]*/im, "\n").gsub(/ *\t */im, "\t")
62
+ else
63
+ block
64
+ end
59
65
  end
60
66
 
61
- output.join("")
67
+ output.join
62
68
  end
63
69
 
64
- private
65
-
66
- def self.is_office_document?(text)
67
- text.include?("urn:schemas-microsoft-com:office")
70
+ private_class_method def self.office_document?(text)
71
+ text.include?('urn:schemas-microsoft-com:office')
68
72
  end
69
73
 
74
+ private
75
+
70
76
  def remove_unnecessary_empty_lines(text)
71
77
  text.gsub(/\n\n\n*/im, "\n\n")
72
78
  end
@@ -75,187 +81,183 @@ class Html2Text
75
81
  # Replace whitespace characters with a space (equivalent to \s)
76
82
  # and force any text encoding into UTF-8
77
83
  if text.valid_encoding?
78
- text.gsub(/[\t\n\f\r ]+/im, " ")
84
+ text.gsub(/[\t\n\f\r ]+/im, ' ')
79
85
  else
80
- text.force_encoding("WINDOWS-1252")
81
- return trimmed_whitespace(text.encode("UTF-16be", invalid: :replace, replace: "?").encode('UTF-8'))
86
+ text.force_encoding('WINDOWS-1252')
87
+ trimmed_whitespace(text.encode('UTF-16be', invalid: :replace, replace: '?').encode('UTF-8'))
82
88
  end
83
89
  end
84
90
 
85
91
  def iterate_over(node)
86
- return "\n" if node.name.downcase == "br" && next_node_is_text?(node)
92
+ return "\n" if node.name.downcase == 'br' && next_node_is_text?(node)
87
93
 
88
94
  return trimmed_whitespace(node.text) if node.text?
89
95
 
90
- if ["style", "head", "title", "meta", "script"].include?(node.name.downcase)
91
- return ""
92
- end
96
+ return '' if %w[style head title meta script].include?(node.name.downcase)
93
97
 
94
- if node.name.downcase == "pre"
95
- return "\n#{DO_NOT_TOUCH_WHITESPACE}#{node.text}#{DO_NOT_TOUCH_WHITESPACE}"
96
- end
98
+ return "\n#{DO_NOT_TOUCH_WHITESPACE}#{node.text}#{DO_NOT_TOUCH_WHITESPACE}" if node.name.downcase == 'pre'
97
99
 
98
100
  output = []
99
101
 
100
102
  output << prefix_whitespace(node)
101
103
  output += node.children.map do |child|
102
- iterate_over(child)
104
+ iterate_over(child) unless child.name.nil?
103
105
  end
104
106
  output << suffix_whitespace(node)
105
107
 
106
- output = output.compact.join("") || ""
108
+ output = output.compact.join || ''
107
109
 
108
- if node.name.downcase == "a"
109
- output = wrap_link(node, output)
110
- elsif node.name.downcase == "img"
111
- output = image_text(node)
110
+ unless node.name.nil?
111
+ if node.name.downcase == 'a'
112
+ output = wrap_link(node, output)
113
+ elsif node.name.downcase == 'img'
114
+ output = image_text(node)
115
+ end
112
116
  end
113
117
 
114
- return output
118
+ output
115
119
  end
116
120
 
121
+ # rubocop:disable Lint/DuplicateBranch
117
122
  def prefix_whitespace(node)
118
123
  case node.name.downcase
119
- when "hr"
120
- "\n---------------------------------------------------------------\n"
124
+ when 'hr'
125
+ "\n---------------------------------------------------------------\n"
121
126
 
122
- when "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul"
123
- "\n\n"
127
+ when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul'
128
+ "\n\n"
124
129
 
125
- when "p"
126
- "\n\n"
130
+ when 'p'
131
+ "\n\n"
127
132
 
128
- when "tr"
129
- "\n"
133
+ when 'tr'
134
+ "\n"
130
135
 
131
- when "div"
132
- if node.parent.name == "div" && (node.parent.text.strip == node.text.strip)
133
- ""
134
- else
135
- "\n"
136
- end
136
+ when 'div'
137
+ if node.parent.name == 'div' && (node.parent.text.strip == node.text.strip)
138
+ ''
139
+ else
140
+ "\n"
141
+ end
137
142
 
138
- when "td", "th"
139
- "\t"
143
+ when 'td', 'th'
144
+ "\t"
140
145
 
141
- when "li"
142
- "- "
146
+ when 'li'
147
+ '- '
143
148
  end
144
149
  end
150
+ # rubocop:enable Lint/DuplicateBranch
145
151
 
152
+ # rubocop:disable Lint/DuplicateBranch
146
153
  def suffix_whitespace(node)
147
154
  case node.name.downcase
148
- when "h1", "h2", "h3", "h4", "h5", "h6"
149
- # add another line
150
- "\n\n"
155
+ when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
156
+ # add another line
157
+ "\n\n"
151
158
 
152
- when "p"
153
- "\n\n"
159
+ when 'p'
160
+ "\n\n"
154
161
 
155
- when "br"
156
- if next_node_name(node) != "div" && next_node_name(node) != nil
157
- "\n"
158
- end
162
+ when 'br'
163
+ "\n" if next_node_name(node) != 'div' && !next_node_name(node).nil?
159
164
 
160
- when "li"
161
- "\n"
165
+ when 'li'
166
+ "\n"
162
167
 
163
- when "div"
164
- if next_node_is_text?(node)
165
- "\n"
166
- elsif next_node_name(node) != "div" && next_node_name(node) != nil
167
- "\n"
168
- end
168
+ when 'div'
169
+ if next_node_is_text?(node)
170
+ "\n"
171
+ elsif next_node_name(node) != 'div' && !next_node_name(node).nil?
172
+ "\n"
173
+ end
169
174
  end
170
175
  end
176
+ # rubocop:enable Lint/DuplicateBranch
171
177
 
172
178
  # links are returned in [text](link) format
173
179
  def wrap_link(node, output)
174
- href = node.attribute("href")
175
- name = node.attribute("name")
180
+ href = node.attribute('href')
181
+ name = node.attribute('name')
176
182
 
177
183
  output = output.strip
178
184
 
179
185
  # remove double [[ ]]s from linking images
180
- if output[0] == "[" && output[-1] == "]"
186
+ if output[0] == '[' && output[-1] == ']'
181
187
  output = output[1, output.length - 2]
182
188
 
183
189
  # for linking images, the title of the <a> overrides the title of the <img>
184
- if node.attribute("title")
185
- output = node.attribute("title").to_s
186
- end
190
+ output = node.attribute('title').to_s if node.attribute('title')
187
191
  end
188
192
 
189
193
  # if there is no link text, but a title attr
190
- if output.empty? && node.attribute("title")
191
- output = node.attribute("title").to_s
192
- end
194
+ output = node.attribute('title').to_s if output.empty? && node.attribute('title')
193
195
 
194
196
  if href.nil?
195
- if !name.nil?
196
- output = "[#{output}]"
197
- end
197
+ output = "[#{output}]" unless name.nil?
198
198
  else
199
199
  href = href.to_s
200
200
 
201
201
  if href != output && href != "mailto:#{output}" &&
202
- href != "http://#{output}" && href != "https://#{output}"
203
- if output.empty?
204
- output = href
205
- else
206
- output = "[#{output}](#{href})"
207
- end
202
+ href != "http://#{output}" && href != "https://#{output}"
203
+ output = if output.empty?
204
+ href
205
+ else
206
+ "[#{output}](#{href})"
207
+ end
208
208
  end
209
209
  end
210
210
 
211
211
  case next_node_name(node)
212
- when "h1", "h2", "h3", "h4", "h5", "h6"
213
- output += "\n"
212
+ when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
213
+ output += "\n"
214
214
  end
215
215
 
216
216
  output
217
217
  end
218
218
 
219
219
  def image_text(node)
220
- if node.attribute("title")
221
- "[" + node.attribute("title").to_s + "]"
222
- elsif node.attribute("alt")
223
- "[" + node.attribute("alt").to_s + "]"
220
+ if node.attribute('title')
221
+ "[#{node.attribute('title')}]"
222
+ elsif node.attribute('alt')
223
+ "[#{node.attribute('alt')}]"
224
224
  else
225
- ""
225
+ ''
226
226
  end
227
227
  end
228
228
 
229
229
  def next_node_name(node)
230
230
  next_node = node.next_sibling
231
- while next_node != nil
231
+ until next_node.nil?
232
232
  break if next_node.element?
233
+
233
234
  next_node = next_node.next_sibling
234
235
  end
235
236
 
236
- if next_node && next_node.element?
237
- next_node.name.downcase
238
- end
237
+ return unless next_node&.element?
238
+
239
+ next_node.name.downcase
239
240
  end
240
241
 
241
242
  def next_node_is_text?(node)
242
- return !node.next_sibling.nil? && node.next_sibling.text? && !node.next_sibling.text.strip.empty?
243
+ !node.next_sibling.nil? && node.next_sibling.text? && !node.next_sibling.text.strip.empty?
243
244
  end
244
245
 
245
246
  def previous_node_name(node)
246
247
  previous_node = node.previous_sibling
247
- while previous_node != nil
248
+ until previous_node.nil?
248
249
  break if previous_node.element?
250
+
249
251
  previous_node = previous_node.previous_sibling
250
252
  end
251
253
 
252
- if previous_node && previous_node.element?
253
- previous_node.name.downcase
254
- end
254
+ return unless previous_node&.element?
255
+
256
+ previous_node.name.downcase
255
257
  end
256
258
 
257
259
  def previous_node_is_text?(node)
258
- return !node.previous_sibling.nil? && node.previous_sibling.text? && !node.previous_sibling.text.strip.empty?
260
+ !node.previous_sibling.nil? && node.previous_sibling.text? && !node.previous_sibling.text.strip.empty?
259
261
  end
260
262
 
261
263
  # def previous_node_is_not_text?(node)