html2text 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +13 -0
  3. data/README.md +5 -5
  4. data/lib/html2text/version.rb +3 -1
  5. data/lib/html2text.rb +108 -106
  6. metadata +65 -110
  7. data/spec/examples/anchors.html +0 -12
  8. data/spec/examples/anchors.txt +0 -5
  9. data/spec/examples/basic.html +0 -21
  10. data/spec/examples/basic.txt +0 -15
  11. data/spec/examples/dom-processing.html +0 -8
  12. data/spec/examples/dom-processing.txt +0 -1
  13. data/spec/examples/empty.html +0 -0
  14. data/spec/examples/empty.txt +0 -0
  15. data/spec/examples/full_email.html +0 -220
  16. data/spec/examples/full_email.txt +0 -54
  17. data/spec/examples/huge-msoffice.html +0 -1
  18. data/spec/examples/huge-msoffice.txt +0 -25872
  19. data/spec/examples/images.html +0 -54
  20. data/spec/examples/images.txt +0 -27
  21. data/spec/examples/invalid.html +0 -4
  22. data/spec/examples/invalid.txt +0 -1
  23. data/spec/examples/lists.html +0 -24
  24. data/spec/examples/lists.txt +0 -17
  25. data/spec/examples/more-anchors.html +0 -14
  26. data/spec/examples/more-anchors.txt +0 -7
  27. data/spec/examples/msoffice.html +0 -1
  28. data/spec/examples/msoffice.txt +0 -12
  29. data/spec/examples/nbsp.html +0 -1
  30. data/spec/examples/nbsp.txt +0 -1
  31. data/spec/examples/nested-divs.html +0 -17
  32. data/spec/examples/nested-divs.txt +0 -12
  33. data/spec/examples/newlines.html +0 -50
  34. data/spec/examples/newlines.txt +0 -35
  35. data/spec/examples/non-breaking-spaces.html +0 -1
  36. data/spec/examples/non-breaking-spaces.txt +0 -1
  37. data/spec/examples/pre.html +0 -10
  38. data/spec/examples/pre.txt +0 -8
  39. data/spec/examples/table.html +0 -53
  40. data/spec/examples/table.txt +0 -7
  41. data/spec/examples/test3.html +0 -1
  42. data/spec/examples/test3.txt +0 -2
  43. data/spec/examples/test4.html +0 -1
  44. data/spec/examples/test4.txt +0 -5
  45. data/spec/examples/utf8-example.html +0 -4
  46. data/spec/examples/utf8-example.txt +0 -2
  47. data/spec/examples/windows-1252-example.html +0 -4
  48. data/spec/examples/windows-1252-example.txt +0 -2
  49. data/spec/examples/zero-width-non-joiners.html +0 -1
  50. data/spec/examples/zero-width-non-joiners.txt +0 -1
  51. data/spec/examples_spec.rb +0 -41
  52. data/spec/html2text_spec.rb +0 -58
  53. data/spec/spec_helper.rb +0 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7d1902161f7964cd95630662cfe326001842de6ae9cfc791216b2a5c2d6fc763
4
- data.tar.gz: 4940f60ec3ea46df4a3117aa7c053d1b30b935c3114bddb81e8d6e81e29fccbb
3
+ metadata.gz: 32afc21e326c44b7881358081161b9581c396b167fad44614a96cc0b6df91f23
4
+ data.tar.gz: fe03a0811cbff965e6b720ad1fdfdd55c0aa1e03165c16c84de7ecac39d65c9d
5
5
  SHA512:
6
- metadata.gz: cd7354466697fc737c336a6abf38e6c70a9480e7d609de135348d4f8b6ab765832929ccd5687fc88209a75d2f82932421a8a59fe8c0754121680d60a0a5f3496
7
- data.tar.gz: 39337ef32bc46adf101c06fc33cc98d8960bf31ce1816fde93dfb1a8a6aa75381b28114a8ff0ad363c5335f2bd61df9766ece0ef8c2b325c28d261e9a3552f7b
6
+ metadata.gz: e26ef2f826da8958c56a390bd4242461abdd26a110216ee3903c902e007b5fb26b38a8f358b420abe6eac480b4a452fedba90b75a36eb7f3f0c2bec4dad040a7
7
+ data.tar.gz: 31515d14c3ca612f2eb9faaf52655639c3c0b72687fed89c41d83fad816af852854057e3ff0803e9f16e5e1d2b62657699cedef5b9240ab114826d936d0ed3c0
data/CHANGELOG.md CHANGED
@@ -6,6 +6,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.4.0] - 2024-06-08
10
+ ### Added
11
+ - Switch from Travis to GitHub Actions for Build and Test
12
+ - Add rubocop for linting and clean up existing violations ([#36](https://github.com/soundasleep/html2text_ruby/pull/36))
13
+
14
+ ### Changed
15
+ - Add support for Ruby 3.x, removed support for Ruby < 3.0 since it is EOL
16
+ - Allow subclassing of `Html2Text` to override the default behaviour ([#30](https://github.com/soundasleep/html2text_ruby/pull/30))
17
+
18
+ ### Fixed
19
+ - Loosen nokogiri dependency to allow for nokogiri < 2.0 ([#17](https://github.com/soundasleep/html2text_ruby/pull/17))
20
+ - Fix `NoMethodError` when parsing nodes with no name ([#15](https://github.com/soundasleep/html2text_ruby/pull/15))
21
+
9
22
  ## [0.3.1] - 2019-06-12
10
23
  ### Security
11
24
  - Bumped nokogiri requirement to ~> 1.10.3, resolving [CVE-2019-11068](https://nvd.nist.gov/vuln/detail/CVE-2019-11068)
data/README.md CHANGED
@@ -1,5 +1,5 @@
1
- html2text [![Build Status](https://travis-ci.org/soundasleep/html2text_ruby.svg?branch=master)](https://travis-ci.org/soundasleep/html2text_ruby) [![Total Downloads](https://ruby-gem-downloads-badge.herokuapp.com/html2text?type=total&metric=true)](https://rubygems.org/gems/html2text/)
2
- ==============
1
+ html2text ![Build](https://github.com/soundasleep/html2text_ruby/actions/workflows/build.yml/badge.svg) [![Gem Version](https://badge.fury.io/rb/html2text.svg)](https://rubygems.org/gems/html2text)
2
+ ---
3
3
 
4
4
  `html2text` is a very simple gem that uses DOM methods to convert HTML into a format similar to what would be
5
5
  rendered by a browser - perfect for places where you need a quick text representation. For example:
@@ -20,7 +20,7 @@ rendered by a browser - perfect for places where you need a quick text represent
20
20
  <div>Another div</div>
21
21
  <div>A div<div>within a div</div></div>
22
22
 
23
- <a href="http://foo.com">A link</a>
23
+ <a href="https://foo.com">A link</a>
24
24
 
25
25
  </body>
26
26
  </html>
@@ -40,7 +40,7 @@ Another div
40
40
  A div
41
41
  within a div
42
42
 
43
- [A link](http://foo.com)
43
+ [A link](https://foo.com)
44
44
  ```
45
45
 
46
46
  See the [original blog post](http://journals.jevon.org/users/jevon-phd/entry/19818) or the related [StackOverflow answer](http://stackoverflow.com/a/2564472/39531).
@@ -63,7 +63,7 @@ text = Html2Text.convert(html)
63
63
 
64
64
  ## Tests
65
65
 
66
- See all of the test cases defined in [spec/examples/](spec/examples/). These can be run with `bundle && rspec`.
66
+ See all of the test cases defined in [spec/examples/](spec/examples/). These can be run with `bundle exec rake`.
67
67
 
68
68
  ## License
69
69
 
data/lib/html2text/version.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Html2Text
2
- VERSION = "0.3.1"
4
+ VERSION = '0.4.0'
3
5
  end
data/lib/html2text.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'nokogiri'
2
4
 
3
5
  class Html2Text
@@ -10,14 +12,14 @@ class Html2Text
10
12
  def self.convert(html)
11
13
  html = html.to_s
12
14
 
13
- if is_office_document?(html)
15
+ if office_document?(html)
14
16
  # Emulate the CSS rendering of Office documents
15
- html = html.gsub("<p class=MsoNormal>", "<br>")
16
- .gsub("<o:p>&nbsp;</o:p>", "<br>")
17
- .gsub("<o:p></o:p>", "")
17
+ html = html.gsub('<p class=MsoNormal>', '<br>')
18
+ .gsub('<o:p>&nbsp;</o:p>', '<br>')
19
+ .gsub('<o:p></o:p>', '')
18
20
  end
19
21
 
20
- if !html.include?("<html")
22
+ unless html.include?('<html')
21
23
  # Stop Nokogiri from inserting in <p> tags
22
24
  html = "<div>#{html}</div>"
23
25
  end
@@ -25,25 +27,29 @@ class Html2Text
25
27
  html = fix_newlines(replace_entities(html))
26
28
  doc = Nokogiri::HTML(html)
27
29
 
28
- Html2Text.new(doc).convert
30
+ new(doc).convert
29
31
  end
30
32
 
31
33
  def self.fix_newlines(text)
34
+ # rubocop:disable Performance/StringReplacement
32
35
  text.gsub("\r\n", "\n").gsub("\r", "\n")
36
+ # rubocop:enable Performance/StringReplacement
33
37
  end
34
38
 
35
39
  def self.replace_entities(text)
36
- text.gsub("&nbsp;", " ").gsub("\u00a0", " ").gsub("&zwnj;", "")
40
+ # rubocop:disable Performance/StringReplacement
41
+ text.gsub('&nbsp;', ' ').gsub("\u00a0", ' ').gsub('&zwnj;', '')
42
+ # rubocop:enable Performance/StringReplacement
37
43
  end
38
44
 
39
45
  def convert
40
46
  output = iterate_over(doc)
41
47
  output = remove_leading_and_trailing_whitespace(output)
42
48
  output = remove_unnecessary_empty_lines(output)
43
- return output.strip
49
+ output.strip
44
50
  end
45
51
 
46
- DO_NOT_TOUCH_WHITESPACE = "<do-not-touch-whitespace>"
52
+ DO_NOT_TOUCH_WHITESPACE = '<do-not-touch-whitespace>'
47
53
 
48
54
  def remove_leading_and_trailing_whitespace(text)
49
55
  # ignore any <pre> blocks, which we don't want to interact with
@@ -51,22 +57,22 @@ class Html2Text
51
57
 
52
58
  output = []
53
59
  pre_blocks.each.with_index do |block, index|
54
- if index % 2 == 0
55
- output << block.gsub(/[ \t]*\n[ \t]*/im, "\n").gsub(/ *\t */im, "\t")
56
- else
57
- output << block
58
- end
60
+ output << if index.even?
61
+ block.gsub(/[ \t]*\n[ \t]*/im, "\n").gsub(/ *\t */im, "\t")
62
+ else
63
+ block
64
+ end
59
65
  end
60
66
 
61
- output.join("")
67
+ output.join
62
68
  end
63
69
 
64
- private
65
-
66
- def self.is_office_document?(text)
67
- text.include?("urn:schemas-microsoft-com:office")
70
+ private_class_method def self.office_document?(text)
71
+ text.include?('urn:schemas-microsoft-com:office')
68
72
  end
69
73
 
74
+ private
75
+
70
76
  def remove_unnecessary_empty_lines(text)
71
77
  text.gsub(/\n\n\n*/im, "\n\n")
72
78
  end
@@ -75,187 +81,183 @@ class Html2Text
75
81
  # Replace whitespace characters with a space (equivalent to \s)
76
82
  # and force any text encoding into UTF-8
77
83
  if text.valid_encoding?
78
- text.gsub(/[\t\n\f\r ]+/im, " ")
84
+ text.gsub(/[\t\n\f\r ]+/im, ' ')
79
85
  else
80
- text.force_encoding("WINDOWS-1252")
81
- return trimmed_whitespace(text.encode("UTF-16be", invalid: :replace, replace: "?").encode('UTF-8'))
86
+ text.force_encoding('WINDOWS-1252')
87
+ trimmed_whitespace(text.encode('UTF-16be', invalid: :replace, replace: '?').encode('UTF-8'))
82
88
  end
83
89
  end
84
90
 
85
91
  def iterate_over(node)
86
- return "\n" if node.name.downcase == "br" && next_node_is_text?(node)
92
+ return "\n" if node.name.downcase == 'br' && next_node_is_text?(node)
87
93
 
88
94
  return trimmed_whitespace(node.text) if node.text?
89
95
 
90
- if ["style", "head", "title", "meta", "script"].include?(node.name.downcase)
91
- return ""
92
- end
96
+ return '' if %w[style head title meta script].include?(node.name.downcase)
93
97
 
94
- if node.name.downcase == "pre"
95
- return "\n#{DO_NOT_TOUCH_WHITESPACE}#{node.text}#{DO_NOT_TOUCH_WHITESPACE}"
96
- end
98
+ return "\n#{DO_NOT_TOUCH_WHITESPACE}#{node.text}#{DO_NOT_TOUCH_WHITESPACE}" if node.name.downcase == 'pre'
97
99
 
98
100
  output = []
99
101
 
100
102
  output << prefix_whitespace(node)
101
103
  output += node.children.map do |child|
102
- iterate_over(child)
104
+ iterate_over(child) unless child.name.nil?
103
105
  end
104
106
  output << suffix_whitespace(node)
105
107
 
106
- output = output.compact.join("") || ""
108
+ output = output.compact.join || ''
107
109
 
108
- if node.name.downcase == "a"
109
- output = wrap_link(node, output)
110
- elsif node.name.downcase == "img"
111
- output = image_text(node)
110
+ unless node.name.nil?
111
+ if node.name.downcase == 'a'
112
+ output = wrap_link(node, output)
113
+ elsif node.name.downcase == 'img'
114
+ output = image_text(node)
115
+ end
112
116
  end
113
117
 
114
- return output
118
+ output
115
119
  end
116
120
 
121
+ # rubocop:disable Lint/DuplicateBranch
117
122
  def prefix_whitespace(node)
118
123
  case node.name.downcase
119
- when "hr"
120
- "\n---------------------------------------------------------------\n"
124
+ when 'hr'
125
+ "\n---------------------------------------------------------------\n"
121
126
 
122
- when "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul"
123
- "\n\n"
127
+ when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul'
128
+ "\n\n"
124
129
 
125
- when "p"
126
- "\n\n"
130
+ when 'p'
131
+ "\n\n"
127
132
 
128
- when "tr"
129
- "\n"
133
+ when 'tr'
134
+ "\n"
130
135
 
131
- when "div"
132
- if node.parent.name == "div" && (node.parent.text.strip == node.text.strip)
133
- ""
134
- else
135
- "\n"
136
- end
136
+ when 'div'
137
+ if node.parent.name == 'div' && (node.parent.text.strip == node.text.strip)
138
+ ''
139
+ else
140
+ "\n"
141
+ end
137
142
 
138
- when "td", "th"
139
- "\t"
143
+ when 'td', 'th'
144
+ "\t"
140
145
 
141
- when "li"
142
- "- "
146
+ when 'li'
147
+ '- '
143
148
  end
144
149
  end
150
+ # rubocop:enable Lint/DuplicateBranch
145
151
 
152
+ # rubocop:disable Lint/DuplicateBranch
146
153
  def suffix_whitespace(node)
147
154
  case node.name.downcase
148
- when "h1", "h2", "h3", "h4", "h5", "h6"
149
- # add another line
150
- "\n\n"
155
+ when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
156
+ # add another line
157
+ "\n\n"
151
158
 
152
- when "p"
153
- "\n\n"
159
+ when 'p'
160
+ "\n\n"
154
161
 
155
- when "br"
156
- if next_node_name(node) != "div" && next_node_name(node) != nil
157
- "\n"
158
- end
162
+ when 'br'
163
+ "\n" if next_node_name(node) != 'div' && !next_node_name(node).nil?
159
164
 
160
- when "li"
161
- "\n"
165
+ when 'li'
166
+ "\n"
162
167
 
163
- when "div"
164
- if next_node_is_text?(node)
165
- "\n"
166
- elsif next_node_name(node) != "div" && next_node_name(node) != nil
167
- "\n"
168
- end
168
+ when 'div'
169
+ if next_node_is_text?(node)
170
+ "\n"
171
+ elsif next_node_name(node) != 'div' && !next_node_name(node).nil?
172
+ "\n"
173
+ end
169
174
  end
170
175
  end
176
+ # rubocop:enable Lint/DuplicateBranch
171
177
 
172
178
  # links are returned in [text](link) format
173
179
  def wrap_link(node, output)
174
- href = node.attribute("href")
175
- name = node.attribute("name")
180
+ href = node.attribute('href')
181
+ name = node.attribute('name')
176
182
 
177
183
  output = output.strip
178
184
 
179
185
  # remove double [[ ]]s from linking images
180
- if output[0] == "[" && output[-1] == "]"
186
+ if output[0] == '[' && output[-1] == ']'
181
187
  output = output[1, output.length - 2]
182
188
 
183
189
  # for linking images, the title of the <a> overrides the title of the <img>
184
- if node.attribute("title")
185
- output = node.attribute("title").to_s
186
- end
190
+ output = node.attribute('title').to_s if node.attribute('title')
187
191
  end
188
192
 
189
193
  # if there is no link text, but a title attr
190
- if output.empty? && node.attribute("title")
191
- output = node.attribute("title").to_s
192
- end
194
+ output = node.attribute('title').to_s if output.empty? && node.attribute('title')
193
195
 
194
196
  if href.nil?
195
- if !name.nil?
196
- output = "[#{output}]"
197
- end
197
+ output = "[#{output}]" unless name.nil?
198
198
  else
199
199
  href = href.to_s
200
200
 
201
201
  if href != output && href != "mailto:#{output}" &&
202
- href != "http://#{output}" && href != "https://#{output}"
203
- if output.empty?
204
- output = href
205
- else
206
- output = "[#{output}](#{href})"
207
- end
202
+ href != "http://#{output}" && href != "https://#{output}"
203
+ output = if output.empty?
204
+ href
205
+ else
206
+ "[#{output}](#{href})"
207
+ end
208
208
  end
209
209
  end
210
210
 
211
211
  case next_node_name(node)
212
- when "h1", "h2", "h3", "h4", "h5", "h6"
213
- output += "\n"
212
+ when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
213
+ output += "\n"
214
214
  end
215
215
 
216
216
  output
217
217
  end
218
218
 
219
219
  def image_text(node)
220
- if node.attribute("title")
221
- "[" + node.attribute("title").to_s + "]"
222
- elsif node.attribute("alt")
223
- "[" + node.attribute("alt").to_s + "]"
220
+ if node.attribute('title')
221
+ "[#{node.attribute('title')}]"
222
+ elsif node.attribute('alt')
223
+ "[#{node.attribute('alt')}]"
224
224
  else
225
- ""
225
+ ''
226
226
  end
227
227
  end
228
228
 
229
229
  def next_node_name(node)
230
230
  next_node = node.next_sibling
231
- while next_node != nil
231
+ until next_node.nil?
232
232
  break if next_node.element?
233
+
233
234
  next_node = next_node.next_sibling
234
235
  end
235
236
 
236
- if next_node && next_node.element?
237
- next_node.name.downcase
238
- end
237
+ return unless next_node&.element?
238
+
239
+ next_node.name.downcase
239
240
  end
240
241
 
241
242
  def next_node_is_text?(node)
242
- return !node.next_sibling.nil? && node.next_sibling.text? && !node.next_sibling.text.strip.empty?
243
+ !node.next_sibling.nil? && node.next_sibling.text? && !node.next_sibling.text.strip.empty?
243
244
  end
244
245
 
245
246
  def previous_node_name(node)
246
247
  previous_node = node.previous_sibling
247
- while previous_node != nil
248
+ until previous_node.nil?
248
249
  break if previous_node.element?
250
+
249
251
  previous_node = previous_node.previous_sibling
250
252
  end
251
253
 
252
- if previous_node && previous_node.element?
253
- previous_node.name.downcase
254
- end
254
+ return unless previous_node&.element?
255
+
256
+ previous_node.name.downcase
255
257
  end
256
258
 
257
259
  def previous_node_is_text?(node)
258
- return !node.previous_sibling.nil? && node.previous_sibling.text? && !node.previous_sibling.text.strip.empty?
260
+ !node.previous_sibling.nil? && node.previous_sibling.text? && !node.previous_sibling.text.strip.empty?
259
261
  end
260
262
 
261
263
  # def previous_node_is_not_text?(node)