html2text 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +5 -5
- data/lib/html2text/version.rb +3 -1
- data/lib/html2text.rb +108 -106
- metadata +65 -110
- data/spec/examples/anchors.html +0 -12
- data/spec/examples/anchors.txt +0 -5
- data/spec/examples/basic.html +0 -21
- data/spec/examples/basic.txt +0 -15
- data/spec/examples/dom-processing.html +0 -8
- data/spec/examples/dom-processing.txt +0 -1
- data/spec/examples/empty.html +0 -0
- data/spec/examples/empty.txt +0 -0
- data/spec/examples/full_email.html +0 -220
- data/spec/examples/full_email.txt +0 -54
- data/spec/examples/huge-msoffice.html +0 -1
- data/spec/examples/huge-msoffice.txt +0 -25872
- data/spec/examples/images.html +0 -54
- data/spec/examples/images.txt +0 -27
- data/spec/examples/invalid.html +0 -4
- data/spec/examples/invalid.txt +0 -1
- data/spec/examples/lists.html +0 -24
- data/spec/examples/lists.txt +0 -17
- data/spec/examples/more-anchors.html +0 -14
- data/spec/examples/more-anchors.txt +0 -7
- data/spec/examples/msoffice.html +0 -1
- data/spec/examples/msoffice.txt +0 -12
- data/spec/examples/nbsp.html +0 -1
- data/spec/examples/nbsp.txt +0 -1
- data/spec/examples/nested-divs.html +0 -17
- data/spec/examples/nested-divs.txt +0 -12
- data/spec/examples/newlines.html +0 -50
- data/spec/examples/newlines.txt +0 -35
- data/spec/examples/non-breaking-spaces.html +0 -1
- data/spec/examples/non-breaking-spaces.txt +0 -1
- data/spec/examples/pre.html +0 -10
- data/spec/examples/pre.txt +0 -8
- data/spec/examples/table.html +0 -53
- data/spec/examples/table.txt +0 -7
- data/spec/examples/test3.html +0 -1
- data/spec/examples/test3.txt +0 -2
- data/spec/examples/test4.html +0 -1
- data/spec/examples/test4.txt +0 -5
- data/spec/examples/utf8-example.html +0 -4
- data/spec/examples/utf8-example.txt +0 -2
- data/spec/examples/windows-1252-example.html +0 -4
- data/spec/examples/windows-1252-example.txt +0 -2
- data/spec/examples/zero-width-non-joiners.html +0 -1
- data/spec/examples/zero-width-non-joiners.txt +0 -1
- data/spec/examples_spec.rb +0 -41
- data/spec/html2text_spec.rb +0 -58
- data/spec/spec_helper.rb +0 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 32afc21e326c44b7881358081161b9581c396b167fad44614a96cc0b6df91f23
|
4
|
+
data.tar.gz: fe03a0811cbff965e6b720ad1fdfdd55c0aa1e03165c16c84de7ecac39d65c9d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e26ef2f826da8958c56a390bd4242461abdd26a110216ee3903c902e007b5fb26b38a8f358b420abe6eac480b4a452fedba90b75a36eb7f3f0c2bec4dad040a7
|
7
|
+
data.tar.gz: 31515d14c3ca612f2eb9faaf52655639c3c0b72687fed89c41d83fad816af852854057e3ff0803e9f16e5e1d2b62657699cedef5b9240ab114826d936d0ed3c0
|
data/CHANGELOG.md
CHANGED
@@ -6,6 +6,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
6
6
|
|
7
7
|
## [Unreleased]
|
8
8
|
|
9
|
+
## [0.4.0] - 2024-06-08
|
10
|
+
### Added
|
11
|
+
- Switch from Travis to Github Actions for Build and Test
|
12
|
+
- Add rubocop for linting and cleanup existing violations ([#36](https://github.com/soundasleep/html2text_ruby/pull/36))
|
13
|
+
|
14
|
+
### Changed
|
15
|
+
- Add support for Ruby 3.x, removed support for Ruby < 3.0 since it is EOL
|
16
|
+
- Allow subclassing of `Html2Text` to override the default behaviour ([#30](https://github.com/soundasleep/html2text_ruby/pull/30))
|
17
|
+
|
18
|
+
### Fixed
|
19
|
+
- Loosen nokogiri dependency to allow for nokogiri < 2.0 ([#17](https://github.com/soundasleep/html2text_ruby/pull/17))
|
20
|
+
- Fix `NoMethodError` when parsing nodes with no name ([#15](https://github.com/soundasleep/html2text_ruby/pull/15))
|
21
|
+
|
9
22
|
## [0.3.1] - 2019-06-12
|
10
23
|
### Security
|
11
24
|
- Bumped nokogiri requirement to ~> 1.10.3, resolving [CVE-2019-11068](https://nvd.nist.gov/vuln/detail/CVE-2019-11068)
|
data/README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
html2text
|
2
|
-
|
1
|
+
html2text ![Build](https://github.com/soundasleep/html2text_ruby/actions/workflows/build.yml/badge.svg) [![Gem Version](https://badge.fury.io/rb/html2text.svg)](https://rubygems.org/gems/html2text)
|
2
|
+
---
|
3
3
|
|
4
4
|
`html2text` is a very simple gem that uses DOM methods to convert HTML into a format similar to what would be
|
5
5
|
rendered by a browser - perfect for places where you need a quick text representation. For example:
|
@@ -20,7 +20,7 @@ rendered by a browser - perfect for places where you need a quick text represent
|
|
20
20
|
<div>Another div</div>
|
21
21
|
<div>A div<div>within a div</div></div>
|
22
22
|
|
23
|
-
<a href="
|
23
|
+
<a href="https://foo.com">A link</a>
|
24
24
|
|
25
25
|
</body>
|
26
26
|
</html>
|
@@ -40,7 +40,7 @@ Another div
|
|
40
40
|
A div
|
41
41
|
within a div
|
42
42
|
|
43
|
-
[A link](
|
43
|
+
[A link](https://foo.com)
|
44
44
|
```
|
45
45
|
|
46
46
|
See the [original blog post](http://journals.jevon.org/users/jevon-phd/entry/19818) or the related [StackOverflow answer](http://stackoverflow.com/a/2564472/39531).
|
@@ -63,7 +63,7 @@ text = Html2Text.convert(html)
|
|
63
63
|
|
64
64
|
## Tests
|
65
65
|
|
66
|
-
See all of the test cases defined in [spec/examples/](spec/examples/). These can be run with `bundle
|
66
|
+
See all of the test cases defined in [spec/examples/](spec/examples/). These can be run with `bundle exec rake`.
|
67
67
|
|
68
68
|
## License
|
69
69
|
|
data/lib/html2text/version.rb
CHANGED
data/lib/html2text.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
|
3
5
|
class Html2Text
|
@@ -10,14 +12,14 @@ class Html2Text
|
|
10
12
|
def self.convert(html)
|
11
13
|
html = html.to_s
|
12
14
|
|
13
|
-
if
|
15
|
+
if office_document?(html)
|
14
16
|
# Emulate the CSS rendering of Office documents
|
15
|
-
html = html.gsub(
|
16
|
-
|
17
|
-
|
17
|
+
html = html.gsub('<p class=MsoNormal>', '<br>')
|
18
|
+
.gsub('<o:p>&nbsp;</o:p>', '<br>')
|
19
|
+
.gsub('<o:p></o:p>', '')
|
18
20
|
end
|
19
21
|
|
20
|
-
|
22
|
+
unless html.include?('<html')
|
21
23
|
# Stop Nokogiri from inserting in <p> tags
|
22
24
|
html = "<div>#{html}</div>"
|
23
25
|
end
|
@@ -25,25 +27,29 @@ class Html2Text
|
|
25
27
|
html = fix_newlines(replace_entities(html))
|
26
28
|
doc = Nokogiri::HTML(html)
|
27
29
|
|
28
|
-
|
30
|
+
new(doc).convert
|
29
31
|
end
|
30
32
|
|
31
33
|
def self.fix_newlines(text)
|
34
|
+
# rubocop:disable Performance/StringReplacement
|
32
35
|
text.gsub("\r\n", "\n").gsub("\r", "\n")
|
36
|
+
# rubocop:enable Performance/StringReplacement
|
33
37
|
end
|
34
38
|
|
35
39
|
def self.replace_entities(text)
|
36
|
-
|
40
|
+
# rubocop:disable Performance/StringReplacement
|
41
|
+
text.gsub('&nbsp;', ' ').gsub("\u00a0", ' ').gsub('&zwnj;', '')
|
42
|
+
# rubocop:enable Performance/StringReplacement
|
37
43
|
end
|
38
44
|
|
39
45
|
def convert
|
40
46
|
output = iterate_over(doc)
|
41
47
|
output = remove_leading_and_trailing_whitespace(output)
|
42
48
|
output = remove_unnecessary_empty_lines(output)
|
43
|
-
|
49
|
+
output.strip
|
44
50
|
end
|
45
51
|
|
46
|
-
DO_NOT_TOUCH_WHITESPACE =
|
52
|
+
DO_NOT_TOUCH_WHITESPACE = '<do-not-touch-whitespace>'
|
47
53
|
|
48
54
|
def remove_leading_and_trailing_whitespace(text)
|
49
55
|
# ignore any <pre> blocks, which we don't want to interact with
|
@@ -51,22 +57,22 @@ class Html2Text
|
|
51
57
|
|
52
58
|
output = []
|
53
59
|
pre_blocks.each.with_index do |block, index|
|
54
|
-
if index
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
60
|
+
output << if index.even?
|
61
|
+
block.gsub(/[ \t]*\n[ \t]*/im, "\n").gsub(/ *\t */im, "\t")
|
62
|
+
else
|
63
|
+
block
|
64
|
+
end
|
59
65
|
end
|
60
66
|
|
61
|
-
output.join
|
67
|
+
output.join
|
62
68
|
end
|
63
69
|
|
64
|
-
|
65
|
-
|
66
|
-
def self.is_office_document?(text)
|
67
|
-
text.include?("urn:schemas-microsoft-com:office")
|
70
|
+
private_class_method def self.office_document?(text)
|
71
|
+
text.include?('urn:schemas-microsoft-com:office')
|
68
72
|
end
|
69
73
|
|
74
|
+
private
|
75
|
+
|
70
76
|
def remove_unnecessary_empty_lines(text)
|
71
77
|
text.gsub(/\n\n\n*/im, "\n\n")
|
72
78
|
end
|
@@ -75,187 +81,183 @@ class Html2Text
|
|
75
81
|
# Replace whitespace characters with a space (equivalent to \s)
|
76
82
|
# and force any text encoding into UTF-8
|
77
83
|
if text.valid_encoding?
|
78
|
-
text.gsub(/[\t\n\f\r ]+/im,
|
84
|
+
text.gsub(/[\t\n\f\r ]+/im, ' ')
|
79
85
|
else
|
80
|
-
text.force_encoding(
|
81
|
-
|
86
|
+
text.force_encoding('WINDOWS-1252')
|
87
|
+
trimmed_whitespace(text.encode('UTF-16be', invalid: :replace, replace: '?').encode('UTF-8'))
|
82
88
|
end
|
83
89
|
end
|
84
90
|
|
85
91
|
def iterate_over(node)
|
86
|
-
return "\n" if node.name.downcase ==
|
92
|
+
return "\n" if node.name.downcase == 'br' && next_node_is_text?(node)
|
87
93
|
|
88
94
|
return trimmed_whitespace(node.text) if node.text?
|
89
95
|
|
90
|
-
if [
|
91
|
-
return ""
|
92
|
-
end
|
96
|
+
return '' if %w[style head title meta script].include?(node.name.downcase)
|
93
97
|
|
94
|
-
if node.name.downcase ==
|
95
|
-
return "\n#{DO_NOT_TOUCH_WHITESPACE}#{node.text}#{DO_NOT_TOUCH_WHITESPACE}"
|
96
|
-
end
|
98
|
+
return "\n#{DO_NOT_TOUCH_WHITESPACE}#{node.text}#{DO_NOT_TOUCH_WHITESPACE}" if node.name.downcase == 'pre'
|
97
99
|
|
98
100
|
output = []
|
99
101
|
|
100
102
|
output << prefix_whitespace(node)
|
101
103
|
output += node.children.map do |child|
|
102
|
-
iterate_over(child)
|
104
|
+
iterate_over(child) unless child.name.nil?
|
103
105
|
end
|
104
106
|
output << suffix_whitespace(node)
|
105
107
|
|
106
|
-
output = output.compact.join
|
108
|
+
output = output.compact.join || ''
|
107
109
|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
110
|
+
unless node.name.nil?
|
111
|
+
if node.name.downcase == 'a'
|
112
|
+
output = wrap_link(node, output)
|
113
|
+
elsif node.name.downcase == 'img'
|
114
|
+
output = image_text(node)
|
115
|
+
end
|
112
116
|
end
|
113
117
|
|
114
|
-
|
118
|
+
output
|
115
119
|
end
|
116
120
|
|
121
|
+
# rubocop:disable Lint/DuplicateBranch
|
117
122
|
def prefix_whitespace(node)
|
118
123
|
case node.name.downcase
|
119
|
-
|
120
|
-
|
124
|
+
when 'hr'
|
125
|
+
"\n---------------------------------------------------------------\n"
|
121
126
|
|
122
|
-
|
123
|
-
|
127
|
+
when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul'
|
128
|
+
"\n\n"
|
124
129
|
|
125
|
-
|
126
|
-
|
130
|
+
when 'p'
|
131
|
+
"\n\n"
|
127
132
|
|
128
|
-
|
129
|
-
|
133
|
+
when 'tr'
|
134
|
+
"\n"
|
130
135
|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
136
|
+
when 'div'
|
137
|
+
if node.parent.name == 'div' && (node.parent.text.strip == node.text.strip)
|
138
|
+
''
|
139
|
+
else
|
140
|
+
"\n"
|
141
|
+
end
|
137
142
|
|
138
|
-
|
139
|
-
|
143
|
+
when 'td', 'th'
|
144
|
+
"\t"
|
140
145
|
|
141
|
-
|
142
|
-
|
146
|
+
when 'li'
|
147
|
+
'- '
|
143
148
|
end
|
144
149
|
end
|
150
|
+
# rubocop:enable Lint/DuplicateBranch
|
145
151
|
|
152
|
+
# rubocop:disable Lint/DuplicateBranch
|
146
153
|
def suffix_whitespace(node)
|
147
154
|
case node.name.downcase
|
148
|
-
|
149
|
-
|
150
|
-
|
155
|
+
when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
|
156
|
+
# add another line
|
157
|
+
"\n\n"
|
151
158
|
|
152
|
-
|
153
|
-
|
159
|
+
when 'p'
|
160
|
+
"\n\n"
|
154
161
|
|
155
|
-
|
156
|
-
|
157
|
-
"\n"
|
158
|
-
end
|
162
|
+
when 'br'
|
163
|
+
"\n" if next_node_name(node) != 'div' && !next_node_name(node).nil?
|
159
164
|
|
160
|
-
|
161
|
-
|
165
|
+
when 'li'
|
166
|
+
"\n"
|
162
167
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
168
|
+
when 'div'
|
169
|
+
if next_node_is_text?(node)
|
170
|
+
"\n"
|
171
|
+
elsif next_node_name(node) != 'div' && !next_node_name(node).nil?
|
172
|
+
"\n"
|
173
|
+
end
|
169
174
|
end
|
170
175
|
end
|
176
|
+
# rubocop:enable Lint/DuplicateBranch
|
171
177
|
|
172
178
|
# links are returned in [text](link) format
|
173
179
|
def wrap_link(node, output)
|
174
|
-
href = node.attribute(
|
175
|
-
name = node.attribute(
|
180
|
+
href = node.attribute('href')
|
181
|
+
name = node.attribute('name')
|
176
182
|
|
177
183
|
output = output.strip
|
178
184
|
|
179
185
|
# remove double [[ ]]s from linking images
|
180
|
-
if output[0] ==
|
186
|
+
if output[0] == '[' && output[-1] == ']'
|
181
187
|
output = output[1, output.length - 2]
|
182
188
|
|
183
189
|
# for linking images, the title of the <a> overrides the title of the <img>
|
184
|
-
if node.attribute(
|
185
|
-
output = node.attribute("title").to_s
|
186
|
-
end
|
190
|
+
output = node.attribute('title').to_s if node.attribute('title')
|
187
191
|
end
|
188
192
|
|
189
193
|
# if there is no link text, but a title attr
|
190
|
-
if output.empty? && node.attribute(
|
191
|
-
output = node.attribute("title").to_s
|
192
|
-
end
|
194
|
+
output = node.attribute('title').to_s if output.empty? && node.attribute('title')
|
193
195
|
|
194
196
|
if href.nil?
|
195
|
-
|
196
|
-
output = "[#{output}]"
|
197
|
-
end
|
197
|
+
output = "[#{output}]" unless name.nil?
|
198
198
|
else
|
199
199
|
href = href.to_s
|
200
200
|
|
201
201
|
if href != output && href != "mailto:#{output}" &&
|
202
|
-
|
203
|
-
if output.empty?
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
202
|
+
href != "http://#{output}" && href != "https://#{output}"
|
203
|
+
output = if output.empty?
|
204
|
+
href
|
205
|
+
else
|
206
|
+
"[#{output}](#{href})"
|
207
|
+
end
|
208
208
|
end
|
209
209
|
end
|
210
210
|
|
211
211
|
case next_node_name(node)
|
212
|
-
|
213
|
-
|
212
|
+
when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
|
213
|
+
output += "\n"
|
214
214
|
end
|
215
215
|
|
216
216
|
output
|
217
217
|
end
|
218
218
|
|
219
219
|
def image_text(node)
|
220
|
-
if node.attribute(
|
221
|
-
"[
|
222
|
-
elsif node.attribute(
|
223
|
-
"[
|
220
|
+
if node.attribute('title')
|
221
|
+
"[#{node.attribute('title')}]"
|
222
|
+
elsif node.attribute('alt')
|
223
|
+
"[#{node.attribute('alt')}]"
|
224
224
|
else
|
225
|
-
|
225
|
+
''
|
226
226
|
end
|
227
227
|
end
|
228
228
|
|
229
229
|
def next_node_name(node)
|
230
230
|
next_node = node.next_sibling
|
231
|
-
|
231
|
+
until next_node.nil?
|
232
232
|
break if next_node.element?
|
233
|
+
|
233
234
|
next_node = next_node.next_sibling
|
234
235
|
end
|
235
236
|
|
236
|
-
|
237
|
-
|
238
|
-
|
237
|
+
return unless next_node&.element?
|
238
|
+
|
239
|
+
next_node.name.downcase
|
239
240
|
end
|
240
241
|
|
241
242
|
def next_node_is_text?(node)
|
242
|
-
|
243
|
+
!node.next_sibling.nil? && node.next_sibling.text? && !node.next_sibling.text.strip.empty?
|
243
244
|
end
|
244
245
|
|
245
246
|
def previous_node_name(node)
|
246
247
|
previous_node = node.previous_sibling
|
247
|
-
|
248
|
+
until previous_node.nil?
|
248
249
|
break if previous_node.element?
|
250
|
+
|
249
251
|
previous_node = previous_node.previous_sibling
|
250
252
|
end
|
251
253
|
|
252
|
-
|
253
|
-
|
254
|
-
|
254
|
+
return unless previous_node&.element?
|
255
|
+
|
256
|
+
previous_node.name.downcase
|
255
257
|
end
|
256
258
|
|
257
259
|
def previous_node_is_text?(node)
|
258
|
-
|
260
|
+
!node.previous_sibling.nil? && node.previous_sibling.text? && !node.previous_sibling.text.strip.empty?
|
259
261
|
end
|
260
262
|
|
261
263
|
# def previous_node_is_not_text?(node)
|