html2text 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +26 -4
- data/README.md +5 -5
- data/lib/html2text/version.rb +3 -1
- data/lib/html2text.rb +108 -106
- metadata +78 -109
- data/spec/examples/anchors.html +0 -12
- data/spec/examples/anchors.txt +0 -5
- data/spec/examples/basic.html +0 -21
- data/spec/examples/basic.txt +0 -15
- data/spec/examples/dom-processing.html +0 -8
- data/spec/examples/dom-processing.txt +0 -1
- data/spec/examples/empty.html +0 -0
- data/spec/examples/empty.txt +0 -0
- data/spec/examples/full_email.html +0 -220
- data/spec/examples/full_email.txt +0 -54
- data/spec/examples/huge-msoffice.html +0 -1
- data/spec/examples/huge-msoffice.txt +0 -25872
- data/spec/examples/images.html +0 -54
- data/spec/examples/images.txt +0 -27
- data/spec/examples/invalid.html +0 -4
- data/spec/examples/invalid.txt +0 -1
- data/spec/examples/lists.html +0 -24
- data/spec/examples/lists.txt +0 -17
- data/spec/examples/more-anchors.html +0 -14
- data/spec/examples/more-anchors.txt +0 -7
- data/spec/examples/msoffice.html +0 -1
- data/spec/examples/msoffice.txt +0 -12
- data/spec/examples/nbsp.html +0 -1
- data/spec/examples/nbsp.txt +0 -1
- data/spec/examples/nested-divs.html +0 -17
- data/spec/examples/nested-divs.txt +0 -12
- data/spec/examples/newlines.html +0 -50
- data/spec/examples/newlines.txt +0 -35
- data/spec/examples/non-breaking-spaces.html +0 -1
- data/spec/examples/non-breaking-spaces.txt +0 -1
- data/spec/examples/pre.html +0 -10
- data/spec/examples/pre.txt +0 -8
- data/spec/examples/table.html +0 -53
- data/spec/examples/table.txt +0 -7
- data/spec/examples/test3.html +0 -1
- data/spec/examples/test3.txt +0 -2
- data/spec/examples/test4.html +0 -1
- data/spec/examples/test4.txt +0 -5
- data/spec/examples/utf8-example.html +0 -4
- data/spec/examples/utf8-example.txt +0 -2
- data/spec/examples/windows-1252-example.html +0 -4
- data/spec/examples/windows-1252-example.txt +0 -2
- data/spec/examples/zero-width-non-joiners.html +0 -1
- data/spec/examples/zero-width-non-joiners.txt +0 -1
- data/spec/examples_spec.rb +0 -41
- data/spec/html2text_spec.rb +0 -58
- data/spec/spec_helper.rb +0 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 32afc21e326c44b7881358081161b9581c396b167fad44614a96cc0b6df91f23
|
4
|
+
data.tar.gz: fe03a0811cbff965e6b720ad1fdfdd55c0aa1e03165c16c84de7ecac39d65c9d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e26ef2f826da8958c56a390bd4242461abdd26a110216ee3903c902e007b5fb26b38a8f358b420abe6eac480b4a452fedba90b75a36eb7f3f0c2bec4dad040a7
|
7
|
+
data.tar.gz: 31515d14c3ca612f2eb9faaf52655639c3c0b72687fed89c41d83fad816af852854057e3ff0803e9f16e5e1d2b62657699cedef5b9240ab114826d936d0ed3c0
|
data/CHANGELOG.md
CHANGED
@@ -5,8 +5,28 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
5
5
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
6
6
|
|
7
7
|
## [Unreleased]
|
8
|
+
|
9
|
+
## [0.4.0] - 2024-06-08
|
10
|
+
### Added
|
11
|
+
- Switch from Travis to GitHub Actions for build and test
|
12
|
+
- Add rubocop for linting and clean up existing violations ([#36](https://github.com/soundasleep/html2text_ruby/pull/36))
|
13
|
+
|
14
|
+
### Changed
|
15
|
+
- Add support for Ruby 3.x and remove support for Ruby < 3.0, which is EOL
|
16
|
+
- Allow subclassing of `Html2Text` to override the default behaviour ([#30](https://github.com/soundasleep/html2text_ruby/pull/30))
|
17
|
+
|
18
|
+
### Fixed
|
19
|
+
- Loosen nokogiri dependency to allow for nokogiri < 2.0 ([#17](https://github.com/soundasleep/html2text_ruby/pull/17))
|
20
|
+
- Fix `NoMethodError` when parsing nodes with no name ([#15](https://github.com/soundasleep/html2text_ruby/pull/15))
|
21
|
+
|
22
|
+
## [0.3.1] - 2019-06-12
|
23
|
+
### Security
|
24
|
+
- Bumped nokogiri requirement to ~> 1.10.3, resolving [CVE-2019-11068](https://nvd.nist.gov/vuln/detail/CVE-2019-11068)
|
25
|
+
([#8](https://github.com/soundasleep/html2text_ruby/issues/8))
|
26
|
+
|
27
|
+
## [0.3.0] - 2019-02-15
|
8
28
|
### Added
|
9
|
-
- Zero-width non-joiners are now stripped [#5](https://github.com/soundasleep/html2text_ruby/pull/5)
|
29
|
+
- Zero-width non-joiners are now stripped ([#5](https://github.com/soundasleep/html2text_ruby/pull/5))
|
10
30
|
- Support both UTF-8 and Windows-1252 encoded files
|
11
31
|
- Support converting `<pre>` blocks, including whitespace within these blocks
|
12
32
|
- MS Office (MsoNormal) documents are now rendered closer to actual render output
|
@@ -22,7 +42,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
22
42
|
|
23
43
|
## [0.2.1] - 2017-09-27
|
24
44
|
### Fixed
|
25
|
-
- Convert non-string input into strings [#3](https://github.com/soundasleep/html2text_ruby/pull/3)
|
45
|
+
- Convert non-string input into strings ([#3](https://github.com/soundasleep/html2text_ruby/pull/3))
|
26
46
|
|
27
|
-
[Unreleased]: https://github.com/soundasleep/
|
28
|
-
[0.
|
47
|
+
[Unreleased]: https://github.com/soundasleep/html2text_ruby/compare/0.4.0...HEAD
|
48
|
+
[0.3.1]: https://github.com/soundasleep/html2text_ruby/compare/0.3.0...0.3.1
|
49
|
+
[0.3.0]: https://github.com/soundasleep/html2text_ruby/compare/0.2.1...0.3.0
|
50
|
+
[0.2.1]: https://github.com/soundasleep/html2text_ruby/releases/tag/0.2.1
|
data/README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
html2text
|
2
|
-
|
1
|
+
html2text  [](https://rubygems.org/gems/html2text)
|
2
|
+
---
|
3
3
|
|
4
4
|
`html2text` is a very simple gem that uses DOM methods to convert HTML into a format similar to what would be
|
5
5
|
rendered by a browser - perfect for places where you need a quick text representation. For example:
|
@@ -20,7 +20,7 @@ rendered by a browser - perfect for places where you need a quick text represent
|
|
20
20
|
<div>Another div</div>
|
21
21
|
<div>A div<div>within a div</div></div>
|
22
22
|
|
23
|
-
<a href="
|
23
|
+
<a href="https://foo.com">A link</a>
|
24
24
|
|
25
25
|
</body>
|
26
26
|
</html>
|
@@ -40,7 +40,7 @@ Another div
|
|
40
40
|
A div
|
41
41
|
within a div
|
42
42
|
|
43
|
-
[A link](
|
43
|
+
[A link](https://foo.com)
|
44
44
|
```
|
45
45
|
|
46
46
|
See the [original blog post](http://journals.jevon.org/users/jevon-phd/entry/19818) or the related [StackOverflow answer](http://stackoverflow.com/a/2564472/39531).
|
@@ -63,7 +63,7 @@ text = Html2Text.convert(html)
|
|
63
63
|
|
64
64
|
## Tests
|
65
65
|
|
66
|
-
See all of the test cases defined in [spec/examples/](spec/examples/). These can be run with `bundle
|
66
|
+
See all of the test cases defined in [spec/examples/](spec/examples/). These can be run with `bundle exec rake`.
|
67
67
|
|
68
68
|
## License
|
69
69
|
|
data/lib/html2text/version.rb
CHANGED
data/lib/html2text.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
|
3
5
|
class Html2Text
|
@@ -10,14 +12,14 @@ class Html2Text
|
|
10
12
|
def self.convert(html)
|
11
13
|
html = html.to_s
|
12
14
|
|
13
|
-
if
|
15
|
+
if office_document?(html)
|
14
16
|
# Emulate the CSS rendering of Office documents
|
15
|
-
html = html.gsub(
|
16
|
-
|
17
|
-
|
17
|
+
html = html.gsub('<p class=MsoNormal>', '<br>')
|
18
|
+
.gsub('<o:p> </o:p>', '<br>')
|
19
|
+
.gsub('<o:p></o:p>', '')
|
18
20
|
end
|
19
21
|
|
20
|
-
|
22
|
+
unless html.include?('<html')
|
21
23
|
# Stop Nokogiri from inserting in <p> tags
|
22
24
|
html = "<div>#{html}</div>"
|
23
25
|
end
|
@@ -25,25 +27,29 @@ class Html2Text
|
|
25
27
|
html = fix_newlines(replace_entities(html))
|
26
28
|
doc = Nokogiri::HTML(html)
|
27
29
|
|
28
|
-
|
30
|
+
new(doc).convert
|
29
31
|
end
|
30
32
|
|
31
33
|
def self.fix_newlines(text)
|
34
|
+
# rubocop:disable Performance/StringReplacement
|
32
35
|
text.gsub("\r\n", "\n").gsub("\r", "\n")
|
36
|
+
# rubocop:enable Performance/StringReplacement
|
33
37
|
end
|
34
38
|
|
35
39
|
def self.replace_entities(text)
|
36
|
-
|
40
|
+
# rubocop:disable Performance/StringReplacement
|
41
|
+
text.gsub('&nbsp;', ' ').gsub("\u00a0", ' ').gsub('&zwnj;', '')
|
42
|
+
# rubocop:enable Performance/StringReplacement
|
37
43
|
end
|
38
44
|
|
39
45
|
def convert
|
40
46
|
output = iterate_over(doc)
|
41
47
|
output = remove_leading_and_trailing_whitespace(output)
|
42
48
|
output = remove_unnecessary_empty_lines(output)
|
43
|
-
|
49
|
+
output.strip
|
44
50
|
end
|
45
51
|
|
46
|
-
DO_NOT_TOUCH_WHITESPACE =
|
52
|
+
DO_NOT_TOUCH_WHITESPACE = '<do-not-touch-whitespace>'
|
47
53
|
|
48
54
|
def remove_leading_and_trailing_whitespace(text)
|
49
55
|
# ignore any <pre> blocks, which we don't want to interact with
|
@@ -51,22 +57,22 @@ class Html2Text
|
|
51
57
|
|
52
58
|
output = []
|
53
59
|
pre_blocks.each.with_index do |block, index|
|
54
|
-
if index
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
60
|
+
output << if index.even?
|
61
|
+
block.gsub(/[ \t]*\n[ \t]*/im, "\n").gsub(/ *\t */im, "\t")
|
62
|
+
else
|
63
|
+
block
|
64
|
+
end
|
59
65
|
end
|
60
66
|
|
61
|
-
output.join
|
67
|
+
output.join
|
62
68
|
end
|
63
69
|
|
64
|
-
|
65
|
-
|
66
|
-
def self.is_office_document?(text)
|
67
|
-
text.include?("urn:schemas-microsoft-com:office")
|
70
|
+
private_class_method def self.office_document?(text)
|
71
|
+
text.include?('urn:schemas-microsoft-com:office')
|
68
72
|
end
|
69
73
|
|
74
|
+
private
|
75
|
+
|
70
76
|
def remove_unnecessary_empty_lines(text)
|
71
77
|
text.gsub(/\n\n\n*/im, "\n\n")
|
72
78
|
end
|
@@ -75,187 +81,183 @@ class Html2Text
|
|
75
81
|
# Replace whitespace characters with a space (equivalent to \s)
|
76
82
|
# and force any text encoding into UTF-8
|
77
83
|
if text.valid_encoding?
|
78
|
-
text.gsub(/[\t\n\f\r ]+/im,
|
84
|
+
text.gsub(/[\t\n\f\r ]+/im, ' ')
|
79
85
|
else
|
80
|
-
text.force_encoding(
|
81
|
-
|
86
|
+
text.force_encoding('WINDOWS-1252')
|
87
|
+
trimmed_whitespace(text.encode('UTF-16be', invalid: :replace, replace: '?').encode('UTF-8'))
|
82
88
|
end
|
83
89
|
end
|
84
90
|
|
85
91
|
def iterate_over(node)
|
86
|
-
return "\n" if node.name.downcase ==
|
92
|
+
return "\n" if node.name.downcase == 'br' && next_node_is_text?(node)
|
87
93
|
|
88
94
|
return trimmed_whitespace(node.text) if node.text?
|
89
95
|
|
90
|
-
if [
|
91
|
-
return ""
|
92
|
-
end
|
96
|
+
return '' if %w[style head title meta script].include?(node.name.downcase)
|
93
97
|
|
94
|
-
if node.name.downcase ==
|
95
|
-
return "\n#{DO_NOT_TOUCH_WHITESPACE}#{node.text}#{DO_NOT_TOUCH_WHITESPACE}"
|
96
|
-
end
|
98
|
+
return "\n#{DO_NOT_TOUCH_WHITESPACE}#{node.text}#{DO_NOT_TOUCH_WHITESPACE}" if node.name.downcase == 'pre'
|
97
99
|
|
98
100
|
output = []
|
99
101
|
|
100
102
|
output << prefix_whitespace(node)
|
101
103
|
output += node.children.map do |child|
|
102
|
-
iterate_over(child)
|
104
|
+
iterate_over(child) unless child.name.nil?
|
103
105
|
end
|
104
106
|
output << suffix_whitespace(node)
|
105
107
|
|
106
|
-
output = output.compact.join
|
108
|
+
output = output.compact.join || ''
|
107
109
|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
110
|
+
unless node.name.nil?
|
111
|
+
if node.name.downcase == 'a'
|
112
|
+
output = wrap_link(node, output)
|
113
|
+
elsif node.name.downcase == 'img'
|
114
|
+
output = image_text(node)
|
115
|
+
end
|
112
116
|
end
|
113
117
|
|
114
|
-
|
118
|
+
output
|
115
119
|
end
|
116
120
|
|
121
|
+
# rubocop:disable Lint/DuplicateBranch
|
117
122
|
def prefix_whitespace(node)
|
118
123
|
case node.name.downcase
|
119
|
-
|
120
|
-
|
124
|
+
when 'hr'
|
125
|
+
"\n---------------------------------------------------------------\n"
|
121
126
|
|
122
|
-
|
123
|
-
|
127
|
+
when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul'
|
128
|
+
"\n\n"
|
124
129
|
|
125
|
-
|
126
|
-
|
130
|
+
when 'p'
|
131
|
+
"\n\n"
|
127
132
|
|
128
|
-
|
129
|
-
|
133
|
+
when 'tr'
|
134
|
+
"\n"
|
130
135
|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
136
|
+
when 'div'
|
137
|
+
if node.parent.name == 'div' && (node.parent.text.strip == node.text.strip)
|
138
|
+
''
|
139
|
+
else
|
140
|
+
"\n"
|
141
|
+
end
|
137
142
|
|
138
|
-
|
139
|
-
|
143
|
+
when 'td', 'th'
|
144
|
+
"\t"
|
140
145
|
|
141
|
-
|
142
|
-
|
146
|
+
when 'li'
|
147
|
+
'- '
|
143
148
|
end
|
144
149
|
end
|
150
|
+
# rubocop:enable Lint/DuplicateBranch
|
145
151
|
|
152
|
+
# rubocop:disable Lint/DuplicateBranch
|
146
153
|
def suffix_whitespace(node)
|
147
154
|
case node.name.downcase
|
148
|
-
|
149
|
-
|
150
|
-
|
155
|
+
when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
|
156
|
+
# add another line
|
157
|
+
"\n\n"
|
151
158
|
|
152
|
-
|
153
|
-
|
159
|
+
when 'p'
|
160
|
+
"\n\n"
|
154
161
|
|
155
|
-
|
156
|
-
|
157
|
-
"\n"
|
158
|
-
end
|
162
|
+
when 'br'
|
163
|
+
"\n" if next_node_name(node) != 'div' && !next_node_name(node).nil?
|
159
164
|
|
160
|
-
|
161
|
-
|
165
|
+
when 'li'
|
166
|
+
"\n"
|
162
167
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
168
|
+
when 'div'
|
169
|
+
if next_node_is_text?(node)
|
170
|
+
"\n"
|
171
|
+
elsif next_node_name(node) != 'div' && !next_node_name(node).nil?
|
172
|
+
"\n"
|
173
|
+
end
|
169
174
|
end
|
170
175
|
end
|
176
|
+
# rubocop:enable Lint/DuplicateBranch
|
171
177
|
|
172
178
|
# links are returned in [text](link) format
|
173
179
|
def wrap_link(node, output)
|
174
|
-
href = node.attribute(
|
175
|
-
name = node.attribute(
|
180
|
+
href = node.attribute('href')
|
181
|
+
name = node.attribute('name')
|
176
182
|
|
177
183
|
output = output.strip
|
178
184
|
|
179
185
|
# remove double [[ ]]s from linking images
|
180
|
-
if output[0] ==
|
186
|
+
if output[0] == '[' && output[-1] == ']'
|
181
187
|
output = output[1, output.length - 2]
|
182
188
|
|
183
189
|
# for linking images, the title of the <a> overrides the title of the <img>
|
184
|
-
if node.attribute(
|
185
|
-
output = node.attribute("title").to_s
|
186
|
-
end
|
190
|
+
output = node.attribute('title').to_s if node.attribute('title')
|
187
191
|
end
|
188
192
|
|
189
193
|
# if there is no link text, but a title attr
|
190
|
-
if output.empty? && node.attribute(
|
191
|
-
output = node.attribute("title").to_s
|
192
|
-
end
|
194
|
+
output = node.attribute('title').to_s if output.empty? && node.attribute('title')
|
193
195
|
|
194
196
|
if href.nil?
|
195
|
-
|
196
|
-
output = "[#{output}]"
|
197
|
-
end
|
197
|
+
output = "[#{output}]" unless name.nil?
|
198
198
|
else
|
199
199
|
href = href.to_s
|
200
200
|
|
201
201
|
if href != output && href != "mailto:#{output}" &&
|
202
|
-
|
203
|
-
if output.empty?
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
202
|
+
href != "http://#{output}" && href != "https://#{output}"
|
203
|
+
output = if output.empty?
|
204
|
+
href
|
205
|
+
else
|
206
|
+
"[#{output}](#{href})"
|
207
|
+
end
|
208
208
|
end
|
209
209
|
end
|
210
210
|
|
211
211
|
case next_node_name(node)
|
212
|
-
|
213
|
-
|
212
|
+
when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
|
213
|
+
output += "\n"
|
214
214
|
end
|
215
215
|
|
216
216
|
output
|
217
217
|
end
|
218
218
|
|
219
219
|
def image_text(node)
|
220
|
-
if node.attribute(
|
221
|
-
"[
|
222
|
-
elsif node.attribute(
|
223
|
-
"[
|
220
|
+
if node.attribute('title')
|
221
|
+
"[#{node.attribute('title')}]"
|
222
|
+
elsif node.attribute('alt')
|
223
|
+
"[#{node.attribute('alt')}]"
|
224
224
|
else
|
225
|
-
|
225
|
+
''
|
226
226
|
end
|
227
227
|
end
|
228
228
|
|
229
229
|
def next_node_name(node)
|
230
230
|
next_node = node.next_sibling
|
231
|
-
|
231
|
+
until next_node.nil?
|
232
232
|
break if next_node.element?
|
233
|
+
|
233
234
|
next_node = next_node.next_sibling
|
234
235
|
end
|
235
236
|
|
236
|
-
|
237
|
-
|
238
|
-
|
237
|
+
return unless next_node&.element?
|
238
|
+
|
239
|
+
next_node.name.downcase
|
239
240
|
end
|
240
241
|
|
241
242
|
def next_node_is_text?(node)
|
242
|
-
|
243
|
+
!node.next_sibling.nil? && node.next_sibling.text? && !node.next_sibling.text.strip.empty?
|
243
244
|
end
|
244
245
|
|
245
246
|
def previous_node_name(node)
|
246
247
|
previous_node = node.previous_sibling
|
247
|
-
|
248
|
+
until previous_node.nil?
|
248
249
|
break if previous_node.element?
|
250
|
+
|
249
251
|
previous_node = previous_node.previous_sibling
|
250
252
|
end
|
251
253
|
|
252
|
-
|
253
|
-
|
254
|
-
|
254
|
+
return unless previous_node&.element?
|
255
|
+
|
256
|
+
previous_node.name.downcase
|
255
257
|
end
|
256
258
|
|
257
259
|
def previous_node_is_text?(node)
|
258
|
-
|
260
|
+
!node.previous_sibling.nil? && node.previous_sibling.text? && !node.previous_sibling.text.strip.empty?
|
259
261
|
end
|
260
262
|
|
261
263
|
# def previous_node_is_not_text?(node)
|