html2text 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/CHANGELOG.md +50 -0
- data/README.md +19 -14
- data/lib/html2text/version.rb +3 -1
- data/lib/html2text.rb +158 -69
- metadata +90 -73
- data/spec/examples/anchors.html +0 -12
- data/spec/examples/anchors.txt +0 -5
- data/spec/examples/basic.html +0 -21
- data/spec/examples/basic.txt +0 -13
- data/spec/examples/full_email.html +0 -220
- data/spec/examples/full_email.txt +0 -54
- data/spec/examples/images.html +0 -54
- data/spec/examples/images.txt +0 -27
- data/spec/examples/lists.html +0 -24
- data/spec/examples/lists.txt +0 -17
- data/spec/examples/more-anchors.html +0 -14
- data/spec/examples/more-anchors.txt +0 -7
- data/spec/examples/nbsp.html +0 -1
- data/spec/examples/nbsp.txt +0 -1
- data/spec/examples/table.html +0 -53
- data/spec/examples/table.txt +0 -7
- data/spec/examples/test3.html +0 -1
- data/spec/examples/test3.txt +0 -2
- data/spec/examples/test4.html +0 -1
- data/spec/examples/test4.txt +0 -5
- data/spec/examples_spec.rb +0 -29
- data/spec/html2text_spec.rb +0 -37
- data/spec/spec_helper.rb +0 -4
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
NjlhZDRjZjg4MjhjMjcxNGJkNzcyMDg5Mzk0Y2Q0MjA4MTM2MDJmMg==
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 32afc21e326c44b7881358081161b9581c396b167fad44614a96cc0b6df91f23
|
4
|
+
data.tar.gz: fe03a0811cbff965e6b720ad1fdfdd55c0aa1e03165c16c84de7ecac39d65c9d
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
OGU4NGM3ZjYwZGJjYzdmZWFlZWUyMzBkNTI1MzIxZDFhMjIwM2E1ZmI2NDI0
|
11
|
-
ZDk3ODViYmRkZGQ4MWUwNmRkMzFmOTE2NjQ3ZWRkZmQ0M2NlYzI=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
OWQ3MzM4ZTkyODA2ZmE0YThjZTA5MjhjYTQ1YzNiYjhjMzJmNWUyMDViNDE5
|
14
|
-
NGMxNGJjZDAwYzZjODJlYWRhOTc5NjY0YmFhNTZlOGFlMzNiNzE1ODE5Njgw
|
15
|
-
MmY0ODNmZDMzZTdkNjNjNTBmNTRmNzBjNTY3NDNhMjg0YjlmZWQ=
|
6
|
+
metadata.gz: e26ef2f826da8958c56a390bd4242461abdd26a110216ee3903c902e007b5fb26b38a8f358b420abe6eac480b4a452fedba90b75a36eb7f3f0c2bec4dad040a7
|
7
|
+
data.tar.gz: 31515d14c3ca612f2eb9faaf52655639c3c0b72687fed89c41d83fad816af852854057e3ff0803e9f16e5e1d2b62657699cedef5b9240ab114826d936d0ed3c0
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Changelog
|
2
|
+
All notable changes to this project will be documented in this file.
|
3
|
+
|
4
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
5
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
6
|
+
|
7
|
+
## [Unreleased]
|
8
|
+
|
9
|
+
## [0.4.0] - 2024-06-08
|
10
|
+
### Added
|
11
|
+
- Switch from Travis to GitHub Actions for Build and Test
|
12
|
+
- Add rubocop for linting and clean up existing violations ([#36](https://github.com/soundasleep/html2text_ruby/pull/36))
|
13
|
+
|
14
|
+
### Changed
|
15
|
+
- Add support for Ruby 3.x, removed support for Ruby < 3.0 since it is EOL
|
16
|
+
- Allow subclassing of `Html2Text` to override the default behaviour ([#30](https://github.com/soundasleep/html2text_ruby/pull/30))
|
17
|
+
|
18
|
+
### Fixed
|
19
|
+
- Loosen nokogiri dependency to allow for nokogiri < 2.0 ([#17](https://github.com/soundasleep/html2text_ruby/pull/17))
|
20
|
+
- Fix `NoMethodError` when parsing nodes with no name ([#15](https://github.com/soundasleep/html2text_ruby/pull/15))
|
21
|
+
|
22
|
+
## [0.3.1] - 2019-06-12
|
23
|
+
### Security
|
24
|
+
- Bumped nokogiri requirement to ~> 1.10.3, resolving [CVE-2019-11068](https://nvd.nist.gov/vuln/detail/CVE-2019-11068)
|
25
|
+
([#8](https://github.com/soundasleep/html2text_ruby/issues/8))
|
26
|
+
|
27
|
+
## [0.3.0] - 2019-02-15
|
28
|
+
### Added
|
29
|
+
- Zero-width non-joiners are now stripped ([#5](https://github.com/soundasleep/html2text_ruby/pull/5))
|
30
|
+
- Support both UTF-8 and Windows-1252 encoded files
|
31
|
+
- Support converting `<pre>` blocks, including whitespace within these blocks
|
32
|
+
- MS Office (MsoNormal) documents are now rendered closer to actual render output
|
33
|
+
- Note this assumes that the input MS Office document has standard `MsoNormal` CSS.
|
34
|
+
This component is _not_ designed to try and interpret CSS within an HTML document.
|
35
|
+
|
36
|
+
### Changed
|
37
|
+
- Behaviour with multiple and nested `<p>`, `<div>` tags has been improved to be more in line with
|
38
|
+
actual browser render behaviour (see test suite)
|
39
|
+
|
40
|
+
### Fixed
|
41
|
+
- Update nokogiri dependency to 1.8.5
|
42
|
+
|
43
|
+
## [0.2.1] - 2017-09-27
|
44
|
+
### Fixed
|
45
|
+
- Convert non-string input into strings ([#3](https://github.com/soundasleep/html2text_ruby/pull/3))
|
46
|
+
|
47
|
+
[Unreleased]: https://github.com/soundasleep/html2text_ruby/compare/0.3.1...HEAD
|
48
|
+
[0.3.1]: https://github.com/soundasleep/html2text_ruby/compare/0.3.0...0.3.1
|
49
|
+
[0.3.0]: https://github.com/soundasleep/html2text_ruby/compare/0.2.1...0.3.0
|
50
|
+
[0.2.1]: https://github.com/soundasleep/html2text_ruby/compare/0.2.0...0.2.1
|
data/README.md
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
-
html2text
|
2
|
-
|
1
|
+
html2text [RubyGems](https://rubygems.org/gems/html2text)
|
2
|
+
---
|
3
3
|
|
4
|
-
`html2text` is a very simple
|
4
|
+
`html2text` is a very simple gem that uses DOM methods to convert HTML into a format similar to what would be
|
5
|
+
rendered by a browser - perfect for places where you need a quick text representation. For example:
|
5
6
|
|
6
7
|
```html
|
7
8
|
<html>
|
@@ -19,7 +20,7 @@ html2text [
|
41
44
|
```
|
42
45
|
|
43
46
|
See the [original blog post](http://journals.jevon.org/users/jevon-phd/entry/19818) or the related [StackOverflow answer](http://stackoverflow.com/a/2564472/39531).
|
44
47
|
|
45
48
|
## Installing
|
46
49
|
|
47
|
-
|
50
|
+
Add [the gem](https://rubygems.org/gems/html2text) into your Gemfile and run `bundle install`:
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
gem 'html2text'
|
54
|
+
```
|
55
|
+
|
56
|
+
Then you can:
|
48
57
|
|
49
58
|
```ruby
|
50
59
|
require 'html2text'
|
@@ -54,17 +63,13 @@ text = Html2Text.convert(html)
|
|
54
63
|
|
55
64
|
## Tests
|
56
65
|
|
57
|
-
See all of the test cases defined in [spec/examples/](spec/examples/). These can be run with
|
58
|
-
|
59
|
-
```
|
60
|
-
bundle install
|
61
|
-
rspec
|
62
|
-
```
|
66
|
+
See all of the test cases defined in [spec/examples/](spec/examples/). These can be run with `bundle exec rake`.
|
63
67
|
|
64
68
|
## License
|
65
69
|
|
66
|
-
`html2text` is licensed under MIT.
|
70
|
+
`html2text` is [licensed under MIT](LICENSE.md).
|
67
71
|
|
68
72
|
## Other versions
|
69
73
|
|
70
|
-
|
74
|
+
1. [html2text](https://github.com/soundasleep/html2text), the original PHP implementation.
|
75
|
+
2. [actionmailer-html2text](https://github.com/soundasleep/actionmailer-html2text), automatically generate text parts for HTML emails sent with ActionMailer.
|
data/lib/html2text/version.rb
CHANGED
data/lib/html2text.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
|
3
5
|
class Html2Text
|
@@ -8,18 +10,36 @@ class Html2Text
|
|
8
10
|
end
|
9
11
|
|
10
12
|
def self.convert(html)
|
13
|
+
html = html.to_s
|
14
|
+
|
15
|
+
if office_document?(html)
|
16
|
+
# Emulate the CSS rendering of Office documents
|
17
|
+
html = html.gsub('<p class=MsoNormal>', '<br>')
|
18
|
+
.gsub('<o:p> </o:p>', '<br>')
|
19
|
+
.gsub('<o:p></o:p>', '')
|
20
|
+
end
|
21
|
+
|
22
|
+
unless html.include?('<html')
|
23
|
+
# Stop Nokogiri from inserting <p> tags
|
24
|
+
html = "<div>#{html}</div>"
|
25
|
+
end
|
26
|
+
|
11
27
|
html = fix_newlines(replace_entities(html))
|
12
28
|
doc = Nokogiri::HTML(html)
|
13
29
|
|
14
|
-
|
30
|
+
new(doc).convert
|
15
31
|
end
|
16
32
|
|
17
33
|
def self.fix_newlines(text)
|
34
|
+
# rubocop:disable Performance/StringReplacement
|
18
35
|
text.gsub("\r\n", "\n").gsub("\r", "\n")
|
36
|
+
# rubocop:enable Performance/StringReplacement
|
19
37
|
end
|
20
38
|
|
21
39
|
def self.replace_entities(text)
|
22
|
-
|
40
|
+
# rubocop:disable Performance/StringReplacement
|
41
|
+
text.gsub('&nbsp;', ' ').gsub("\u00a0", ' ').gsub('&zwnj;', '')
|
42
|
+
# rubocop:enable Performance/StringReplacement
|
23
43
|
end
|
24
44
|
|
25
45
|
def convert
|
@@ -29,149 +49,218 @@ class Html2Text
|
|
29
49
|
output.strip
|
30
50
|
end
|
31
51
|
|
52
|
+
DO_NOT_TOUCH_WHITESPACE = '<do-not-touch-whitespace>'
|
53
|
+
|
32
54
|
def remove_leading_and_trailing_whitespace(text)
|
33
|
-
|
55
|
+
# ignore any <pre> blocks, which we don't want to interact with
|
56
|
+
pre_blocks = text.split(DO_NOT_TOUCH_WHITESPACE)
|
57
|
+
|
58
|
+
output = []
|
59
|
+
pre_blocks.each.with_index do |block, index|
|
60
|
+
output << if index.even?
|
61
|
+
block.gsub(/[ \t]*\n[ \t]*/im, "\n").gsub(/ *\t */im, "\t")
|
62
|
+
else
|
63
|
+
block
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
output.join
|
68
|
+
end
|
69
|
+
|
70
|
+
private_class_method def self.office_document?(text)
|
71
|
+
text.include?('urn:schemas-microsoft-com:office')
|
34
72
|
end
|
35
73
|
|
74
|
+
private
|
75
|
+
|
36
76
|
def remove_unnecessary_empty_lines(text)
|
37
77
|
text.gsub(/\n\n\n*/im, "\n\n")
|
38
78
|
end
|
39
79
|
|
40
80
|
def trimmed_whitespace(text)
|
41
81
|
# Replace whitespace characters with a space (equivalent to \s)
|
42
|
-
text
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
break if next_node.element?
|
49
|
-
next_node = next_node.next_sibling
|
50
|
-
end
|
51
|
-
|
52
|
-
if next_node && next_node.element?
|
53
|
-
next_node.name.downcase
|
82
|
+
# and force any text encoding into UTF-8
|
83
|
+
if text.valid_encoding?
|
84
|
+
text.gsub(/[\t\n\f\r ]+/im, ' ')
|
85
|
+
else
|
86
|
+
text.force_encoding('WINDOWS-1252')
|
87
|
+
trimmed_whitespace(text.encode('UTF-16be', invalid: :replace, replace: '?').encode('UTF-8'))
|
54
88
|
end
|
55
89
|
end
|
56
90
|
|
57
91
|
def iterate_over(node)
|
92
|
+
return "\n" if node.name.downcase == 'br' && next_node_is_text?(node)
|
93
|
+
|
58
94
|
return trimmed_whitespace(node.text) if node.text?
|
59
95
|
|
60
|
-
if [
|
61
|
-
|
62
|
-
|
96
|
+
return '' if %w[style head title meta script].include?(node.name.downcase)
|
97
|
+
|
98
|
+
return "\n#{DO_NOT_TOUCH_WHITESPACE}#{node.text}#{DO_NOT_TOUCH_WHITESPACE}" if node.name.downcase == 'pre'
|
63
99
|
|
64
100
|
output = []
|
65
101
|
|
66
102
|
output << prefix_whitespace(node)
|
67
103
|
output += node.children.map do |child|
|
68
|
-
iterate_over(child)
|
104
|
+
iterate_over(child) unless child.name.nil?
|
69
105
|
end
|
70
106
|
output << suffix_whitespace(node)
|
71
107
|
|
72
|
-
output = output.compact.join
|
108
|
+
output = output.compact.join || ''
|
73
109
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
110
|
+
unless node.name.nil?
|
111
|
+
if node.name.downcase == 'a'
|
112
|
+
output = wrap_link(node, output)
|
113
|
+
elsif node.name.downcase == 'img'
|
114
|
+
output = image_text(node)
|
115
|
+
end
|
79
116
|
end
|
80
117
|
|
81
118
|
output
|
82
119
|
end
|
83
120
|
|
121
|
+
# rubocop:disable Lint/DuplicateBranch
|
84
122
|
def prefix_whitespace(node)
|
85
123
|
case node.name.downcase
|
86
|
-
|
87
|
-
|
124
|
+
when 'hr'
|
125
|
+
"\n---------------------------------------------------------------\n"
|
88
126
|
|
89
|
-
|
90
|
-
|
127
|
+
when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul'
|
128
|
+
"\n\n"
|
129
|
+
|
130
|
+
when 'p'
|
131
|
+
"\n\n"
|
132
|
+
|
133
|
+
when 'tr'
|
134
|
+
"\n"
|
91
135
|
|
92
|
-
|
136
|
+
when 'div'
|
137
|
+
if node.parent.name == 'div' && (node.parent.text.strip == node.text.strip)
|
138
|
+
''
|
139
|
+
else
|
93
140
|
"\n"
|
141
|
+
end
|
94
142
|
|
95
|
-
|
96
|
-
|
143
|
+
when 'td', 'th'
|
144
|
+
"\t"
|
97
145
|
|
98
|
-
|
99
|
-
|
146
|
+
when 'li'
|
147
|
+
'- '
|
100
148
|
end
|
101
149
|
end
|
150
|
+
# rubocop:enable Lint/DuplicateBranch
|
102
151
|
|
152
|
+
# rubocop:disable Lint/DuplicateBranch
|
103
153
|
def suffix_whitespace(node)
|
104
154
|
case node.name.downcase
|
105
|
-
|
106
|
-
|
107
|
-
|
155
|
+
when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
|
156
|
+
# add another line
|
157
|
+
"\n\n"
|
108
158
|
|
109
|
-
|
110
|
-
|
159
|
+
when 'p'
|
160
|
+
"\n\n"
|
111
161
|
|
112
|
-
|
113
|
-
|
162
|
+
when 'br'
|
163
|
+
"\n" if next_node_name(node) != 'div' && !next_node_name(node).nil?
|
164
|
+
|
165
|
+
when 'li'
|
166
|
+
"\n"
|
114
167
|
|
115
|
-
|
116
|
-
|
117
|
-
"\n"
|
168
|
+
when 'div'
|
169
|
+
if next_node_is_text?(node)
|
170
|
+
"\n"
|
171
|
+
elsif next_node_name(node) != 'div' && !next_node_name(node).nil?
|
172
|
+
"\n"
|
173
|
+
end
|
118
174
|
end
|
119
175
|
end
|
176
|
+
# rubocop:enable Lint/DuplicateBranch
|
120
177
|
|
121
178
|
# links are returned in [text](link) format
|
122
179
|
def wrap_link(node, output)
|
123
|
-
href = node.attribute(
|
124
|
-
name = node.attribute(
|
180
|
+
href = node.attribute('href')
|
181
|
+
name = node.attribute('name')
|
125
182
|
|
126
183
|
output = output.strip
|
127
184
|
|
128
185
|
# remove double [[ ]]s from linking images
|
129
|
-
if output[0] ==
|
186
|
+
if output[0] == '[' && output[-1] == ']'
|
130
187
|
output = output[1, output.length - 2]
|
131
188
|
|
132
189
|
# for linking images, the title of the <a> overrides the title of the <img>
|
133
|
-
if node.attribute(
|
134
|
-
output = node.attribute("title").to_s
|
135
|
-
end
|
190
|
+
output = node.attribute('title').to_s if node.attribute('title')
|
136
191
|
end
|
137
192
|
|
138
193
|
# if there is no link text, but a title attr
|
139
|
-
if output.empty? && node.attribute(
|
140
|
-
output = node.attribute("title").to_s
|
141
|
-
end
|
194
|
+
output = node.attribute('title').to_s if output.empty? && node.attribute('title')
|
142
195
|
|
143
196
|
if href.nil?
|
144
|
-
|
145
|
-
output = "[#{output}]"
|
146
|
-
end
|
197
|
+
output = "[#{output}]" unless name.nil?
|
147
198
|
else
|
148
199
|
href = href.to_s
|
149
200
|
|
150
201
|
if href != output && href != "mailto:#{output}" &&
|
151
|
-
|
152
|
-
if output.empty?
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
202
|
+
href != "http://#{output}" && href != "https://#{output}"
|
203
|
+
output = if output.empty?
|
204
|
+
href
|
205
|
+
else
|
206
|
+
"[#{output}](#{href})"
|
207
|
+
end
|
157
208
|
end
|
158
209
|
end
|
159
210
|
|
160
211
|
case next_node_name(node)
|
161
|
-
|
162
|
-
|
212
|
+
when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
|
213
|
+
output += "\n"
|
163
214
|
end
|
164
215
|
|
165
216
|
output
|
166
217
|
end
|
167
218
|
|
168
219
|
def image_text(node)
|
169
|
-
if node.attribute(
|
170
|
-
"[
|
171
|
-
elsif node.attribute(
|
172
|
-
"[
|
220
|
+
if node.attribute('title')
|
221
|
+
"[#{node.attribute('title')}]"
|
222
|
+
elsif node.attribute('alt')
|
223
|
+
"[#{node.attribute('alt')}]"
|
173
224
|
else
|
174
|
-
|
225
|
+
''
|
175
226
|
end
|
176
227
|
end
|
228
|
+
|
229
|
+
def next_node_name(node)
|
230
|
+
next_node = node.next_sibling
|
231
|
+
until next_node.nil?
|
232
|
+
break if next_node.element?
|
233
|
+
|
234
|
+
next_node = next_node.next_sibling
|
235
|
+
end
|
236
|
+
|
237
|
+
return unless next_node&.element?
|
238
|
+
|
239
|
+
next_node.name.downcase
|
240
|
+
end
|
241
|
+
|
242
|
+
def next_node_is_text?(node)
|
243
|
+
!node.next_sibling.nil? && node.next_sibling.text? && !node.next_sibling.text.strip.empty?
|
244
|
+
end
|
245
|
+
|
246
|
+
def previous_node_name(node)
|
247
|
+
previous_node = node.previous_sibling
|
248
|
+
until previous_node.nil?
|
249
|
+
break if previous_node.element?
|
250
|
+
|
251
|
+
previous_node = previous_node.previous_sibling
|
252
|
+
end
|
253
|
+
|
254
|
+
return unless previous_node&.element?
|
255
|
+
|
256
|
+
previous_node.name.downcase
|
257
|
+
end
|
258
|
+
|
259
|
+
def previous_node_is_text?(node)
|
260
|
+
!node.previous_sibling.nil? && node.previous_sibling.text? && !node.previous_sibling.text.strip.empty?
|
261
|
+
end
|
262
|
+
|
263
|
+
# def previous_node_is_not_text?(node)
|
264
|
+
# return node.previous_sibling.nil? || !node.previous_sibling.text? || node.previous_sibling.text.strip.empty?
|
265
|
+
# end
|
177
266
|
end
|