html2text 0.2.0 → 0.3.1
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/CHANGELOG.md +37 -0
- data/README.md +16 -11
- data/lib/html2text/version.rb +1 -1
- data/lib/html2text.rb +113 -26
- data/spec/examples/basic.html +21 -21
- data/spec/examples/basic.txt +2 -0
- data/spec/examples/dom-processing.html +8 -0
- data/spec/examples/dom-processing.txt +1 -0
- data/spec/examples/empty.html +0 -0
- data/spec/examples/empty.txt +0 -0
- data/spec/examples/full_email.txt +1 -1
- data/spec/examples/huge-msoffice.html +1 -0
- data/spec/examples/huge-msoffice.txt +25872 -0
- data/spec/examples/invalid.html +4 -0
- data/spec/examples/invalid.txt +1 -0
- data/spec/examples/msoffice.html +1 -0
- data/spec/examples/msoffice.txt +12 -0
- data/spec/examples/nested-divs.html +17 -0
- data/spec/examples/nested-divs.txt +12 -0
- data/spec/examples/newlines.html +50 -0
- data/spec/examples/newlines.txt +35 -0
- data/spec/examples/non-breaking-spaces.html +1 -0
- data/spec/examples/non-breaking-spaces.txt +1 -0
- data/spec/examples/pre.html +10 -0
- data/spec/examples/pre.txt +8 -0
- data/spec/examples/test4.html +1 -1
- data/spec/examples/test4.txt +5 -5
- data/spec/examples/utf8-example.html +4 -0
- data/spec/examples/utf8-example.txt +2 -0
- data/spec/examples/windows-1252-example.html +4 -0
- data/spec/examples/windows-1252-example.txt +2 -0
- data/spec/examples/zero-width-non-joiners.html +1 -0
- data/spec/examples/zero-width-non-joiners.txt +1 -0
- data/spec/examples_spec.rb +13 -1
- data/spec/html2text_spec.rb +21 -0
- metadata +96 -34
@@ -0,0 +1 @@
|
|
1
|
+
Hello &nbsnbsp; world
|
@@ -0,0 +1 @@
|
|
1
|
+
<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=us-ascii"><meta name=Generator content="Microsoft Word 15 (filtered medium)"><style><!-- /* Font Definitions */ @font-face {font-family:"Cambria Math"; panose-1:2 4 5 3 5 4 6 3 2 4;} @font-face {font-family:Calibri; panose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal {margin:0cm; margin-bottom:.0001pt; font-size:11.0pt; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} a:link, span.MsoHyperlink {mso-style-priority:99; color:#0563C1; text-decoration:underline;} a:visited, span.MsoHyperlinkFollowed {mso-style-priority:99; color:#954F72; text-decoration:underline;} span.EmailStyle17 {mso-style-type:personal-compose; font-family:"Calibri",sans-serif; color:windowtext;} .MsoChpDefault {mso-style-type:export-only; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} @page WordSection1 {size:612.0pt 792.0pt; margin:72.0pt 72.0pt 72.0pt 72.0pt;} div.WordSection1 {page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext="edit" spidmax="1026" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext="edit"> <o:idmap v:ext="edit" data="1" /> </o:shapelayout></xml><![endif]--></head><body lang=EN-GB link="#0563C1" vlink="#954F72"><div class=WordSection1><p class=MsoNormal>Dear html2text,<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>This is an example email that can be used to test html2text conversion of outlook / exchange emails.<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>The addition of <o:p> tags is very annoying!<o:p></o:p></p><p class=MsoNormal>This is a single line 
return<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal><b>This is bold<o:p></o:p></b></p><p class=MsoNormal><i>This is italic<o:p></o:p></i></p><p class=MsoNormal><u>This is underline<o:p></o:p></u></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>Andrew<o:p></o:p></p></div></body></html>
|
@@ -0,0 +1,12 @@
|
|
1
|
+
Dear html2text,
|
2
|
+
|
3
|
+
This is an example email that can be used to test html2text conversion of outlook / exchange emails.
|
4
|
+
|
5
|
+
The addition of <o:p> tags is very annoying!
|
6
|
+
This is a single line return
|
7
|
+
|
8
|
+
This is bold
|
9
|
+
This is italic
|
10
|
+
This is underline
|
11
|
+
|
12
|
+
Andrew
|
@@ -0,0 +1,17 @@
|
|
1
|
+
<html>
|
2
|
+
<body>
|
3
|
+
<div>
|
4
|
+
Just two divs
|
5
|
+
</div>
|
6
|
+
<div>
|
7
|
+
Hanging out
|
8
|
+
</div>
|
9
|
+
<div><div><div>Nested divs and line breaks</div></div><br></div>
|
10
|
+
<div><div>Nested divs and line breaks</div>More text<br></div>
|
11
|
+
<div><br></div>
|
12
|
+
<div>Just text</div>
|
13
|
+
<div>Just text<br></div>
|
14
|
+
<div>Just text<br><br></div>
|
15
|
+
This is the end!
|
16
|
+
</body>
|
17
|
+
</html>
|
@@ -0,0 +1,50 @@
|
|
1
|
+
<html>
|
2
|
+
<body>
|
3
|
+
<div>
|
4
|
+
Hello
|
5
|
+
<br>
|
6
|
+
</div>
|
7
|
+
<div>
|
8
|
+
How are you?
|
9
|
+
<br>
|
10
|
+
</div>
|
11
|
+
|
12
|
+
<p>
|
13
|
+
How are you?
|
14
|
+
<br>
|
15
|
+
</p>
|
16
|
+
|
17
|
+
<p>
|
18
|
+
How are you?
|
19
|
+
<br>
|
20
|
+
</p>
|
21
|
+
|
22
|
+
<div>
|
23
|
+
Just two divs
|
24
|
+
</div>
|
25
|
+
<div>
|
26
|
+
Hanging out
|
27
|
+
</div>
|
28
|
+
|
29
|
+
This is not the end!
|
30
|
+
<div>
|
31
|
+
How are you again?
|
32
|
+
<br>
|
33
|
+
</div>
|
34
|
+
This is the end!
|
35
|
+
<br>
|
36
|
+
Just kidding
|
37
|
+
<h1>Header 1</h1>
|
38
|
+
Some text
|
39
|
+
<hr>
|
40
|
+
Some more text
|
41
|
+
<p>Paragraph tag!</p>
|
42
|
+
<h2>Header 2</h2>
|
43
|
+
<hr>
|
44
|
+
<h3>Header 3</h3>
|
45
|
+
Some text
|
46
|
+
<h4>Header 4</h4>
|
47
|
+
<p>Paragraph tag!</p>
|
48
|
+
Final line
|
49
|
+
</body>
|
50
|
+
</html>
|
@@ -0,0 +1,35 @@
|
|
1
|
+
Hello
|
2
|
+
How are you?
|
3
|
+
|
4
|
+
How are you?
|
5
|
+
|
6
|
+
How are you?
|
7
|
+
|
8
|
+
Just two divs
|
9
|
+
Hanging out
|
10
|
+
This is not the end!
|
11
|
+
How are you again?
|
12
|
+
This is the end!
|
13
|
+
Just kidding
|
14
|
+
|
15
|
+
Header 1
|
16
|
+
|
17
|
+
Some text
|
18
|
+
---------------------------------------------------------------
|
19
|
+
Some more text
|
20
|
+
|
21
|
+
Paragraph tag!
|
22
|
+
|
23
|
+
Header 2
|
24
|
+
|
25
|
+
---------------------------------------------------------------
|
26
|
+
|
27
|
+
Header 3
|
28
|
+
|
29
|
+
Some text
|
30
|
+
|
31
|
+
Header 4
|
32
|
+
|
33
|
+
Paragraph tag!
|
34
|
+
|
35
|
+
Final line
|
@@ -0,0 +1 @@
|
|
1
|
+
these spaces are non-breaking
|
@@ -0,0 +1 @@
|
|
1
|
+
these spaces are non-breaking
|
data/spec/examples/test4.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1<br />2<br />3<br />4<br />5 6
|
1
|
+
1<br />2<br />3<br />4<br />5 < 6
|
data/spec/examples/test4.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
1
|
2
|
-
2
|
3
|
-
3
|
4
|
-
4
|
5
|
-
5 6
|
1
|
+
1
|
2
|
+
2
|
3
|
+
3
|
4
|
+
4
|
5
|
+
5 < 6
|
@@ -0,0 +1 @@
|
|
1
|
+
<p>foo‌bar</p>
|
@@ -0,0 +1 @@
|
|
1
|
+
foobar
|
data/spec/examples_spec.rb
CHANGED
@@ -17,7 +17,19 @@ describe Html2Text do
|
|
17
17
|
end
|
18
18
|
|
19
19
|
it "converts to text" do
|
20
|
-
|
20
|
+
# Write the output if it failed, for easier comparison
|
21
|
+
if !text.eql?(expected)
|
22
|
+
File.open(filename.sub(".html", ".output"), 'w') do |fp|
|
23
|
+
fp.write(text)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
# Quick check, don't try to generate a 500kb+ diff,
|
28
|
+
# which can halt the rspec for minutes+
|
29
|
+
expect(text.length).to eq expected.length if text.length > 10000
|
30
|
+
|
31
|
+
# More complete check
|
32
|
+
expect(text).to eq expected
|
21
33
|
end
|
22
34
|
end
|
23
35
|
end
|
data/spec/html2text_spec.rb
CHANGED
@@ -19,6 +19,27 @@ describe Html2Text do
|
|
19
19
|
expect(text).to eq("hello world")
|
20
20
|
end
|
21
21
|
end
|
22
|
+
|
23
|
+
context "input value is non-string" do
|
24
|
+
let(:html) { nil }
|
25
|
+
it '(nil)' do
|
26
|
+
expect(text).to eq("")
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
context "input value is non-string" do
|
31
|
+
let(:html) { 1234 }
|
32
|
+
it "(number)" do
|
33
|
+
expect(text).to eq("1234")
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
context "input value is non-string" do
|
38
|
+
let(:html) { 1234.5600 }
|
39
|
+
it "(float number)" do
|
40
|
+
expect(text).to eq("1234.56")
|
41
|
+
end
|
42
|
+
end
|
22
43
|
end
|
23
44
|
|
24
45
|
describe "#remove_leading_and_trailing_whitespace" do
|
metadata
CHANGED
@@ -1,92 +1,107 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jevon Wright
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-06-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 1.10.3
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 1.10.3
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rspec
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec-collection_matchers
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: colorize
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- -
|
59
|
+
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- -
|
66
|
+
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rake
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- -
|
73
|
+
- - ">="
|
74
74
|
- !ruby/object:Gem::Version
|
75
75
|
version: '0'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- -
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: bundler-audit
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
81
95
|
- !ruby/object:Gem::Version
|
82
96
|
version: '0'
|
83
97
|
description: A Ruby component to convert HTML into a plain text format.
|
84
98
|
email:
|
85
|
-
- jevon@
|
99
|
+
- jevon@jevon.org
|
86
100
|
executables: []
|
87
101
|
extensions: []
|
88
102
|
extra_rdoc_files: []
|
89
103
|
files:
|
104
|
+
- CHANGELOG.md
|
90
105
|
- LICENSE.md
|
91
106
|
- README.md
|
92
107
|
- lib/html2text.rb
|
@@ -95,22 +110,46 @@ files:
|
|
95
110
|
- spec/examples/anchors.txt
|
96
111
|
- spec/examples/basic.html
|
97
112
|
- spec/examples/basic.txt
|
113
|
+
- spec/examples/dom-processing.html
|
114
|
+
- spec/examples/dom-processing.txt
|
115
|
+
- spec/examples/empty.html
|
116
|
+
- spec/examples/empty.txt
|
98
117
|
- spec/examples/full_email.html
|
99
118
|
- spec/examples/full_email.txt
|
119
|
+
- spec/examples/huge-msoffice.html
|
120
|
+
- spec/examples/huge-msoffice.txt
|
100
121
|
- spec/examples/images.html
|
101
122
|
- spec/examples/images.txt
|
123
|
+
- spec/examples/invalid.html
|
124
|
+
- spec/examples/invalid.txt
|
102
125
|
- spec/examples/lists.html
|
103
126
|
- spec/examples/lists.txt
|
104
127
|
- spec/examples/more-anchors.html
|
105
128
|
- spec/examples/more-anchors.txt
|
129
|
+
- spec/examples/msoffice.html
|
130
|
+
- spec/examples/msoffice.txt
|
106
131
|
- spec/examples/nbsp.html
|
107
132
|
- spec/examples/nbsp.txt
|
133
|
+
- spec/examples/nested-divs.html
|
134
|
+
- spec/examples/nested-divs.txt
|
135
|
+
- spec/examples/newlines.html
|
136
|
+
- spec/examples/newlines.txt
|
137
|
+
- spec/examples/non-breaking-spaces.html
|
138
|
+
- spec/examples/non-breaking-spaces.txt
|
139
|
+
- spec/examples/pre.html
|
140
|
+
- spec/examples/pre.txt
|
108
141
|
- spec/examples/table.html
|
109
142
|
- spec/examples/table.txt
|
110
143
|
- spec/examples/test3.html
|
111
144
|
- spec/examples/test3.txt
|
112
145
|
- spec/examples/test4.html
|
113
146
|
- spec/examples/test4.txt
|
147
|
+
- spec/examples/utf8-example.html
|
148
|
+
- spec/examples/utf8-example.txt
|
149
|
+
- spec/examples/windows-1252-example.html
|
150
|
+
- spec/examples/windows-1252-example.txt
|
151
|
+
- spec/examples/zero-width-non-joiners.html
|
152
|
+
- spec/examples/zero-width-non-joiners.txt
|
114
153
|
- spec/examples_spec.rb
|
115
154
|
- spec/html2text_spec.rb
|
116
155
|
- spec/spec_helper.rb
|
@@ -124,41 +163,64 @@ require_paths:
|
|
124
163
|
- lib
|
125
164
|
required_ruby_version: !ruby/object:Gem::Requirement
|
126
165
|
requirements:
|
127
|
-
- -
|
166
|
+
- - ">="
|
128
167
|
- !ruby/object:Gem::Version
|
129
168
|
version: '0'
|
130
169
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
131
170
|
requirements:
|
132
|
-
- -
|
171
|
+
- - ">="
|
133
172
|
- !ruby/object:Gem::Version
|
134
173
|
version: '0'
|
135
174
|
requirements: []
|
136
|
-
|
137
|
-
rubygems_version: 2.4.5
|
175
|
+
rubygems_version: 3.0.3
|
138
176
|
signing_key:
|
139
177
|
specification_version: 4
|
140
178
|
summary: Convert HTML into plain text.
|
141
179
|
test_files:
|
142
|
-
- spec/examples/
|
143
|
-
- spec/examples/
|
180
|
+
- spec/examples/nested-divs.html
|
181
|
+
- spec/examples/pre.html
|
182
|
+
- spec/examples/invalid.html
|
183
|
+
- spec/examples/empty.html
|
184
|
+
- spec/examples/table.html
|
144
185
|
- spec/examples/basic.html
|
145
|
-
- spec/examples/basic.txt
|
146
|
-
- spec/examples/full_email.html
|
147
|
-
- spec/examples/full_email.txt
|
148
|
-
- spec/examples/images.html
|
149
|
-
- spec/examples/images.txt
|
150
|
-
- spec/examples/lists.html
|
151
|
-
- spec/examples/lists.txt
|
152
|
-
- spec/examples/more-anchors.html
|
153
|
-
- spec/examples/more-anchors.txt
|
154
186
|
- spec/examples/nbsp.html
|
187
|
+
- spec/examples/utf8-example.html
|
188
|
+
- spec/examples/newlines.txt
|
189
|
+
- spec/examples/full_email.txt
|
190
|
+
- spec/examples/msoffice.html
|
191
|
+
- spec/examples/zero-width-non-joiners.txt
|
192
|
+
- spec/examples/anchors.html
|
155
193
|
- spec/examples/nbsp.txt
|
156
|
-
- spec/examples/
|
157
|
-
- spec/examples/table.txt
|
194
|
+
- spec/examples/zero-width-non-joiners.html
|
158
195
|
- spec/examples/test3.html
|
159
|
-
- spec/examples/test3.txt
|
160
|
-
- spec/examples/test4.html
|
161
196
|
- spec/examples/test4.txt
|
197
|
+
- spec/examples/huge-msoffice.txt
|
198
|
+
- spec/examples/full_email.html
|
199
|
+
- spec/examples/utf8-example.txt
|
200
|
+
- spec/examples/table.txt
|
201
|
+
- spec/examples/huge-msoffice.html
|
202
|
+
- spec/examples/more-anchors.txt
|
203
|
+
- spec/examples/newlines.html
|
204
|
+
- spec/examples/test4.html
|
205
|
+
- spec/examples/basic.txt
|
206
|
+
- spec/examples/lists.html
|
207
|
+
- spec/examples/nested-divs.txt
|
208
|
+
- spec/examples/non-breaking-spaces.html
|
209
|
+
- spec/examples/invalid.txt
|
210
|
+
- spec/examples/empty.txt
|
211
|
+
- spec/examples/images.txt
|
212
|
+
- spec/examples/non-breaking-spaces.txt
|
213
|
+
- spec/examples/dom-processing.txt
|
214
|
+
- spec/examples/test3.txt
|
215
|
+
- spec/examples/dom-processing.html
|
216
|
+
- spec/examples/lists.txt
|
217
|
+
- spec/examples/pre.txt
|
218
|
+
- spec/examples/anchors.txt
|
219
|
+
- spec/examples/more-anchors.html
|
220
|
+
- spec/examples/windows-1252-example.txt
|
221
|
+
- spec/examples/images.html
|
222
|
+
- spec/examples/msoffice.txt
|
223
|
+
- spec/examples/windows-1252-example.html
|
162
224
|
- spec/examples_spec.rb
|
163
225
|
- spec/html2text_spec.rb
|
164
226
|
- spec/spec_helper.rb
|