html2text 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGELOG.md +28 -0
- data/README.md +7 -9
- data/lib/html2text.rb +112 -26
- data/lib/html2text/version.rb +1 -1
- data/spec/examples/basic.html +21 -21
- data/spec/examples/basic.txt +2 -0
- data/spec/examples/dom-processing.html +8 -0
- data/spec/examples/dom-processing.txt +1 -0
- data/spec/examples/empty.html +0 -0
- data/spec/examples/empty.txt +0 -0
- data/spec/examples/full_email.txt +1 -1
- data/spec/examples/huge-msoffice.html +1 -0
- data/spec/examples/huge-msoffice.txt +25872 -0
- data/spec/examples/invalid.html +4 -0
- data/spec/examples/invalid.txt +1 -0
- data/spec/examples/msoffice.html +1 -0
- data/spec/examples/msoffice.txt +12 -0
- data/spec/examples/nested-divs.html +17 -0
- data/spec/examples/nested-divs.txt +12 -0
- data/spec/examples/newlines.html +50 -0
- data/spec/examples/newlines.txt +35 -0
- data/spec/examples/pre.html +10 -0
- data/spec/examples/pre.txt +8 -0
- data/spec/examples/test4.html +1 -1
- data/spec/examples/test4.txt +5 -5
- data/spec/examples/utf8-example.html +4 -0
- data/spec/examples/utf8-example.txt +2 -0
- data/spec/examples/windows-1252-example.html +4 -0
- data/spec/examples/windows-1252-example.txt +2 -0
- data/spec/examples/zero-width-non-joiners.html +1 -0
- data/spec/examples/zero-width-non-joiners.txt +1 -0
- data/spec/examples_spec.rb +13 -1
- metadata +67 -23
@@ -0,0 +1 @@
|
|
1
|
+
Hello &nbsnbsp; world
|
@@ -0,0 +1 @@
|
|
1
|
+
<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=us-ascii"><meta name=Generator content="Microsoft Word 15 (filtered medium)"><style><!-- /* Font Definitions */ @font-face {font-family:"Cambria Math"; panose-1:2 4 5 3 5 4 6 3 2 4;} @font-face {font-family:Calibri; panose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal {margin:0cm; margin-bottom:.0001pt; font-size:11.0pt; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} a:link, span.MsoHyperlink {mso-style-priority:99; color:#0563C1; text-decoration:underline;} a:visited, span.MsoHyperlinkFollowed {mso-style-priority:99; color:#954F72; text-decoration:underline;} span.EmailStyle17 {mso-style-type:personal-compose; font-family:"Calibri",sans-serif; color:windowtext;} .MsoChpDefault {mso-style-type:export-only; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} @page WordSection1 {size:612.0pt 792.0pt; margin:72.0pt 72.0pt 72.0pt 72.0pt;} div.WordSection1 {page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext="edit" spidmax="1026" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext="edit"> <o:idmap v:ext="edit" data="1" /> </o:shapelayout></xml><![endif]--></head><body lang=EN-GB link="#0563C1" vlink="#954F72"><div class=WordSection1><p class=MsoNormal>Dear html2text,<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>This is an example email that can be used to test html2text conversion of outlook / exchange emails.<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>The addition of <o:p> tags is very annoying!<o:p></o:p></p><p class=MsoNormal>This is a single line 
return<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal><b>This is bold<o:p></o:p></b></p><p class=MsoNormal><i>This is italic<o:p></o:p></i></p><p class=MsoNormal><u>This is underline<o:p></o:p></u></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>Andrew<o:p></o:p></p></div></body></html>
|
@@ -0,0 +1,12 @@
|
|
1
|
+
Dear html2text,
|
2
|
+
|
3
|
+
This is an example email that can be used to test html2text conversion of outlook / exchange emails.
|
4
|
+
|
5
|
+
The addition of <o:p> tags is very annoying!
|
6
|
+
This is a single line return
|
7
|
+
|
8
|
+
This is bold
|
9
|
+
This is italic
|
10
|
+
This is underline
|
11
|
+
|
12
|
+
Andrew
|
@@ -0,0 +1,17 @@
|
|
1
|
+
<html>
|
2
|
+
<body>
|
3
|
+
<div>
|
4
|
+
Just two divs
|
5
|
+
</div>
|
6
|
+
<div>
|
7
|
+
Hanging out
|
8
|
+
</div>
|
9
|
+
<div><div><div>Nested divs and line breaks</div></div><br></div>
|
10
|
+
<div><div>Nested divs and line breaks</div>More text<br></div>
|
11
|
+
<div><br></div>
|
12
|
+
<div>Just text</div>
|
13
|
+
<div>Just text<br></div>
|
14
|
+
<div>Just text<br><br></div>
|
15
|
+
This is the end!
|
16
|
+
</body>
|
17
|
+
</html>
|
@@ -0,0 +1,50 @@
|
|
1
|
+
<html>
|
2
|
+
<body>
|
3
|
+
<div>
|
4
|
+
Hello
|
5
|
+
<br>
|
6
|
+
</div>
|
7
|
+
<div>
|
8
|
+
How are you?
|
9
|
+
<br>
|
10
|
+
</div>
|
11
|
+
|
12
|
+
<p>
|
13
|
+
How are you?
|
14
|
+
<br>
|
15
|
+
</p>
|
16
|
+
|
17
|
+
<p>
|
18
|
+
How are you?
|
19
|
+
<br>
|
20
|
+
</p>
|
21
|
+
|
22
|
+
<div>
|
23
|
+
Just two divs
|
24
|
+
</div>
|
25
|
+
<div>
|
26
|
+
Hanging out
|
27
|
+
</div>
|
28
|
+
|
29
|
+
This is not the end!
|
30
|
+
<div>
|
31
|
+
How are you again?
|
32
|
+
<br>
|
33
|
+
</div>
|
34
|
+
This is the end!
|
35
|
+
<br>
|
36
|
+
Just kidding
|
37
|
+
<h1>Header 1</h1>
|
38
|
+
Some text
|
39
|
+
<hr>
|
40
|
+
Some more text
|
41
|
+
<p>Paragraph tag!</p>
|
42
|
+
<h2>Header 2</h2>
|
43
|
+
<hr>
|
44
|
+
<h3>Header 3</h3>
|
45
|
+
Some text
|
46
|
+
<h4>Header 4</h4>
|
47
|
+
<p>Paragraph tag!</p>
|
48
|
+
Final line
|
49
|
+
</body>
|
50
|
+
</html>
|
@@ -0,0 +1,35 @@
|
|
1
|
+
Hello
|
2
|
+
How are you?
|
3
|
+
|
4
|
+
How are you?
|
5
|
+
|
6
|
+
How are you?
|
7
|
+
|
8
|
+
Just two divs
|
9
|
+
Hanging out
|
10
|
+
This is not the end!
|
11
|
+
How are you again?
|
12
|
+
This is the end!
|
13
|
+
Just kidding
|
14
|
+
|
15
|
+
Header 1
|
16
|
+
|
17
|
+
Some text
|
18
|
+
---------------------------------------------------------------
|
19
|
+
Some more text
|
20
|
+
|
21
|
+
Paragraph tag!
|
22
|
+
|
23
|
+
Header 2
|
24
|
+
|
25
|
+
---------------------------------------------------------------
|
26
|
+
|
27
|
+
Header 3
|
28
|
+
|
29
|
+
Some text
|
30
|
+
|
31
|
+
Header 4
|
32
|
+
|
33
|
+
Paragraph tag!
|
34
|
+
|
35
|
+
Final line
|
data/spec/examples/test4.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1<br />2<br />3<br />4<br />5 6
|
1
|
+
1<br />2<br />3<br />4<br />5 < 6
|
data/spec/examples/test4.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
1
|
2
|
-
2
|
3
|
-
3
|
4
|
-
4
|
5
|
-
5 6
|
1
|
+
1
|
2
|
+
2
|
3
|
+
3
|
4
|
+
4
|
5
|
+
5 < 6
|
@@ -0,0 +1 @@
|
|
1
|
+
<p>foo‌bar</p>
|
@@ -0,0 +1 @@
|
|
1
|
+
foobar
|
data/spec/examples_spec.rb
CHANGED
@@ -17,7 +17,19 @@ describe Html2Text do
|
|
17
17
|
end
|
18
18
|
|
19
19
|
it "converts to text" do
|
20
|
-
|
20
|
+
# Write the output if it failed, for easier comparison
|
21
|
+
if !text.eql?(expected)
|
22
|
+
File.open(filename.sub(".html", ".output"), 'w') do |fp|
|
23
|
+
fp.write(text)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
# Quick check, don't try to generate a 500kb+ diff,
|
28
|
+
# which can halt the rspec for minutes+
|
29
|
+
expect(text.length).to eq expected.length if text.length > 10000
|
30
|
+
|
31
|
+
# More complete check
|
32
|
+
expect(text).to eq expected
|
21
33
|
end
|
22
34
|
end
|
23
35
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jevon Wright
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-02-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 1.8.5
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 1.8.5
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rspec
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -82,11 +82,12 @@ dependencies:
|
|
82
82
|
version: '0'
|
83
83
|
description: A Ruby component to convert HTML into a plain text format.
|
84
84
|
email:
|
85
|
-
- jevon@
|
85
|
+
- jevon@jevon.org
|
86
86
|
executables: []
|
87
87
|
extensions: []
|
88
88
|
extra_rdoc_files: []
|
89
89
|
files:
|
90
|
+
- CHANGELOG.md
|
90
91
|
- LICENSE.md
|
91
92
|
- README.md
|
92
93
|
- lib/html2text.rb
|
@@ -95,24 +96,46 @@ files:
|
|
95
96
|
- spec/examples/anchors.txt
|
96
97
|
- spec/examples/basic.html
|
97
98
|
- spec/examples/basic.txt
|
99
|
+
- spec/examples/dom-processing.html
|
100
|
+
- spec/examples/dom-processing.txt
|
101
|
+
- spec/examples/empty.html
|
102
|
+
- spec/examples/empty.txt
|
98
103
|
- spec/examples/full_email.html
|
99
104
|
- spec/examples/full_email.txt
|
105
|
+
- spec/examples/huge-msoffice.html
|
106
|
+
- spec/examples/huge-msoffice.txt
|
100
107
|
- spec/examples/images.html
|
101
108
|
- spec/examples/images.txt
|
109
|
+
- spec/examples/invalid.html
|
110
|
+
- spec/examples/invalid.txt
|
102
111
|
- spec/examples/lists.html
|
103
112
|
- spec/examples/lists.txt
|
104
113
|
- spec/examples/more-anchors.html
|
105
114
|
- spec/examples/more-anchors.txt
|
115
|
+
- spec/examples/msoffice.html
|
116
|
+
- spec/examples/msoffice.txt
|
106
117
|
- spec/examples/nbsp.html
|
107
118
|
- spec/examples/nbsp.txt
|
119
|
+
- spec/examples/nested-divs.html
|
120
|
+
- spec/examples/nested-divs.txt
|
121
|
+
- spec/examples/newlines.html
|
122
|
+
- spec/examples/newlines.txt
|
108
123
|
- spec/examples/non-breaking-spaces.html
|
109
124
|
- spec/examples/non-breaking-spaces.txt
|
125
|
+
- spec/examples/pre.html
|
126
|
+
- spec/examples/pre.txt
|
110
127
|
- spec/examples/table.html
|
111
128
|
- spec/examples/table.txt
|
112
129
|
- spec/examples/test3.html
|
113
130
|
- spec/examples/test3.txt
|
114
131
|
- spec/examples/test4.html
|
115
132
|
- spec/examples/test4.txt
|
133
|
+
- spec/examples/utf8-example.html
|
134
|
+
- spec/examples/utf8-example.txt
|
135
|
+
- spec/examples/windows-1252-example.html
|
136
|
+
- spec/examples/windows-1252-example.txt
|
137
|
+
- spec/examples/zero-width-non-joiners.html
|
138
|
+
- spec/examples/zero-width-non-joiners.txt
|
116
139
|
- spec/examples_spec.rb
|
117
140
|
- spec/html2text_spec.rb
|
118
141
|
- spec/spec_helper.rb
|
@@ -135,34 +158,55 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
135
158
|
- !ruby/object:Gem::Version
|
136
159
|
version: '0'
|
137
160
|
requirements: []
|
138
|
-
|
139
|
-
rubygems_version: 2.6.13
|
161
|
+
rubygems_version: 3.0.2
|
140
162
|
signing_key:
|
141
163
|
specification_version: 4
|
142
164
|
summary: Convert HTML into plain text.
|
143
165
|
test_files:
|
144
|
-
- spec/examples/
|
145
|
-
- spec/examples/
|
166
|
+
- spec/examples/nested-divs.html
|
167
|
+
- spec/examples/pre.html
|
168
|
+
- spec/examples/invalid.html
|
169
|
+
- spec/examples/empty.html
|
170
|
+
- spec/examples/table.html
|
146
171
|
- spec/examples/basic.html
|
147
|
-
- spec/examples/basic.txt
|
148
|
-
- spec/examples/full_email.html
|
149
|
-
- spec/examples/full_email.txt
|
150
|
-
- spec/examples/images.html
|
151
|
-
- spec/examples/images.txt
|
152
|
-
- spec/examples/lists.html
|
153
|
-
- spec/examples/lists.txt
|
154
|
-
- spec/examples/more-anchors.html
|
155
|
-
- spec/examples/more-anchors.txt
|
156
172
|
- spec/examples/nbsp.html
|
173
|
+
- spec/examples/utf8-example.html
|
174
|
+
- spec/examples/newlines.txt
|
175
|
+
- spec/examples/full_email.txt
|
176
|
+
- spec/examples/msoffice.html
|
177
|
+
- spec/examples/zero-width-non-joiners.txt
|
178
|
+
- spec/examples/anchors.html
|
157
179
|
- spec/examples/nbsp.txt
|
180
|
+
- spec/examples/zero-width-non-joiners.html
|
181
|
+
- spec/examples/test3.html
|
182
|
+
- spec/examples/test4.txt
|
183
|
+
- spec/examples/huge-msoffice.txt
|
184
|
+
- spec/examples/full_email.html
|
185
|
+
- spec/examples/utf8-example.txt
|
186
|
+
- spec/examples/table.txt
|
187
|
+
- spec/examples/huge-msoffice.html
|
188
|
+
- spec/examples/more-anchors.txt
|
189
|
+
- spec/examples/newlines.html
|
190
|
+
- spec/examples/test4.html
|
191
|
+
- spec/examples/basic.txt
|
192
|
+
- spec/examples/lists.html
|
193
|
+
- spec/examples/nested-divs.txt
|
158
194
|
- spec/examples/non-breaking-spaces.html
|
195
|
+
- spec/examples/invalid.txt
|
196
|
+
- spec/examples/empty.txt
|
197
|
+
- spec/examples/images.txt
|
159
198
|
- spec/examples/non-breaking-spaces.txt
|
160
|
-
- spec/examples/
|
161
|
-
- spec/examples/table.txt
|
162
|
-
- spec/examples/test3.html
|
199
|
+
- spec/examples/dom-processing.txt
|
163
200
|
- spec/examples/test3.txt
|
164
|
-
- spec/examples/
|
165
|
-
- spec/examples/
|
201
|
+
- spec/examples/dom-processing.html
|
202
|
+
- spec/examples/lists.txt
|
203
|
+
- spec/examples/pre.txt
|
204
|
+
- spec/examples/anchors.txt
|
205
|
+
- spec/examples/more-anchors.html
|
206
|
+
- spec/examples/windows-1252-example.txt
|
207
|
+
- spec/examples/images.html
|
208
|
+
- spec/examples/msoffice.txt
|
209
|
+
- spec/examples/windows-1252-example.html
|
166
210
|
- spec/examples_spec.rb
|
167
211
|
- spec/html2text_spec.rb
|
168
212
|
- spec/spec_helper.rb
|