html2text 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/CHANGELOG.md +28 -0
- data/README.md +7 -9
- data/lib/html2text.rb +112 -26
- data/lib/html2text/version.rb +1 -1
- data/spec/examples/basic.html +21 -21
- data/spec/examples/basic.txt +2 -0
- data/spec/examples/dom-processing.html +8 -0
- data/spec/examples/dom-processing.txt +1 -0
- data/spec/examples/empty.html +0 -0
- data/spec/examples/empty.txt +0 -0
- data/spec/examples/full_email.txt +1 -1
- data/spec/examples/huge-msoffice.html +1 -0
- data/spec/examples/huge-msoffice.txt +25872 -0
- data/spec/examples/invalid.html +4 -0
- data/spec/examples/invalid.txt +1 -0
- data/spec/examples/msoffice.html +1 -0
- data/spec/examples/msoffice.txt +12 -0
- data/spec/examples/nested-divs.html +17 -0
- data/spec/examples/nested-divs.txt +12 -0
- data/spec/examples/newlines.html +50 -0
- data/spec/examples/newlines.txt +35 -0
- data/spec/examples/pre.html +10 -0
- data/spec/examples/pre.txt +8 -0
- data/spec/examples/test4.html +1 -1
- data/spec/examples/test4.txt +5 -5
- data/spec/examples/utf8-example.html +4 -0
- data/spec/examples/utf8-example.txt +2 -0
- data/spec/examples/windows-1252-example.html +4 -0
- data/spec/examples/windows-1252-example.txt +2 -0
- data/spec/examples/zero-width-non-joiners.html +1 -0
- data/spec/examples/zero-width-non-joiners.txt +1 -0
- data/spec/examples_spec.rb +13 -1
- metadata +67 -23
@@ -0,0 +1 @@
|
|
1
|
+
Hello &nbsnbsp; world
|
@@ -0,0 +1 @@
|
|
1
|
+
<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=us-ascii"><meta name=Generator content="Microsoft Word 15 (filtered medium)"><style><!-- /* Font Definitions */ @font-face {font-family:"Cambria Math"; panose-1:2 4 5 3 5 4 6 3 2 4;} @font-face {font-family:Calibri; panose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal {margin:0cm; margin-bottom:.0001pt; font-size:11.0pt; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} a:link, span.MsoHyperlink {mso-style-priority:99; color:#0563C1; text-decoration:underline;} a:visited, span.MsoHyperlinkFollowed {mso-style-priority:99; color:#954F72; text-decoration:underline;} span.EmailStyle17 {mso-style-type:personal-compose; font-family:"Calibri",sans-serif; color:windowtext;} .MsoChpDefault {mso-style-type:export-only; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} @page WordSection1 {size:612.0pt 792.0pt; margin:72.0pt 72.0pt 72.0pt 72.0pt;} div.WordSection1 {page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext="edit" spidmax="1026" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext="edit"> <o:idmap v:ext="edit" data="1" /> </o:shapelayout></xml><![endif]--></head><body lang=EN-GB link="#0563C1" vlink="#954F72"><div class=WordSection1><p class=MsoNormal>Dear html2text,<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>This is an example email that can be used to test html2text conversion of outlook / exchange emails.<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>The addition of <o:p> tags is very annoying!<o:p></o:p></p><p class=MsoNormal>This is a single line 
return<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal><b>This is bold<o:p></o:p></b></p><p class=MsoNormal><i>This is italic<o:p></o:p></i></p><p class=MsoNormal><u>This is underline<o:p></o:p></u></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>Andrew<o:p></o:p></p></div></body></html>
|
@@ -0,0 +1,12 @@
|
|
1
|
+
Dear html2text,
|
2
|
+
|
3
|
+
This is an example email that can be used to test html2text conversion of outlook / exchange emails.
|
4
|
+
|
5
|
+
The addition of <o:p> tags is very annoying!
|
6
|
+
This is a single line return
|
7
|
+
|
8
|
+
This is bold
|
9
|
+
This is italic
|
10
|
+
This is underline
|
11
|
+
|
12
|
+
Andrew
|
@@ -0,0 +1,17 @@
|
|
1
|
+
<html>
|
2
|
+
<body>
|
3
|
+
<div>
|
4
|
+
Just two divs
|
5
|
+
</div>
|
6
|
+
<div>
|
7
|
+
Hanging out
|
8
|
+
</div>
|
9
|
+
<div><div><div>Nested divs and line breaks</div></div><br></div>
|
10
|
+
<div><div>Nested divs and line breaks</div>More text<br></div>
|
11
|
+
<div><br></div>
|
12
|
+
<div>Just text</div>
|
13
|
+
<div>Just text<br></div>
|
14
|
+
<div>Just text<br><br></div>
|
15
|
+
This is the end!
|
16
|
+
</body>
|
17
|
+
</html>
|
@@ -0,0 +1,50 @@
|
|
1
|
+
<html>
|
2
|
+
<body>
|
3
|
+
<div>
|
4
|
+
Hello
|
5
|
+
<br>
|
6
|
+
</div>
|
7
|
+
<div>
|
8
|
+
How are you?
|
9
|
+
<br>
|
10
|
+
</div>
|
11
|
+
|
12
|
+
<p>
|
13
|
+
How are you?
|
14
|
+
<br>
|
15
|
+
</p>
|
16
|
+
|
17
|
+
<p>
|
18
|
+
How are you?
|
19
|
+
<br>
|
20
|
+
</p>
|
21
|
+
|
22
|
+
<div>
|
23
|
+
Just two divs
|
24
|
+
</div>
|
25
|
+
<div>
|
26
|
+
Hanging out
|
27
|
+
</div>
|
28
|
+
|
29
|
+
This is not the end!
|
30
|
+
<div>
|
31
|
+
How are you again?
|
32
|
+
<br>
|
33
|
+
</div>
|
34
|
+
This is the end!
|
35
|
+
<br>
|
36
|
+
Just kidding
|
37
|
+
<h1>Header 1</h1>
|
38
|
+
Some text
|
39
|
+
<hr>
|
40
|
+
Some more text
|
41
|
+
<p>Paragraph tag!</p>
|
42
|
+
<h2>Header 2</h2>
|
43
|
+
<hr>
|
44
|
+
<h3>Header 3</h3>
|
45
|
+
Some text
|
46
|
+
<h4>Header 4</h4>
|
47
|
+
<p>Paragraph tag!</p>
|
48
|
+
Final line
|
49
|
+
</body>
|
50
|
+
</html>
|
@@ -0,0 +1,35 @@
|
|
1
|
+
Hello
|
2
|
+
How are you?
|
3
|
+
|
4
|
+
How are you?
|
5
|
+
|
6
|
+
How are you?
|
7
|
+
|
8
|
+
Just two divs
|
9
|
+
Hanging out
|
10
|
+
This is not the end!
|
11
|
+
How are you again?
|
12
|
+
This is the end!
|
13
|
+
Just kidding
|
14
|
+
|
15
|
+
Header 1
|
16
|
+
|
17
|
+
Some text
|
18
|
+
---------------------------------------------------------------
|
19
|
+
Some more text
|
20
|
+
|
21
|
+
Paragraph tag!
|
22
|
+
|
23
|
+
Header 2
|
24
|
+
|
25
|
+
---------------------------------------------------------------
|
26
|
+
|
27
|
+
Header 3
|
28
|
+
|
29
|
+
Some text
|
30
|
+
|
31
|
+
Header 4
|
32
|
+
|
33
|
+
Paragraph tag!
|
34
|
+
|
35
|
+
Final line
|
data/spec/examples/test4.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1<br />2<br />3<br />4<br />5 6
|
1
|
+
1<br />2<br />3<br />4<br />5 < 6
|
data/spec/examples/test4.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
1
|
2
|
-
2
|
3
|
-
3
|
4
|
-
4
|
5
|
-
5 6
|
1
|
+
1
|
2
|
+
2
|
3
|
+
3
|
4
|
+
4
|
5
|
+
5 < 6
|
@@ -0,0 +1 @@
|
|
1
|
+
<p>foo‌bar</p>
|
@@ -0,0 +1 @@
|
|
1
|
+
foobar
|
data/spec/examples_spec.rb
CHANGED
@@ -17,7 +17,19 @@ describe Html2Text do
|
|
17
17
|
end
|
18
18
|
|
19
19
|
it "converts to text" do
|
20
|
-
|
20
|
+
# Write the output if it failed, for easier comparison
|
21
|
+
if !text.eql?(expected)
|
22
|
+
File.open(filename.sub(".html", ".output"), 'w') do |fp|
|
23
|
+
fp.write(text)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
# Quick check, don't try to generate a 500kb+ diff,
|
28
|
+
# which can halt the rspec for minutes+
|
29
|
+
expect(text.length).to eq expected.length if text.length > 10000
|
30
|
+
|
31
|
+
# More complete check
|
32
|
+
expect(text).to eq expected
|
21
33
|
end
|
22
34
|
end
|
23
35
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jevon Wright
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-02-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 1.8.5
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 1.8.5
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rspec
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -82,11 +82,12 @@ dependencies:
|
|
82
82
|
version: '0'
|
83
83
|
description: A Ruby component to convert HTML into a plain text format.
|
84
84
|
email:
|
85
|
-
- jevon@
|
85
|
+
- jevon@jevon.org
|
86
86
|
executables: []
|
87
87
|
extensions: []
|
88
88
|
extra_rdoc_files: []
|
89
89
|
files:
|
90
|
+
- CHANGELOG.md
|
90
91
|
- LICENSE.md
|
91
92
|
- README.md
|
92
93
|
- lib/html2text.rb
|
@@ -95,24 +96,46 @@ files:
|
|
95
96
|
- spec/examples/anchors.txt
|
96
97
|
- spec/examples/basic.html
|
97
98
|
- spec/examples/basic.txt
|
99
|
+
- spec/examples/dom-processing.html
|
100
|
+
- spec/examples/dom-processing.txt
|
101
|
+
- spec/examples/empty.html
|
102
|
+
- spec/examples/empty.txt
|
98
103
|
- spec/examples/full_email.html
|
99
104
|
- spec/examples/full_email.txt
|
105
|
+
- spec/examples/huge-msoffice.html
|
106
|
+
- spec/examples/huge-msoffice.txt
|
100
107
|
- spec/examples/images.html
|
101
108
|
- spec/examples/images.txt
|
109
|
+
- spec/examples/invalid.html
|
110
|
+
- spec/examples/invalid.txt
|
102
111
|
- spec/examples/lists.html
|
103
112
|
- spec/examples/lists.txt
|
104
113
|
- spec/examples/more-anchors.html
|
105
114
|
- spec/examples/more-anchors.txt
|
115
|
+
- spec/examples/msoffice.html
|
116
|
+
- spec/examples/msoffice.txt
|
106
117
|
- spec/examples/nbsp.html
|
107
118
|
- spec/examples/nbsp.txt
|
119
|
+
- spec/examples/nested-divs.html
|
120
|
+
- spec/examples/nested-divs.txt
|
121
|
+
- spec/examples/newlines.html
|
122
|
+
- spec/examples/newlines.txt
|
108
123
|
- spec/examples/non-breaking-spaces.html
|
109
124
|
- spec/examples/non-breaking-spaces.txt
|
125
|
+
- spec/examples/pre.html
|
126
|
+
- spec/examples/pre.txt
|
110
127
|
- spec/examples/table.html
|
111
128
|
- spec/examples/table.txt
|
112
129
|
- spec/examples/test3.html
|
113
130
|
- spec/examples/test3.txt
|
114
131
|
- spec/examples/test4.html
|
115
132
|
- spec/examples/test4.txt
|
133
|
+
- spec/examples/utf8-example.html
|
134
|
+
- spec/examples/utf8-example.txt
|
135
|
+
- spec/examples/windows-1252-example.html
|
136
|
+
- spec/examples/windows-1252-example.txt
|
137
|
+
- spec/examples/zero-width-non-joiners.html
|
138
|
+
- spec/examples/zero-width-non-joiners.txt
|
116
139
|
- spec/examples_spec.rb
|
117
140
|
- spec/html2text_spec.rb
|
118
141
|
- spec/spec_helper.rb
|
@@ -135,34 +158,55 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
135
158
|
- !ruby/object:Gem::Version
|
136
159
|
version: '0'
|
137
160
|
requirements: []
|
138
|
-
|
139
|
-
rubygems_version: 2.6.13
|
161
|
+
rubygems_version: 3.0.2
|
140
162
|
signing_key:
|
141
163
|
specification_version: 4
|
142
164
|
summary: Convert HTML into plain text.
|
143
165
|
test_files:
|
144
|
-
- spec/examples/
|
145
|
-
- spec/examples/
|
166
|
+
- spec/examples/nested-divs.html
|
167
|
+
- spec/examples/pre.html
|
168
|
+
- spec/examples/invalid.html
|
169
|
+
- spec/examples/empty.html
|
170
|
+
- spec/examples/table.html
|
146
171
|
- spec/examples/basic.html
|
147
|
-
- spec/examples/basic.txt
|
148
|
-
- spec/examples/full_email.html
|
149
|
-
- spec/examples/full_email.txt
|
150
|
-
- spec/examples/images.html
|
151
|
-
- spec/examples/images.txt
|
152
|
-
- spec/examples/lists.html
|
153
|
-
- spec/examples/lists.txt
|
154
|
-
- spec/examples/more-anchors.html
|
155
|
-
- spec/examples/more-anchors.txt
|
156
172
|
- spec/examples/nbsp.html
|
173
|
+
- spec/examples/utf8-example.html
|
174
|
+
- spec/examples/newlines.txt
|
175
|
+
- spec/examples/full_email.txt
|
176
|
+
- spec/examples/msoffice.html
|
177
|
+
- spec/examples/zero-width-non-joiners.txt
|
178
|
+
- spec/examples/anchors.html
|
157
179
|
- spec/examples/nbsp.txt
|
180
|
+
- spec/examples/zero-width-non-joiners.html
|
181
|
+
- spec/examples/test3.html
|
182
|
+
- spec/examples/test4.txt
|
183
|
+
- spec/examples/huge-msoffice.txt
|
184
|
+
- spec/examples/full_email.html
|
185
|
+
- spec/examples/utf8-example.txt
|
186
|
+
- spec/examples/table.txt
|
187
|
+
- spec/examples/huge-msoffice.html
|
188
|
+
- spec/examples/more-anchors.txt
|
189
|
+
- spec/examples/newlines.html
|
190
|
+
- spec/examples/test4.html
|
191
|
+
- spec/examples/basic.txt
|
192
|
+
- spec/examples/lists.html
|
193
|
+
- spec/examples/nested-divs.txt
|
158
194
|
- spec/examples/non-breaking-spaces.html
|
195
|
+
- spec/examples/invalid.txt
|
196
|
+
- spec/examples/empty.txt
|
197
|
+
- spec/examples/images.txt
|
159
198
|
- spec/examples/non-breaking-spaces.txt
|
160
|
-
- spec/examples/
|
161
|
-
- spec/examples/table.txt
|
162
|
-
- spec/examples/test3.html
|
199
|
+
- spec/examples/dom-processing.txt
|
163
200
|
- spec/examples/test3.txt
|
164
|
-
- spec/examples/
|
165
|
-
- spec/examples/
|
201
|
+
- spec/examples/dom-processing.html
|
202
|
+
- spec/examples/lists.txt
|
203
|
+
- spec/examples/pre.txt
|
204
|
+
- spec/examples/anchors.txt
|
205
|
+
- spec/examples/more-anchors.html
|
206
|
+
- spec/examples/windows-1252-example.txt
|
207
|
+
- spec/examples/images.html
|
208
|
+
- spec/examples/msoffice.txt
|
209
|
+
- spec/examples/windows-1252-example.html
|
166
210
|
- spec/examples_spec.rb
|
167
211
|
- spec/html2text_spec.rb
|
168
212
|
- spec/spec_helper.rb
|