html2text 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +5 -5
- data/lib/html2text/version.rb +3 -1
- data/lib/html2text.rb +108 -106
- metadata +65 -110
- data/spec/examples/anchors.html +0 -12
- data/spec/examples/anchors.txt +0 -5
- data/spec/examples/basic.html +0 -21
- data/spec/examples/basic.txt +0 -15
- data/spec/examples/dom-processing.html +0 -8
- data/spec/examples/dom-processing.txt +0 -1
- data/spec/examples/empty.html +0 -0
- data/spec/examples/empty.txt +0 -0
- data/spec/examples/full_email.html +0 -220
- data/spec/examples/full_email.txt +0 -54
- data/spec/examples/huge-msoffice.html +0 -1
- data/spec/examples/huge-msoffice.txt +0 -25872
- data/spec/examples/images.html +0 -54
- data/spec/examples/images.txt +0 -27
- data/spec/examples/invalid.html +0 -4
- data/spec/examples/invalid.txt +0 -1
- data/spec/examples/lists.html +0 -24
- data/spec/examples/lists.txt +0 -17
- data/spec/examples/more-anchors.html +0 -14
- data/spec/examples/more-anchors.txt +0 -7
- data/spec/examples/msoffice.html +0 -1
- data/spec/examples/msoffice.txt +0 -12
- data/spec/examples/nbsp.html +0 -1
- data/spec/examples/nbsp.txt +0 -1
- data/spec/examples/nested-divs.html +0 -17
- data/spec/examples/nested-divs.txt +0 -12
- data/spec/examples/newlines.html +0 -50
- data/spec/examples/newlines.txt +0 -35
- data/spec/examples/non-breaking-spaces.html +0 -1
- data/spec/examples/non-breaking-spaces.txt +0 -1
- data/spec/examples/pre.html +0 -10
- data/spec/examples/pre.txt +0 -8
- data/spec/examples/table.html +0 -53
- data/spec/examples/table.txt +0 -7
- data/spec/examples/test3.html +0 -1
- data/spec/examples/test3.txt +0 -2
- data/spec/examples/test4.html +0 -1
- data/spec/examples/test4.txt +0 -5
- data/spec/examples/utf8-example.html +0 -4
- data/spec/examples/utf8-example.txt +0 -2
- data/spec/examples/windows-1252-example.html +0 -4
- data/spec/examples/windows-1252-example.txt +0 -2
- data/spec/examples/zero-width-non-joiners.html +0 -1
- data/spec/examples/zero-width-non-joiners.txt +0 -1
- data/spec/examples_spec.rb +0 -41
- data/spec/html2text_spec.rb +0 -58
- data/spec/spec_helper.rb +0 -4
data/spec/examples/images.html
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
<body>
|
2
|
-
<p>
|
3
|
-
One: <img src="one.png">
|
4
|
-
</p>
|
5
|
-
|
6
|
-
<p>
|
7
|
-
Two: <img src="two.png" alt="two">
|
8
|
-
</p>
|
9
|
-
|
10
|
-
<p>
|
11
|
-
Three: <img src="three.png" title="three">
|
12
|
-
</p>
|
13
|
-
|
14
|
-
<p>
|
15
|
-
Four: <img src="four.png" title="four" alt="four alt">
|
16
|
-
</p>
|
17
|
-
|
18
|
-
<h1>With links</h1>
|
19
|
-
|
20
|
-
<p>
|
21
|
-
One: <a href="http://localhost"><img src="one.png"></a>
|
22
|
-
</p>
|
23
|
-
|
24
|
-
<p>
|
25
|
-
Two: <a href="http://localhost"><img src="two.png" alt="two"></a>
|
26
|
-
</p>
|
27
|
-
|
28
|
-
<p>
|
29
|
-
Three: <a href="http://localhost"><img src="three.png" title="three"></a>
|
30
|
-
</p>
|
31
|
-
|
32
|
-
<p>
|
33
|
-
Four: <a href="http://localhost"><img src="four.png" title="four" alt="four alt"></a>
|
34
|
-
</p>
|
35
|
-
|
36
|
-
<h1>With links with titles</h1>
|
37
|
-
|
38
|
-
<p>
|
39
|
-
One: <a href="http://localhost" title="one link"><img src="one.png"></a>
|
40
|
-
</p>
|
41
|
-
|
42
|
-
<p>
|
43
|
-
Two: <a href="http://localhost" title="two link"><img src="two.png" alt="two"></a>
|
44
|
-
</p>
|
45
|
-
|
46
|
-
<p>
|
47
|
-
Three: <a href="http://localhost" title="three link"><img src="three.png" title="three"></a>
|
48
|
-
</p>
|
49
|
-
|
50
|
-
<p>
|
51
|
-
Four: <a href="http://localhost" title="four link"><img src="four.png" title="four" alt="four alt"></a>
|
52
|
-
</p>
|
53
|
-
</body>
|
54
|
-
</html>
|
data/spec/examples/images.txt
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
One:
|
2
|
-
|
3
|
-
Two: [two]
|
4
|
-
|
5
|
-
Three: [three]
|
6
|
-
|
7
|
-
Four: [four]
|
8
|
-
|
9
|
-
With links
|
10
|
-
|
11
|
-
One: http://localhost
|
12
|
-
|
13
|
-
Two: [two](http://localhost)
|
14
|
-
|
15
|
-
Three: [three](http://localhost)
|
16
|
-
|
17
|
-
Four: [four](http://localhost)
|
18
|
-
|
19
|
-
With links with titles
|
20
|
-
|
21
|
-
One: [one link](http://localhost)
|
22
|
-
|
23
|
-
Two: [two link](http://localhost)
|
24
|
-
|
25
|
-
Three: [three link](http://localhost)
|
26
|
-
|
27
|
-
Four: [four link](http://localhost)
|
data/spec/examples/invalid.html
DELETED
data/spec/examples/invalid.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
Hello &nbsnbsp; world
|
data/spec/examples/lists.html
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
<h1>List tests</h1>
|
2
|
-
|
3
|
-
<p>
|
4
|
-
Add some lists.
|
5
|
-
</p>
|
6
|
-
|
7
|
-
<ol>
|
8
|
-
<li>one</li>
|
9
|
-
<li>two
|
10
|
-
<li>three
|
11
|
-
</ol>
|
12
|
-
|
13
|
-
<h2>An unordered list</h2>
|
14
|
-
|
15
|
-
<ul>
|
16
|
-
<li>one
|
17
|
-
<li>two</li>
|
18
|
-
<li>three</li>
|
19
|
-
</ul>
|
20
|
-
<ul>
|
21
|
-
<li>one
|
22
|
-
<li>two</li>
|
23
|
-
<li>three</li>
|
24
|
-
</ul>
|
data/spec/examples/lists.txt
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
<h1>Anchor tests</h1>
|
2
|
-
|
3
|
-
<p>
|
4
|
-
Visit http://openiaml.org or <a href="http://openiaml.org">openiaml.org</a> or <a href="http://openiaml.org">http://openiaml.org</a>.
|
5
|
-
</p>
|
6
|
-
|
7
|
-
<p>
|
8
|
-
To visit with SSL, visit https://openiaml.org or <a href="https://openiaml.org">openiaml.org</a> or <a href="https://openiaml.org">https://openiaml.org</a>.
|
9
|
-
</p>
|
10
|
-
|
11
|
-
<p>
|
12
|
-
To mail, email support@openiaml.org or mailto:support@openiaml.org
|
13
|
-
or <a href="mailto:support@openiaml.org">support@openiaml.org</a> or <a href="mailto:support@openiaml.org">mailto:support@openiaml.org</a>.
|
14
|
-
</p>
|
@@ -1,7 +0,0 @@
|
|
1
|
-
Anchor tests
|
2
|
-
|
3
|
-
Visit http://openiaml.org or openiaml.org or http://openiaml.org.
|
4
|
-
|
5
|
-
To visit with SSL, visit https://openiaml.org or openiaml.org or https://openiaml.org.
|
6
|
-
|
7
|
-
To mail, email support@openiaml.org or mailto:support@openiaml.org or support@openiaml.org or mailto:support@openiaml.org.
|
data/spec/examples/msoffice.html
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=us-ascii"><meta name=Generator content="Microsoft Word 15 (filtered medium)"><style><!-- /* Font Definitions */ @font-face {font-family:"Cambria Math"; panose-1:2 4 5 3 5 4 6 3 2 4;} @font-face {font-family:Calibri; panose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal {margin:0cm; margin-bottom:.0001pt; font-size:11.0pt; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} a:link, span.MsoHyperlink {mso-style-priority:99; color:#0563C1; text-decoration:underline;} a:visited, span.MsoHyperlinkFollowed {mso-style-priority:99; color:#954F72; text-decoration:underline;} span.EmailStyle17 {mso-style-type:personal-compose; font-family:"Calibri",sans-serif; color:windowtext;} .MsoChpDefault {mso-style-type:export-only; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} @page WordSection1 {size:612.0pt 792.0pt; margin:72.0pt 72.0pt 72.0pt 72.0pt;} div.WordSection1 {page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext="edit" spidmax="1026" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext="edit"> <o:idmap v:ext="edit" data="1" /> </o:shapelayout></xml><![endif]--></head><body lang=EN-GB link="#0563C1" vlink="#954F72"><div class=WordSection1><p class=MsoNormal>Dear html2text,<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>This is an example email that can be used to test html2text conversion of outlook / exchange emails.<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>The addition of <o:p> tags is very annoying!<o:p></o:p></p><p class=MsoNormal>This is a single line return<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal><b>This is bold<o:p></o:p></b></p><p class=MsoNormal><i>This is italic<o:p></o:p></i></p><p class=MsoNormal><u>This is underline<o:p></o:p></u></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>Andrew<o:p></o:p></p></div></body></html>
|
data/spec/examples/msoffice.txt
DELETED
@@ -1,12 +0,0 @@
|
|
1
|
-
Dear html2text,
|
2
|
-
|
3
|
-
This is an example email that can be used to test html2text conversion of outlook / exchange emails.
|
4
|
-
|
5
|
-
The addition of <o:p> tags is very annoying!
|
6
|
-
This is a single line return
|
7
|
-
|
8
|
-
This is bold
|
9
|
-
This is italic
|
10
|
-
This is underline
|
11
|
-
|
12
|
-
Andrew
|
data/spec/examples/nbsp.html
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
hello world & people < > &NBSP;
|
data/spec/examples/nbsp.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
hello world & people < > &NBSP;
|
@@ -1,17 +0,0 @@
|
|
1
|
-
<html>
|
2
|
-
<body>
|
3
|
-
<div>
|
4
|
-
Just two divs
|
5
|
-
</div>
|
6
|
-
<div>
|
7
|
-
Hanging out
|
8
|
-
</div>
|
9
|
-
<div><div><div>Nested divs and line breaks</div></div><br></div>
|
10
|
-
<div><div>Nested divs and line breaks</div>More text<br></div>
|
11
|
-
<div><br></div>
|
12
|
-
<div>Just text</div>
|
13
|
-
<div>Just text<br></div>
|
14
|
-
<div>Just text<br><br></div>
|
15
|
-
This is the end!
|
16
|
-
</body>
|
17
|
-
</html>
|
data/spec/examples/newlines.html
DELETED
@@ -1,50 +0,0 @@
|
|
1
|
-
<html>
|
2
|
-
<body>
|
3
|
-
<div>
|
4
|
-
Hello
|
5
|
-
<br>
|
6
|
-
</div>
|
7
|
-
<div>
|
8
|
-
How are you?
|
9
|
-
<br>
|
10
|
-
</div>
|
11
|
-
|
12
|
-
<p>
|
13
|
-
How are you?
|
14
|
-
<br>
|
15
|
-
</p>
|
16
|
-
|
17
|
-
<p>
|
18
|
-
How are you?
|
19
|
-
<br>
|
20
|
-
</p>
|
21
|
-
|
22
|
-
<div>
|
23
|
-
Just two divs
|
24
|
-
</div>
|
25
|
-
<div>
|
26
|
-
Hanging out
|
27
|
-
</div>
|
28
|
-
|
29
|
-
This is not the end!
|
30
|
-
<div>
|
31
|
-
How are you again?
|
32
|
-
<br>
|
33
|
-
</div>
|
34
|
-
This is the end!
|
35
|
-
<br>
|
36
|
-
Just kidding
|
37
|
-
<h1>Header 1</h1>
|
38
|
-
Some text
|
39
|
-
<hr>
|
40
|
-
Some more text
|
41
|
-
<p>Paragraph tag!</p>
|
42
|
-
<h2>Header 2</h2>
|
43
|
-
<hr>
|
44
|
-
<h3>Header 3</h3>
|
45
|
-
Some text
|
46
|
-
<h4>Header 4</h4>
|
47
|
-
<p>Paragraph tag!</p>
|
48
|
-
Final line
|
49
|
-
</body>
|
50
|
-
</html>
|
data/spec/examples/newlines.txt
DELETED
@@ -1,35 +0,0 @@
|
|
1
|
-
Hello
|
2
|
-
How are you?
|
3
|
-
|
4
|
-
How are you?
|
5
|
-
|
6
|
-
How are you?
|
7
|
-
|
8
|
-
Just two divs
|
9
|
-
Hanging out
|
10
|
-
This is not the end!
|
11
|
-
How are you again?
|
12
|
-
This is the end!
|
13
|
-
Just kidding
|
14
|
-
|
15
|
-
Header 1
|
16
|
-
|
17
|
-
Some text
|
18
|
-
---------------------------------------------------------------
|
19
|
-
Some more text
|
20
|
-
|
21
|
-
Paragraph tag!
|
22
|
-
|
23
|
-
Header 2
|
24
|
-
|
25
|
-
---------------------------------------------------------------
|
26
|
-
|
27
|
-
Header 3
|
28
|
-
|
29
|
-
Some text
|
30
|
-
|
31
|
-
Header 4
|
32
|
-
|
33
|
-
Paragraph tag!
|
34
|
-
|
35
|
-
Final line
|
@@ -1 +0,0 @@
|
|
1
|
-
these spaces are non-breaking
|
@@ -1 +0,0 @@
|
|
1
|
-
these spaces are non-breaking
|
data/spec/examples/pre.html
DELETED
data/spec/examples/pre.txt
DELETED
data/spec/examples/table.html
DELETED
@@ -1,53 +0,0 @@
|
|
1
|
-
<html>
|
2
|
-
<title>Ignored Title</title>
|
3
|
-
<body>
|
4
|
-
<h1>Hello, World!</h1>
|
5
|
-
<table>
|
6
|
-
<thead>
|
7
|
-
<tr>
|
8
|
-
<th>Col A</th>
|
9
|
-
<th>Col B</th>
|
10
|
-
</tr>
|
11
|
-
</thead>
|
12
|
-
<tbody>
|
13
|
-
<tr>
|
14
|
-
<td>
|
15
|
-
Data A1
|
16
|
-
</td>
|
17
|
-
<td>
|
18
|
-
Data B1
|
19
|
-
</td>
|
20
|
-
</tr>
|
21
|
-
<tr>
|
22
|
-
<td>
|
23
|
-
Data A2
|
24
|
-
</td>
|
25
|
-
<td>
|
26
|
-
Data B2
|
27
|
-
</td>
|
28
|
-
</tr>
|
29
|
-
<tr>
|
30
|
-
<td>
|
31
|
-
Data A3
|
32
|
-
</td>
|
33
|
-
<td>
|
34
|
-
Data B4
|
35
|
-
</td>
|
36
|
-
</tr>
|
37
|
-
</tbody>
|
38
|
-
<tfoot>
|
39
|
-
<tr>
|
40
|
-
<td>
|
41
|
-
Total A
|
42
|
-
</td>
|
43
|
-
<td>
|
44
|
-
Total B
|
45
|
-
</td>
|
46
|
-
</tr>
|
47
|
-
|
48
|
-
</tfoot>
|
49
|
-
|
50
|
-
</table>
|
51
|
-
|
52
|
-
</body>
|
53
|
-
</html>
|
data/spec/examples/table.txt
DELETED
data/spec/examples/test3.html
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
test one<br />test two
|
data/spec/examples/test3.txt
DELETED
data/spec/examples/test4.html
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
1<br />2<br />3<br />4<br />5 < 6
|
@@ -1 +0,0 @@
|
|
1
|
-
<p>foo‌bar</p>
|
@@ -1 +0,0 @@
|
|
1
|
-
foobar
|
data/spec/examples_spec.rb
DELETED
@@ -1,41 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe Html2Text do
|
4
|
-
describe "#convert" do
|
5
|
-
let(:text) { Html2Text.convert(html) }
|
6
|
-
|
7
|
-
examples = Dir[File.dirname(__FILE__) + "/examples/*.html"]
|
8
|
-
|
9
|
-
examples.each do |filename|
|
10
|
-
context "#{filename}" do
|
11
|
-
let(:html) { File.read(filename) }
|
12
|
-
let(:text_file) { filename.sub(".html", ".txt") }
|
13
|
-
let(:expected) { Html2Text.fix_newlines(File.read(text_file)) }
|
14
|
-
|
15
|
-
it "has an expected output" do
|
16
|
-
expect(File.exist?(text_file)).to eq(true), "'#{text_file}' did not exist"
|
17
|
-
end
|
18
|
-
|
19
|
-
it "converts to text" do
|
20
|
-
# Write the output if it failed, for easier comparison
|
21
|
-
if !text.eql?(expected)
|
22
|
-
File.open(filename.sub(".html", ".output"), 'w') do |fp|
|
23
|
-
fp.write(text)
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
# Quick check, don't try to generate a 500kb+ diff,
|
28
|
-
# which can halt the rspec for minutes+
|
29
|
-
expect(text.length).to eq expected.length if text.length > 10000
|
30
|
-
|
31
|
-
# More complete check
|
32
|
-
expect(text).to eq expected
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
it "has examples to test" do
|
38
|
-
expect(examples.size).to_not eq(0)
|
39
|
-
end
|
40
|
-
end
|
41
|
-
end
|
data/spec/html2text_spec.rb
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe Html2Text do
|
4
|
-
describe "#convert" do
|
5
|
-
let(:text) { Html2Text.convert(html) }
|
6
|
-
|
7
|
-
context "an empty line" do
|
8
|
-
let(:html) { "" }
|
9
|
-
|
10
|
-
it "is an empty line" do
|
11
|
-
expect(text).to eq("")
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
context "a simple string" do
|
16
|
-
let(:html) { "hello world" }
|
17
|
-
|
18
|
-
it "is an empty line" do
|
19
|
-
expect(text).to eq("hello world")
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
context "input value is non-string" do
|
24
|
-
let(:html) { nil }
|
25
|
-
it '(nil)' do
|
26
|
-
expect(text).to eq("")
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
context "input value is non-string" do
|
31
|
-
let(:html) { 1234 }
|
32
|
-
it "(number)" do
|
33
|
-
expect(text).to eq("1234")
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
context "input value is non-string" do
|
38
|
-
let(:html) { 1234.5600 }
|
39
|
-
it "(float number)" do
|
40
|
-
expect(text).to eq("1234.56")
|
41
|
-
end
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
describe "#remove_leading_and_trailing_whitespace" do
|
46
|
-
let(:subject) { Html2Text.new(nil).remove_leading_and_trailing_whitespace(input) }
|
47
|
-
|
48
|
-
context "an empty string" do
|
49
|
-
let(:input) { "" }
|
50
|
-
it { is_expected.to eq("") }
|
51
|
-
end
|
52
|
-
|
53
|
-
context "many new lines" do
|
54
|
-
let(:input) { "hello\n world \n yes" }
|
55
|
-
it { is_expected.to eq("hello\nworld\nyes") }
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
data/spec/spec_helper.rb
DELETED