html2text 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +26 -4
- data/README.md +5 -5
- data/lib/html2text/version.rb +3 -1
- data/lib/html2text.rb +108 -106
- metadata +78 -109
- data/spec/examples/anchors.html +0 -12
- data/spec/examples/anchors.txt +0 -5
- data/spec/examples/basic.html +0 -21
- data/spec/examples/basic.txt +0 -15
- data/spec/examples/dom-processing.html +0 -8
- data/spec/examples/dom-processing.txt +0 -1
- data/spec/examples/empty.html +0 -0
- data/spec/examples/empty.txt +0 -0
- data/spec/examples/full_email.html +0 -220
- data/spec/examples/full_email.txt +0 -54
- data/spec/examples/huge-msoffice.html +0 -1
- data/spec/examples/huge-msoffice.txt +0 -25872
- data/spec/examples/images.html +0 -54
- data/spec/examples/images.txt +0 -27
- data/spec/examples/invalid.html +0 -4
- data/spec/examples/invalid.txt +0 -1
- data/spec/examples/lists.html +0 -24
- data/spec/examples/lists.txt +0 -17
- data/spec/examples/more-anchors.html +0 -14
- data/spec/examples/more-anchors.txt +0 -7
- data/spec/examples/msoffice.html +0 -1
- data/spec/examples/msoffice.txt +0 -12
- data/spec/examples/nbsp.html +0 -1
- data/spec/examples/nbsp.txt +0 -1
- data/spec/examples/nested-divs.html +0 -17
- data/spec/examples/nested-divs.txt +0 -12
- data/spec/examples/newlines.html +0 -50
- data/spec/examples/newlines.txt +0 -35
- data/spec/examples/non-breaking-spaces.html +0 -1
- data/spec/examples/non-breaking-spaces.txt +0 -1
- data/spec/examples/pre.html +0 -10
- data/spec/examples/pre.txt +0 -8
- data/spec/examples/table.html +0 -53
- data/spec/examples/table.txt +0 -7
- data/spec/examples/test3.html +0 -1
- data/spec/examples/test3.txt +0 -2
- data/spec/examples/test4.html +0 -1
- data/spec/examples/test4.txt +0 -5
- data/spec/examples/utf8-example.html +0 -4
- data/spec/examples/utf8-example.txt +0 -2
- data/spec/examples/windows-1252-example.html +0 -4
- data/spec/examples/windows-1252-example.txt +0 -2
- data/spec/examples/zero-width-non-joiners.html +0 -1
- data/spec/examples/zero-width-non-joiners.txt +0 -1
- data/spec/examples_spec.rb +0 -41
- data/spec/html2text_spec.rb +0 -58
- data/spec/spec_helper.rb +0 -4
data/spec/examples/images.html
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
<body>
|
2
|
-
<p>
|
3
|
-
One: <img src="one.png">
|
4
|
-
</p>
|
5
|
-
|
6
|
-
<p>
|
7
|
-
Two: <img src="two.png" alt="two">
|
8
|
-
</p>
|
9
|
-
|
10
|
-
<p>
|
11
|
-
Three: <img src="three.png" title="three">
|
12
|
-
</p>
|
13
|
-
|
14
|
-
<p>
|
15
|
-
Four: <img src="four.png" title="four" alt="four alt">
|
16
|
-
</p>
|
17
|
-
|
18
|
-
<h1>With links</h1>
|
19
|
-
|
20
|
-
<p>
|
21
|
-
One: <a href="http://localhost"><img src="one.png"></a>
|
22
|
-
</p>
|
23
|
-
|
24
|
-
<p>
|
25
|
-
Two: <a href="http://localhost"><img src="two.png" alt="two"></a>
|
26
|
-
</p>
|
27
|
-
|
28
|
-
<p>
|
29
|
-
Three: <a href="http://localhost"><img src="three.png" title="three"></a>
|
30
|
-
</p>
|
31
|
-
|
32
|
-
<p>
|
33
|
-
Four: <a href="http://localhost"><img src="four.png" title="four" alt="four alt"></a>
|
34
|
-
</p>
|
35
|
-
|
36
|
-
<h1>With links with titles</h1>
|
37
|
-
|
38
|
-
<p>
|
39
|
-
One: <a href="http://localhost" title="one link"><img src="one.png"></a>
|
40
|
-
</p>
|
41
|
-
|
42
|
-
<p>
|
43
|
-
Two: <a href="http://localhost" title="two link"><img src="two.png" alt="two"></a>
|
44
|
-
</p>
|
45
|
-
|
46
|
-
<p>
|
47
|
-
Three: <a href="http://localhost" title="three link"><img src="three.png" title="three"></a>
|
48
|
-
</p>
|
49
|
-
|
50
|
-
<p>
|
51
|
-
Four: <a href="http://localhost" title="four link"><img src="four.png" title="four" alt="four alt"></a>
|
52
|
-
</p>
|
53
|
-
</body>
|
54
|
-
</html>
|
data/spec/examples/images.txt
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
One:
|
2
|
-
|
3
|
-
Two: [two]
|
4
|
-
|
5
|
-
Three: [three]
|
6
|
-
|
7
|
-
Four: [four]
|
8
|
-
|
9
|
-
With links
|
10
|
-
|
11
|
-
One: http://localhost
|
12
|
-
|
13
|
-
Two: [two](http://localhost)
|
14
|
-
|
15
|
-
Three: [three](http://localhost)
|
16
|
-
|
17
|
-
Four: [four](http://localhost)
|
18
|
-
|
19
|
-
With links with titles
|
20
|
-
|
21
|
-
One: [one link](http://localhost)
|
22
|
-
|
23
|
-
Two: [two link](http://localhost)
|
24
|
-
|
25
|
-
Three: [three link](http://localhost)
|
26
|
-
|
27
|
-
Four: [four link](http://localhost)
|
data/spec/examples/invalid.html
DELETED
data/spec/examples/invalid.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
Hello &nbsnbsp; world
|
data/spec/examples/lists.html
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
<h1>List tests</h1>
|
2
|
-
|
3
|
-
<p>
|
4
|
-
Add some lists.
|
5
|
-
</p>
|
6
|
-
|
7
|
-
<ol>
|
8
|
-
<li>one</li>
|
9
|
-
<li>two
|
10
|
-
<li>three
|
11
|
-
</ol>
|
12
|
-
|
13
|
-
<h2>An unordered list</h2>
|
14
|
-
|
15
|
-
<ul>
|
16
|
-
<li>one
|
17
|
-
<li>two</li>
|
18
|
-
<li>three</li>
|
19
|
-
</ul>
|
20
|
-
<ul>
|
21
|
-
<li>one
|
22
|
-
<li>two</li>
|
23
|
-
<li>three</li>
|
24
|
-
</ul>
|
data/spec/examples/lists.txt
DELETED
data/spec/examples/more-anchors.html
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
<h1>Anchor tests</h1>
|
2
|
-
|
3
|
-
<p>
|
4
|
-
Visit http://openiaml.org or <a href="http://openiaml.org">openiaml.org</a> or <a href="http://openiaml.org">http://openiaml.org</a>.
|
5
|
-
</p>
|
6
|
-
|
7
|
-
<p>
|
8
|
-
To visit with SSL, visit https://openiaml.org or <a href="https://openiaml.org">openiaml.org</a> or <a href="https://openiaml.org">https://openiaml.org</a>.
|
9
|
-
</p>
|
10
|
-
|
11
|
-
<p>
|
12
|
-
To mail, email support@openiaml.org or mailto:support@openiaml.org
|
13
|
-
or <a href="mailto:support@openiaml.org">support@openiaml.org</a> or <a href="mailto:support@openiaml.org">mailto:support@openiaml.org</a>.
|
14
|
-
</p>
|
data/spec/examples/more-anchors.txt
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
Anchor tests
|
2
|
-
|
3
|
-
Visit http://openiaml.org or openiaml.org or http://openiaml.org.
|
4
|
-
|
5
|
-
To visit with SSL, visit https://openiaml.org or openiaml.org or https://openiaml.org.
|
6
|
-
|
7
|
-
To mail, email support@openiaml.org or mailto:support@openiaml.org or support@openiaml.org or mailto:support@openiaml.org.
|
data/spec/examples/msoffice.html
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=us-ascii"><meta name=Generator content="Microsoft Word 15 (filtered medium)"><style><!-- /* Font Definitions */ @font-face {font-family:"Cambria Math"; panose-1:2 4 5 3 5 4 6 3 2 4;} @font-face {font-family:Calibri; panose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal {margin:0cm; margin-bottom:.0001pt; font-size:11.0pt; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} a:link, span.MsoHyperlink {mso-style-priority:99; color:#0563C1; text-decoration:underline;} a:visited, span.MsoHyperlinkFollowed {mso-style-priority:99; color:#954F72; text-decoration:underline;} span.EmailStyle17 {mso-style-type:personal-compose; font-family:"Calibri",sans-serif; color:windowtext;} .MsoChpDefault {mso-style-type:export-only; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} @page WordSection1 {size:612.0pt 792.0pt; margin:72.0pt 72.0pt 72.0pt 72.0pt;} div.WordSection1 {page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext="edit" spidmax="1026" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext="edit"> <o:idmap v:ext="edit" data="1" /> </o:shapelayout></xml><![endif]--></head><body lang=EN-GB link="#0563C1" vlink="#954F72"><div class=WordSection1><p class=MsoNormal>Dear html2text,<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>This is an example email that can be used to test html2text conversion of outlook / exchange emails.<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>The addition of <o:p> tags is very annoying!<o:p></o:p></p><p class=MsoNormal>This is a single line 
return<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal><b>This is bold<o:p></o:p></b></p><p class=MsoNormal><i>This is italic<o:p></o:p></i></p><p class=MsoNormal><u>This is underline<o:p></o:p></u></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>Andrew<o:p></o:p></p></div></body></html>
|
data/spec/examples/msoffice.txt
DELETED
@@ -1,12 +0,0 @@
|
|
1
|
-
Dear html2text,
|
2
|
-
|
3
|
-
This is an example email that can be used to test html2text conversion of outlook / exchange emails.
|
4
|
-
|
5
|
-
The addition of <o:p> tags is very annoying!
|
6
|
-
This is a single line return
|
7
|
-
|
8
|
-
This is bold
|
9
|
-
This is italic
|
10
|
-
This is underline
|
11
|
-
|
12
|
-
Andrew
|
data/spec/examples/nbsp.html
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
hello world & people < > &NBSP;
|
data/spec/examples/nbsp.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
hello world & people < > &NBSP;
|
data/spec/examples/nested-divs.html
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
<html>
|
2
|
-
<body>
|
3
|
-
<div>
|
4
|
-
Just two divs
|
5
|
-
</div>
|
6
|
-
<div>
|
7
|
-
Hanging out
|
8
|
-
</div>
|
9
|
-
<div><div><div>Nested divs and line breaks</div></div><br></div>
|
10
|
-
<div><div>Nested divs and line breaks</div>More text<br></div>
|
11
|
-
<div><br></div>
|
12
|
-
<div>Just text</div>
|
13
|
-
<div>Just text<br></div>
|
14
|
-
<div>Just text<br><br></div>
|
15
|
-
This is the end!
|
16
|
-
</body>
|
17
|
-
</html>
|
data/spec/examples/newlines.html
DELETED
@@ -1,50 +0,0 @@
|
|
1
|
-
<html>
|
2
|
-
<body>
|
3
|
-
<div>
|
4
|
-
Hello
|
5
|
-
<br>
|
6
|
-
</div>
|
7
|
-
<div>
|
8
|
-
How are you?
|
9
|
-
<br>
|
10
|
-
</div>
|
11
|
-
|
12
|
-
<p>
|
13
|
-
How are you?
|
14
|
-
<br>
|
15
|
-
</p>
|
16
|
-
|
17
|
-
<p>
|
18
|
-
How are you?
|
19
|
-
<br>
|
20
|
-
</p>
|
21
|
-
|
22
|
-
<div>
|
23
|
-
Just two divs
|
24
|
-
</div>
|
25
|
-
<div>
|
26
|
-
Hanging out
|
27
|
-
</div>
|
28
|
-
|
29
|
-
This is not the end!
|
30
|
-
<div>
|
31
|
-
How are you again?
|
32
|
-
<br>
|
33
|
-
</div>
|
34
|
-
This is the end!
|
35
|
-
<br>
|
36
|
-
Just kidding
|
37
|
-
<h1>Header 1</h1>
|
38
|
-
Some text
|
39
|
-
<hr>
|
40
|
-
Some more text
|
41
|
-
<p>Paragraph tag!</p>
|
42
|
-
<h2>Header 2</h2>
|
43
|
-
<hr>
|
44
|
-
<h3>Header 3</h3>
|
45
|
-
Some text
|
46
|
-
<h4>Header 4</h4>
|
47
|
-
<p>Paragraph tag!</p>
|
48
|
-
Final line
|
49
|
-
</body>
|
50
|
-
</html>
|
data/spec/examples/newlines.txt
DELETED
@@ -1,35 +0,0 @@
|
|
1
|
-
Hello
|
2
|
-
How are you?
|
3
|
-
|
4
|
-
How are you?
|
5
|
-
|
6
|
-
How are you?
|
7
|
-
|
8
|
-
Just two divs
|
9
|
-
Hanging out
|
10
|
-
This is not the end!
|
11
|
-
How are you again?
|
12
|
-
This is the end!
|
13
|
-
Just kidding
|
14
|
-
|
15
|
-
Header 1
|
16
|
-
|
17
|
-
Some text
|
18
|
-
---------------------------------------------------------------
|
19
|
-
Some more text
|
20
|
-
|
21
|
-
Paragraph tag!
|
22
|
-
|
23
|
-
Header 2
|
24
|
-
|
25
|
-
---------------------------------------------------------------
|
26
|
-
|
27
|
-
Header 3
|
28
|
-
|
29
|
-
Some text
|
30
|
-
|
31
|
-
Header 4
|
32
|
-
|
33
|
-
Paragraph tag!
|
34
|
-
|
35
|
-
Final line
|
data/spec/examples/non-breaking-spaces.html
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
these spaces are non-breaking
|
data/spec/examples/non-breaking-spaces.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
these spaces are non-breaking
|
data/spec/examples/pre.html
DELETED
data/spec/examples/pre.txt
DELETED
data/spec/examples/table.html
DELETED
@@ -1,53 +0,0 @@
|
|
1
|
-
<html>
|
2
|
-
<title>Ignored Title</title>
|
3
|
-
<body>
|
4
|
-
<h1>Hello, World!</h1>
|
5
|
-
<table>
|
6
|
-
<thead>
|
7
|
-
<tr>
|
8
|
-
<th>Col A</th>
|
9
|
-
<th>Col B</th>
|
10
|
-
</tr>
|
11
|
-
</thead>
|
12
|
-
<tbody>
|
13
|
-
<tr>
|
14
|
-
<td>
|
15
|
-
Data A1
|
16
|
-
</td>
|
17
|
-
<td>
|
18
|
-
Data B1
|
19
|
-
</td>
|
20
|
-
</tr>
|
21
|
-
<tr>
|
22
|
-
<td>
|
23
|
-
Data A2
|
24
|
-
</td>
|
25
|
-
<td>
|
26
|
-
Data B2
|
27
|
-
</td>
|
28
|
-
</tr>
|
29
|
-
<tr>
|
30
|
-
<td>
|
31
|
-
Data A3
|
32
|
-
</td>
|
33
|
-
<td>
|
34
|
-
Data B4
|
35
|
-
</td>
|
36
|
-
</tr>
|
37
|
-
</tbody>
|
38
|
-
<tfoot>
|
39
|
-
<tr>
|
40
|
-
<td>
|
41
|
-
Total A
|
42
|
-
</td>
|
43
|
-
<td>
|
44
|
-
Total B
|
45
|
-
</td>
|
46
|
-
</tr>
|
47
|
-
|
48
|
-
</tfoot>
|
49
|
-
|
50
|
-
</table>
|
51
|
-
|
52
|
-
</body>
|
53
|
-
</html>
|
data/spec/examples/table.txt
DELETED
data/spec/examples/test3.html
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
test one<br />test two
|
data/spec/examples/test3.txt
DELETED
data/spec/examples/test4.html
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
1<br />2<br />3<br />4<br />5 < 6
|
data/spec/examples/zero-width-non-joiners.html
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
<p>foo‌bar</p>
|
data/spec/examples/zero-width-non-joiners.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
foobar
|
data/spec/examples_spec.rb
DELETED
@@ -1,41 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe Html2Text do
|
4
|
-
describe "#convert" do
|
5
|
-
let(:text) { Html2Text.convert(html) }
|
6
|
-
|
7
|
-
examples = Dir[File.dirname(__FILE__) + "/examples/*.html"]
|
8
|
-
|
9
|
-
examples.each do |filename|
|
10
|
-
context "#{filename}" do
|
11
|
-
let(:html) { File.read(filename) }
|
12
|
-
let(:text_file) { filename.sub(".html", ".txt") }
|
13
|
-
let(:expected) { Html2Text.fix_newlines(File.read(text_file)) }
|
14
|
-
|
15
|
-
it "has an expected output" do
|
16
|
-
expect(File.exist?(text_file)).to eq(true), "'#{text_file}' did not exist"
|
17
|
-
end
|
18
|
-
|
19
|
-
it "converts to text" do
|
20
|
-
# Write the output if it failed, for easier comparison
|
21
|
-
if !text.eql?(expected)
|
22
|
-
File.open(filename.sub(".html", ".output"), 'w') do |fp|
|
23
|
-
fp.write(text)
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
# Quick check, don't try to generate a 500kb+ diff,
|
28
|
-
# which can halt the rspec for minutes+
|
29
|
-
expect(text.length).to eq expected.length if text.length > 10000
|
30
|
-
|
31
|
-
# More complete check
|
32
|
-
expect(text).to eq expected
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
it "has examples to test" do
|
38
|
-
expect(examples.size).to_not eq(0)
|
39
|
-
end
|
40
|
-
end
|
41
|
-
end
|
data/spec/html2text_spec.rb
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe Html2Text do
|
4
|
-
describe "#convert" do
|
5
|
-
let(:text) { Html2Text.convert(html) }
|
6
|
-
|
7
|
-
context "an empty line" do
|
8
|
-
let(:html) { "" }
|
9
|
-
|
10
|
-
it "is an empty line" do
|
11
|
-
expect(text).to eq("")
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
context "a simple string" do
|
16
|
-
let(:html) { "hello world" }
|
17
|
-
|
18
|
-
it "is an empty line" do
|
19
|
-
expect(text).to eq("hello world")
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
context "input value is non-string" do
|
24
|
-
let(:html) { nil }
|
25
|
-
it '(nil)' do
|
26
|
-
expect(text).to eq("")
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
context "input value is non-string" do
|
31
|
-
let(:html) { 1234 }
|
32
|
-
it "(number)" do
|
33
|
-
expect(text).to eq("1234")
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
context "input value is non-string" do
|
38
|
-
let(:html) { 1234.5600 }
|
39
|
-
it "(float number)" do
|
40
|
-
expect(text).to eq("1234.56")
|
41
|
-
end
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
describe "#remove_leading_and_trailing_whitespace" do
|
46
|
-
let(:subject) { Html2Text.new(nil).remove_leading_and_trailing_whitespace(input) }
|
47
|
-
|
48
|
-
context "an empty string" do
|
49
|
-
let(:input) { "" }
|
50
|
-
it { is_expected.to eq("") }
|
51
|
-
end
|
52
|
-
|
53
|
-
context "many new lines" do
|
54
|
-
let(:input) { "hello\n world \n yes" }
|
55
|
-
it { is_expected.to eq("hello\nworld\nyes") }
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
data/spec/spec_helper.rb
DELETED