html2text 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +13 -0
  3. data/README.md +5 -5
  4. data/lib/html2text/version.rb +3 -1
  5. data/lib/html2text.rb +108 -106
  6. metadata +65 -110
  7. data/spec/examples/anchors.html +0 -12
  8. data/spec/examples/anchors.txt +0 -5
  9. data/spec/examples/basic.html +0 -21
  10. data/spec/examples/basic.txt +0 -15
  11. data/spec/examples/dom-processing.html +0 -8
  12. data/spec/examples/dom-processing.txt +0 -1
  13. data/spec/examples/empty.html +0 -0
  14. data/spec/examples/empty.txt +0 -0
  15. data/spec/examples/full_email.html +0 -220
  16. data/spec/examples/full_email.txt +0 -54
  17. data/spec/examples/huge-msoffice.html +0 -1
  18. data/spec/examples/huge-msoffice.txt +0 -25872
  19. data/spec/examples/images.html +0 -54
  20. data/spec/examples/images.txt +0 -27
  21. data/spec/examples/invalid.html +0 -4
  22. data/spec/examples/invalid.txt +0 -1
  23. data/spec/examples/lists.html +0 -24
  24. data/spec/examples/lists.txt +0 -17
  25. data/spec/examples/more-anchors.html +0 -14
  26. data/spec/examples/more-anchors.txt +0 -7
  27. data/spec/examples/msoffice.html +0 -1
  28. data/spec/examples/msoffice.txt +0 -12
  29. data/spec/examples/nbsp.html +0 -1
  30. data/spec/examples/nbsp.txt +0 -1
  31. data/spec/examples/nested-divs.html +0 -17
  32. data/spec/examples/nested-divs.txt +0 -12
  33. data/spec/examples/newlines.html +0 -50
  34. data/spec/examples/newlines.txt +0 -35
  35. data/spec/examples/non-breaking-spaces.html +0 -1
  36. data/spec/examples/non-breaking-spaces.txt +0 -1
  37. data/spec/examples/pre.html +0 -10
  38. data/spec/examples/pre.txt +0 -8
  39. data/spec/examples/table.html +0 -53
  40. data/spec/examples/table.txt +0 -7
  41. data/spec/examples/test3.html +0 -1
  42. data/spec/examples/test3.txt +0 -2
  43. data/spec/examples/test4.html +0 -1
  44. data/spec/examples/test4.txt +0 -5
  45. data/spec/examples/utf8-example.html +0 -4
  46. data/spec/examples/utf8-example.txt +0 -2
  47. data/spec/examples/windows-1252-example.html +0 -4
  48. data/spec/examples/windows-1252-example.txt +0 -2
  49. data/spec/examples/zero-width-non-joiners.html +0 -1
  50. data/spec/examples/zero-width-non-joiners.txt +0 -1
  51. data/spec/examples_spec.rb +0 -41
  52. data/spec/html2text_spec.rb +0 -58
  53. data/spec/spec_helper.rb +0 -4
@@ -1,54 +0,0 @@
1
- <body>
2
- <p>
3
- One: <img src="one.png">
4
- </p>
5
-
6
- <p>
7
- Two: <img src="two.png" alt="two">
8
- </p>
9
-
10
- <p>
11
- Three: <img src="three.png" title="three">
12
- </p>
13
-
14
- <p>
15
- Four: <img src="four.png" title="four" alt="four alt">
16
- </p>
17
-
18
- <h1>With links</h1>
19
-
20
- <p>
21
- One: <a href="http://localhost"><img src="one.png"></a>
22
- </p>
23
-
24
- <p>
25
- Two: <a href="http://localhost"><img src="two.png" alt="two"></a>
26
- </p>
27
-
28
- <p>
29
- Three: <a href="http://localhost"><img src="three.png" title="three"></a>
30
- </p>
31
-
32
- <p>
33
- Four: <a href="http://localhost"><img src="four.png" title="four" alt="four alt"></a>
34
- </p>
35
-
36
- <h1>With links with titles</h1>
37
-
38
- <p>
39
- One: <a href="http://localhost" title="one link"><img src="one.png"></a>
40
- </p>
41
-
42
- <p>
43
- Two: <a href="http://localhost" title="two link"><img src="two.png" alt="two"></a>
44
- </p>
45
-
46
- <p>
47
- Three: <a href="http://localhost" title="three link"><img src="three.png" title="three"></a>
48
- </p>
49
-
50
- <p>
51
- Four: <a href="http://localhost" title="four link"><img src="four.png" title="four" alt="four alt"></a>
52
- </p>
53
- </body>
54
- </html>
@@ -1,27 +0,0 @@
1
- One:
2
-
3
- Two: [two]
4
-
5
- Three: [three]
6
-
7
- Four: [four]
8
-
9
- With links
10
-
11
- One: http://localhost
12
-
13
- Two: [two](http://localhost)
14
-
15
- Three: [three](http://localhost)
16
-
17
- Four: [four](http://localhost)
18
-
19
- With links with titles
20
-
21
- One: [one link](http://localhost)
22
-
23
- Two: [two link](http://localhost)
24
-
25
- Three: [three link](http://localhost)
26
-
27
- Four: [four link](http://localhost)
@@ -1,4 +0,0 @@
1
- <b>Hello &nbsnbsp; world</b>
2
- <div class=">
3
- Error
4
- </div>
@@ -1 +0,0 @@
1
- Hello &nbsnbsp; world
@@ -1,24 +0,0 @@
1
- <h1>List tests</h1>
2
-
3
- <p>
4
- Add some lists.
5
- </p>
6
-
7
- <ol>
8
- <li>one</li>
9
- <li>two
10
- <li>three
11
- </ol>
12
-
13
- <h2>An unordered list</h2>
14
-
15
- <ul>
16
- <li>one
17
- <li>two</li>
18
- <li>three</li>
19
- </ul>
20
- <ul>
21
- <li>one
22
- <li>two</li>
23
- <li>three</li>
24
- </ul>
@@ -1,17 +0,0 @@
1
- List tests
2
-
3
- Add some lists.
4
-
5
- - one
6
- - two
7
- - three
8
-
9
- An unordered list
10
-
11
- - one
12
- - two
13
- - three
14
-
15
- - one
16
- - two
17
- - three
@@ -1,14 +0,0 @@
1
- <h1>Anchor tests</h1>
2
-
3
- <p>
4
- Visit http://openiaml.org or <a href="http://openiaml.org">openiaml.org</a> or <a href="http://openiaml.org">http://openiaml.org</a>.
5
- </p>
6
-
7
- <p>
8
- To visit with SSL, visit https://openiaml.org or <a href="https://openiaml.org">openiaml.org</a> or <a href="https://openiaml.org">https://openiaml.org</a>.
9
- </p>
10
-
11
- <p>
12
- To mail, email support@openiaml.org or mailto:support@openiaml.org
13
- or <a href="mailto:support@openiaml.org">support@openiaml.org</a> or <a href="mailto:support@openiaml.org">mailto:support@openiaml.org</a>.
14
- </p>
@@ -1,7 +0,0 @@
1
- Anchor tests
2
-
3
- Visit http://openiaml.org or openiaml.org or http://openiaml.org.
4
-
5
- To visit with SSL, visit https://openiaml.org or openiaml.org or https://openiaml.org.
6
-
7
- To mail, email support@openiaml.org or mailto:support@openiaml.org or support@openiaml.org or mailto:support@openiaml.org.
@@ -1 +0,0 @@
1
- <html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=us-ascii"><meta name=Generator content="Microsoft Word 15 (filtered medium)"><style><!-- /* Font Definitions */ @font-face {font-family:"Cambria Math"; panose-1:2 4 5 3 5 4 6 3 2 4;} @font-face {font-family:Calibri; panose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal {margin:0cm; margin-bottom:.0001pt; font-size:11.0pt; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} a:link, span.MsoHyperlink {mso-style-priority:99; color:#0563C1; text-decoration:underline;} a:visited, span.MsoHyperlinkFollowed {mso-style-priority:99; color:#954F72; text-decoration:underline;} span.EmailStyle17 {mso-style-type:personal-compose; font-family:"Calibri",sans-serif; color:windowtext;} .MsoChpDefault {mso-style-type:export-only; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} @page WordSection1 {size:612.0pt 792.0pt; margin:72.0pt 72.0pt 72.0pt 72.0pt;} div.WordSection1 {page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext="edit" spidmax="1026" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext="edit"> <o:idmap v:ext="edit" data="1" /> </o:shapelayout></xml><![endif]--></head><body lang=EN-GB link="#0563C1" vlink="#954F72"><div class=WordSection1><p class=MsoNormal>Dear html2text,<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>This is an example email that can be used to test html2text conversion of outlook / exchange emails.<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>The addition of &lt;o:p&gt; tags is very annoying!<o:p></o:p></p><p class=MsoNormal>This is a single line return<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal><b>This is bold<o:p></o:p></b></p><p class=MsoNormal><i>This is italic<o:p></o:p></i></p><p class=MsoNormal><u>This is underline<o:p></o:p></u></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>Andrew<o:p></o:p></p></div></body></html>
@@ -1,12 +0,0 @@
1
- Dear html2text,
2
-
3
- This is an example email that can be used to test html2text conversion of outlook / exchange emails.
4
-
5
- The addition of <o:p> tags is very annoying!
6
- This is a single line return
7
-
8
- This is bold
9
- This is italic
10
- This is underline
11
-
12
- Andrew
@@ -1 +0,0 @@
1
- hello &nbsp; world &amp; people &lt; &gt; &NBSP;
@@ -1 +0,0 @@
1
- hello world & people < > &NBSP;
@@ -1,17 +0,0 @@
1
- <html>
2
- <body>
3
- <div>
4
- Just two divs
5
- </div>
6
- <div>
7
- Hanging out
8
- </div>
9
- <div><div><div>Nested divs and line breaks</div></div><br></div>
10
- <div><div>Nested divs and line breaks</div>More text<br></div>
11
- <div><br></div>
12
- <div>Just text</div>
13
- <div>Just text<br></div>
14
- <div>Just text<br><br></div>
15
- This is the end!
16
- </body>
17
- </html>
@@ -1,12 +0,0 @@
1
- Just two divs
2
- Hanging out
3
- Nested divs and line breaks
4
-
5
- Nested divs and line breaks
6
- More text
7
-
8
- Just text
9
- Just text
10
- Just text
11
-
12
- This is the end!
@@ -1,50 +0,0 @@
1
- <html>
2
- <body>
3
- <div>
4
- Hello
5
- <br>
6
- </div>
7
- <div>
8
- How are you?
9
- <br>
10
- </div>
11
-
12
- <p>
13
- How are you?
14
- <br>
15
- </p>
16
-
17
- <p>
18
- How are you?
19
- <br>
20
- </p>
21
-
22
- <div>
23
- Just two divs
24
- </div>
25
- <div>
26
- Hanging out
27
- </div>
28
-
29
- This is not the end!
30
- <div>
31
- How are you again?
32
- <br>
33
- </div>
34
- This is the end!
35
- <br>
36
- Just kidding
37
- <h1>Header 1</h1>
38
- Some text
39
- <hr>
40
- Some more text
41
- <p>Paragraph tag!</p>
42
- <h2>Header 2</h2>
43
- <hr>
44
- <h3>Header 3</h3>
45
- Some text
46
- <h4>Header 4</h4>
47
- <p>Paragraph tag!</p>
48
- Final line
49
- </body>
50
- </html>
@@ -1,35 +0,0 @@
1
- Hello
2
- How are you?
3
-
4
- How are you?
5
-
6
- How are you?
7
-
8
- Just two divs
9
- Hanging out
10
- This is not the end!
11
- How are you again?
12
- This is the end!
13
- Just kidding
14
-
15
- Header 1
16
-
17
- Some text
18
- ---------------------------------------------------------------
19
- Some more text
20
-
21
- Paragraph tag!
22
-
23
- Header 2
24
-
25
- ---------------------------------------------------------------
26
-
27
- Header 3
28
-
29
- Some text
30
-
31
- Header 4
32
-
33
- Paragraph tag!
34
-
35
- Final line
@@ -1 +0,0 @@
1
- these spaces are non-breaking
@@ -1 +0,0 @@
1
- these spaces are non-breaking
@@ -1,10 +0,0 @@
1
- Here is the code
2
- <pre>
3
- #include &lt;stdlib.h&gt;
4
- #include &lt;stdio.h&gt;
5
-
6
- int main(){
7
- return 0;
8
- };
9
-
10
- </pre>
@@ -1,8 +0,0 @@
1
- Here is the code
2
-
3
- #include <stdlib.h>
4
- #include <stdio.h>
5
-
6
- int main(){
7
- return 0;
8
- };
@@ -1,53 +0,0 @@
1
- <html>
2
- <title>Ignored Title</title>
3
- <body>
4
- <h1>Hello, World!</h1>
5
- <table>
6
- <thead>
7
- <tr>
8
- <th>Col A</th>
9
- <th>Col B</th>
10
- </tr>
11
- </thead>
12
- <tbody>
13
- <tr>
14
- <td>
15
- Data A1
16
- </td>
17
- <td>
18
- Data B1
19
- </td>
20
- </tr>
21
- <tr>
22
- <td>
23
- Data A2
24
- </td>
25
- <td>
26
- Data B2
27
- </td>
28
- </tr>
29
- <tr>
30
- <td>
31
- Data A3
32
- </td>
33
- <td>
34
- Data B4
35
- </td>
36
- </tr>
37
- </tbody>
38
- <tfoot>
39
- <tr>
40
- <td>
41
- Total A
42
- </td>
43
- <td>
44
- Total B
45
- </td>
46
- </tr>
47
-
48
- </tfoot>
49
-
50
- </table>
51
-
52
- </body>
53
- </html>
@@ -1,7 +0,0 @@
1
- Hello, World!
2
-
3
- Col A Col B
4
- Data A1 Data B1
5
- Data A2 Data B2
6
- Data A3 Data B4
7
- Total A Total B
@@ -1 +0,0 @@
1
- test one<br />test two
@@ -1,2 +0,0 @@
1
- test one
2
- test two
@@ -1 +0,0 @@
1
- 1<br />2<br />3<br />4<br />5 &lt; 6
@@ -1,5 +0,0 @@
1
- 1
2
- 2
3
- 3
4
- 4
5
- 5 < 6
@@ -1,4 +0,0 @@
1
- <ul>
2
- <li>ÅÄÖ</li>
3
- <li>åäö</li>
4
- </ul>
@@ -1,2 +0,0 @@
1
- - ÅÄÖ
2
- - åäö
@@ -1,4 +0,0 @@
1
- <ul>
2
- <li>���</li>
3
- <li>���</li>
4
- </ul>
@@ -1,2 +0,0 @@
1
- - ÅÄÖ
2
- - åäö
@@ -1 +0,0 @@
1
- <p>foo&zwnj;bar</p>
@@ -1 +0,0 @@
1
- foobar
@@ -1,41 +0,0 @@
1
- require "spec_helper"
2
-
3
- describe Html2Text do
4
- describe "#convert" do
5
- let(:text) { Html2Text.convert(html) }
6
-
7
- examples = Dir[File.dirname(__FILE__) + "/examples/*.html"]
8
-
9
- examples.each do |filename|
10
- context "#{filename}" do
11
- let(:html) { File.read(filename) }
12
- let(:text_file) { filename.sub(".html", ".txt") }
13
- let(:expected) { Html2Text.fix_newlines(File.read(text_file)) }
14
-
15
- it "has an expected output" do
16
- expect(File.exist?(text_file)).to eq(true), "'#{text_file}' did not exist"
17
- end
18
-
19
- it "converts to text" do
20
- # Write the output if it failed, for easier comparison
21
- if !text.eql?(expected)
22
- File.open(filename.sub(".html", ".output"), 'w') do |fp|
23
- fp.write(text)
24
- end
25
- end
26
-
27
- # Quick check, don't try to generate a 500kb+ diff,
28
- # which can halt the rspec for minutes+
29
- expect(text.length).to eq expected.length if text.length > 10000
30
-
31
- # More complete check
32
- expect(text).to eq expected
33
- end
34
- end
35
- end
36
-
37
- it "has examples to test" do
38
- expect(examples.size).to_not eq(0)
39
- end
40
- end
41
- end
@@ -1,58 +0,0 @@
1
- require "spec_helper"
2
-
3
- describe Html2Text do
4
- describe "#convert" do
5
- let(:text) { Html2Text.convert(html) }
6
-
7
- context "an empty line" do
8
- let(:html) { "" }
9
-
10
- it "is an empty line" do
11
- expect(text).to eq("")
12
- end
13
- end
14
-
15
- context "a simple string" do
16
- let(:html) { "hello world" }
17
-
18
- it "is an empty line" do
19
- expect(text).to eq("hello world")
20
- end
21
- end
22
-
23
- context "input value is non-string" do
24
- let(:html) { nil }
25
- it '(nil)' do
26
- expect(text).to eq("")
27
- end
28
- end
29
-
30
- context "input value is non-string" do
31
- let(:html) { 1234 }
32
- it "(number)" do
33
- expect(text).to eq("1234")
34
- end
35
- end
36
-
37
- context "input value is non-string" do
38
- let(:html) { 1234.5600 }
39
- it "(float number)" do
40
- expect(text).to eq("1234.56")
41
- end
42
- end
43
- end
44
-
45
- describe "#remove_leading_and_trailing_whitespace" do
46
- let(:subject) { Html2Text.new(nil).remove_leading_and_trailing_whitespace(input) }
47
-
48
- context "an empty string" do
49
- let(:input) { "" }
50
- it { is_expected.to eq("") }
51
- end
52
-
53
- context "many new lines" do
54
- let(:input) { "hello\n world \n yes" }
55
- it { is_expected.to eq("hello\nworld\nyes") }
56
- end
57
- end
58
- end
data/spec/spec_helper.rb DELETED
@@ -1,4 +0,0 @@
1
- require "rspec"
2
- require "rspec/collection_matchers"
3
-
4
- require File.join(File.dirname(__FILE__), "..", "lib", "html2text")