html2text 0.3.0 → 0.4.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +26 -4
  3. data/README.md +5 -5
  4. data/lib/html2text/version.rb +3 -1
  5. data/lib/html2text.rb +108 -106
  6. metadata +78 -109
  7. data/spec/examples/anchors.html +0 -12
  8. data/spec/examples/anchors.txt +0 -5
  9. data/spec/examples/basic.html +0 -21
  10. data/spec/examples/basic.txt +0 -15
  11. data/spec/examples/dom-processing.html +0 -8
  12. data/spec/examples/dom-processing.txt +0 -1
  13. data/spec/examples/empty.html +0 -0
  14. data/spec/examples/empty.txt +0 -0
  15. data/spec/examples/full_email.html +0 -220
  16. data/spec/examples/full_email.txt +0 -54
  17. data/spec/examples/huge-msoffice.html +0 -1
  18. data/spec/examples/huge-msoffice.txt +0 -25872
  19. data/spec/examples/images.html +0 -54
  20. data/spec/examples/images.txt +0 -27
  21. data/spec/examples/invalid.html +0 -4
  22. data/spec/examples/invalid.txt +0 -1
  23. data/spec/examples/lists.html +0 -24
  24. data/spec/examples/lists.txt +0 -17
  25. data/spec/examples/more-anchors.html +0 -14
  26. data/spec/examples/more-anchors.txt +0 -7
  27. data/spec/examples/msoffice.html +0 -1
  28. data/spec/examples/msoffice.txt +0 -12
  29. data/spec/examples/nbsp.html +0 -1
  30. data/spec/examples/nbsp.txt +0 -1
  31. data/spec/examples/nested-divs.html +0 -17
  32. data/spec/examples/nested-divs.txt +0 -12
  33. data/spec/examples/newlines.html +0 -50
  34. data/spec/examples/newlines.txt +0 -35
  35. data/spec/examples/non-breaking-spaces.html +0 -1
  36. data/spec/examples/non-breaking-spaces.txt +0 -1
  37. data/spec/examples/pre.html +0 -10
  38. data/spec/examples/pre.txt +0 -8
  39. data/spec/examples/table.html +0 -53
  40. data/spec/examples/table.txt +0 -7
  41. data/spec/examples/test3.html +0 -1
  42. data/spec/examples/test3.txt +0 -2
  43. data/spec/examples/test4.html +0 -1
  44. data/spec/examples/test4.txt +0 -5
  45. data/spec/examples/utf8-example.html +0 -4
  46. data/spec/examples/utf8-example.txt +0 -2
  47. data/spec/examples/windows-1252-example.html +0 -4
  48. data/spec/examples/windows-1252-example.txt +0 -2
  49. data/spec/examples/zero-width-non-joiners.html +0 -1
  50. data/spec/examples/zero-width-non-joiners.txt +0 -1
  51. data/spec/examples_spec.rb +0 -41
  52. data/spec/html2text_spec.rb +0 -58
  53. data/spec/spec_helper.rb +0 -4
@@ -1,54 +0,0 @@
1
- <body>
2
- <p>
3
- One: <img src="one.png">
4
- </p>
5
-
6
- <p>
7
- Two: <img src="two.png" alt="two">
8
- </p>
9
-
10
- <p>
11
- Three: <img src="three.png" title="three">
12
- </p>
13
-
14
- <p>
15
- Four: <img src="four.png" title="four" alt="four alt">
16
- </p>
17
-
18
- <h1>With links</h1>
19
-
20
- <p>
21
- One: <a href="http://localhost"><img src="one.png"></a>
22
- </p>
23
-
24
- <p>
25
- Two: <a href="http://localhost"><img src="two.png" alt="two"></a>
26
- </p>
27
-
28
- <p>
29
- Three: <a href="http://localhost"><img src="three.png" title="three"></a>
30
- </p>
31
-
32
- <p>
33
- Four: <a href="http://localhost"><img src="four.png" title="four" alt="four alt"></a>
34
- </p>
35
-
36
- <h1>With links with titles</h1>
37
-
38
- <p>
39
- One: <a href="http://localhost" title="one link"><img src="one.png"></a>
40
- </p>
41
-
42
- <p>
43
- Two: <a href="http://localhost" title="two link"><img src="two.png" alt="two"></a>
44
- </p>
45
-
46
- <p>
47
- Three: <a href="http://localhost" title="three link"><img src="three.png" title="three"></a>
48
- </p>
49
-
50
- <p>
51
- Four: <a href="http://localhost" title="four link"><img src="four.png" title="four" alt="four alt"></a>
52
- </p>
53
- </body>
54
- </html>
@@ -1,27 +0,0 @@
1
- One:
2
-
3
- Two: [two]
4
-
5
- Three: [three]
6
-
7
- Four: [four]
8
-
9
- With links
10
-
11
- One: http://localhost
12
-
13
- Two: [two](http://localhost)
14
-
15
- Three: [three](http://localhost)
16
-
17
- Four: [four](http://localhost)
18
-
19
- With links with titles
20
-
21
- One: [one link](http://localhost)
22
-
23
- Two: [two link](http://localhost)
24
-
25
- Three: [three link](http://localhost)
26
-
27
- Four: [four link](http://localhost)
@@ -1,4 +0,0 @@
1
- <b>Hello &nbsnbsp; world</b>
2
- <div class=">
3
- Error
4
- </div>
@@ -1 +0,0 @@
1
- Hello &nbsnbsp; world
@@ -1,24 +0,0 @@
1
- <h1>List tests</h1>
2
-
3
- <p>
4
- Add some lists.
5
- </p>
6
-
7
- <ol>
8
- <li>one</li>
9
- <li>two
10
- <li>three
11
- </ol>
12
-
13
- <h2>An unordered list</h2>
14
-
15
- <ul>
16
- <li>one
17
- <li>two</li>
18
- <li>three</li>
19
- </ul>
20
- <ul>
21
- <li>one
22
- <li>two</li>
23
- <li>three</li>
24
- </ul>
@@ -1,17 +0,0 @@
1
- List tests
2
-
3
- Add some lists.
4
-
5
- - one
6
- - two
7
- - three
8
-
9
- An unordered list
10
-
11
- - one
12
- - two
13
- - three
14
-
15
- - one
16
- - two
17
- - three
@@ -1,14 +0,0 @@
1
- <h1>Anchor tests</h1>
2
-
3
- <p>
4
- Visit http://openiaml.org or <a href="http://openiaml.org">openiaml.org</a> or <a href="http://openiaml.org">http://openiaml.org</a>.
5
- </p>
6
-
7
- <p>
8
- To visit with SSL, visit https://openiaml.org or <a href="https://openiaml.org">openiaml.org</a> or <a href="https://openiaml.org">https://openiaml.org</a>.
9
- </p>
10
-
11
- <p>
12
- To mail, email support@openiaml.org or mailto:support@openiaml.org
13
- or <a href="mailto:support@openiaml.org">support@openiaml.org</a> or <a href="mailto:support@openiaml.org">mailto:support@openiaml.org</a>.
14
- </p>
@@ -1,7 +0,0 @@
1
- Anchor tests
2
-
3
- Visit http://openiaml.org or openiaml.org or http://openiaml.org.
4
-
5
- To visit with SSL, visit https://openiaml.org or openiaml.org or https://openiaml.org.
6
-
7
- To mail, email support@openiaml.org or mailto:support@openiaml.org or support@openiaml.org or mailto:support@openiaml.org.
@@ -1 +0,0 @@
1
- <html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=us-ascii"><meta name=Generator content="Microsoft Word 15 (filtered medium)"><style><!-- /* Font Definitions */ @font-face {font-family:"Cambria Math"; panose-1:2 4 5 3 5 4 6 3 2 4;} @font-face {font-family:Calibri; panose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal {margin:0cm; margin-bottom:.0001pt; font-size:11.0pt; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} a:link, span.MsoHyperlink {mso-style-priority:99; color:#0563C1; text-decoration:underline;} a:visited, span.MsoHyperlinkFollowed {mso-style-priority:99; color:#954F72; text-decoration:underline;} span.EmailStyle17 {mso-style-type:personal-compose; font-family:"Calibri",sans-serif; color:windowtext;} .MsoChpDefault {mso-style-type:export-only; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} @page WordSection1 {size:612.0pt 792.0pt; margin:72.0pt 72.0pt 72.0pt 72.0pt;} div.WordSection1 {page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext="edit" spidmax="1026" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext="edit"> <o:idmap v:ext="edit" data="1" /> </o:shapelayout></xml><![endif]--></head><body lang=EN-GB link="#0563C1" vlink="#954F72"><div class=WordSection1><p class=MsoNormal>Dear html2text,<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>This is an example email that can be used to test html2text conversion of outlook / exchange emails.<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>The addition of &lt;o:p&gt; tags is very annoying!<o:p></o:p></p><p class=MsoNormal>This is a 
single line return<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal><b>This is bold<o:p></o:p></b></p><p class=MsoNormal><i>This is italic<o:p></o:p></i></p><p class=MsoNormal><u>This is underline<o:p></o:p></u></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>Andrew<o:p></o:p></p></div></body></html>
@@ -1,12 +0,0 @@
1
- Dear html2text,
2
-
3
- This is an example email that can be used to test html2text conversion of outlook / exchange emails.
4
-
5
- The addition of <o:p> tags is very annoying!
6
- This is a single line return
7
-
8
- This is bold
9
- This is italic
10
- This is underline
11
-
12
- Andrew
@@ -1 +0,0 @@
1
- hello &nbsp; world &amp; people &lt; &gt; &NBSP;
@@ -1 +0,0 @@
1
- hello world & people < > &NBSP;
@@ -1,17 +0,0 @@
1
- <html>
2
- <body>
3
- <div>
4
- Just two divs
5
- </div>
6
- <div>
7
- Hanging out
8
- </div>
9
- <div><div><div>Nested divs and line breaks</div></div><br></div>
10
- <div><div>Nested divs and line breaks</div>More text<br></div>
11
- <div><br></div>
12
- <div>Just text</div>
13
- <div>Just text<br></div>
14
- <div>Just text<br><br></div>
15
- This is the end!
16
- </body>
17
- </html>
@@ -1,12 +0,0 @@
1
- Just two divs
2
- Hanging out
3
- Nested divs and line breaks
4
-
5
- Nested divs and line breaks
6
- More text
7
-
8
- Just text
9
- Just text
10
- Just text
11
-
12
- This is the end!
@@ -1,50 +0,0 @@
1
- <html>
2
- <body>
3
- <div>
4
- Hello
5
- <br>
6
- </div>
7
- <div>
8
- How are you?
9
- <br>
10
- </div>
11
-
12
- <p>
13
- How are you?
14
- <br>
15
- </p>
16
-
17
- <p>
18
- How are you?
19
- <br>
20
- </p>
21
-
22
- <div>
23
- Just two divs
24
- </div>
25
- <div>
26
- Hanging out
27
- </div>
28
-
29
- This is not the end!
30
- <div>
31
- How are you again?
32
- <br>
33
- </div>
34
- This is the end!
35
- <br>
36
- Just kidding
37
- <h1>Header 1</h1>
38
- Some text
39
- <hr>
40
- Some more text
41
- <p>Paragraph tag!</p>
42
- <h2>Header 2</h2>
43
- <hr>
44
- <h3>Header 3</h3>
45
- Some text
46
- <h4>Header 4</h4>
47
- <p>Paragraph tag!</p>
48
- Final line
49
- </body>
50
- </html>
@@ -1,35 +0,0 @@
1
- Hello
2
- How are you?
3
-
4
- How are you?
5
-
6
- How are you?
7
-
8
- Just two divs
9
- Hanging out
10
- This is not the end!
11
- How are you again?
12
- This is the end!
13
- Just kidding
14
-
15
- Header 1
16
-
17
- Some text
18
- ---------------------------------------------------------------
19
- Some more text
20
-
21
- Paragraph tag!
22
-
23
- Header 2
24
-
25
- ---------------------------------------------------------------
26
-
27
- Header 3
28
-
29
- Some text
30
-
31
- Header 4
32
-
33
- Paragraph tag!
34
-
35
- Final line
@@ -1 +0,0 @@
1
- these spaces are non-breaking
@@ -1 +0,0 @@
1
- these spaces are non-breaking
@@ -1,10 +0,0 @@
1
- Here is the code
2
- <pre>
3
- #include &lt;stdlib.h&gt;
4
- #include &lt;stdio.h&gt;
5
-
6
- int main(){
7
- return 0;
8
- };
9
-
10
- </pre>
@@ -1,8 +0,0 @@
1
- Here is the code
2
-
3
- #include <stdlib.h>
4
- #include <stdio.h>
5
-
6
- int main(){
7
- return 0;
8
- };
@@ -1,53 +0,0 @@
1
- <html>
2
- <title>Ignored Title</title>
3
- <body>
4
- <h1>Hello, World!</h1>
5
- <table>
6
- <thead>
7
- <tr>
8
- <th>Col A</th>
9
- <th>Col B</th>
10
- </tr>
11
- </thead>
12
- <tbody>
13
- <tr>
14
- <td>
15
- Data A1
16
- </td>
17
- <td>
18
- Data B1
19
- </td>
20
- </tr>
21
- <tr>
22
- <td>
23
- Data A2
24
- </td>
25
- <td>
26
- Data B2
27
- </td>
28
- </tr>
29
- <tr>
30
- <td>
31
- Data A3
32
- </td>
33
- <td>
34
- Data B4
35
- </td>
36
- </tr>
37
- </tbody>
38
- <tfoot>
39
- <tr>
40
- <td>
41
- Total A
42
- </td>
43
- <td>
44
- Total B
45
- </td>
46
- </tr>
47
-
48
- </tfoot>
49
-
50
- </table>
51
-
52
- </body>
53
- </html>
@@ -1,7 +0,0 @@
1
- Hello, World!
2
-
3
- Col A Col B
4
- Data A1 Data B1
5
- Data A2 Data B2
6
- Data A3 Data B4
7
- Total A Total B
@@ -1 +0,0 @@
1
- test one<br />test two
@@ -1,2 +0,0 @@
1
- test one
2
- test two
@@ -1 +0,0 @@
1
- 1<br />2<br />3<br />4<br />5 &lt; 6
@@ -1,5 +0,0 @@
1
- 1
2
- 2
3
- 3
4
- 4
5
- 5 < 6
@@ -1,4 +0,0 @@
1
- <ul>
2
- <li>ÅÄÖ</li>
3
- <li>åäö</li>
4
- </ul>
@@ -1,2 +0,0 @@
1
- - ÅÄÖ
2
- - åäö
@@ -1,4 +0,0 @@
1
- <ul>
2
- <li>ÅÄÖ</li>
3
- <li>åäö</li>
4
- </ul>
@@ -1,2 +0,0 @@
1
- - ÅÄÖ
2
- - åäö
@@ -1 +0,0 @@
1
- <p>foo&zwnj;bar</p>
@@ -1 +0,0 @@
1
- foobar
@@ -1,41 +0,0 @@
1
- require "spec_helper"
2
-
3
- describe Html2Text do
4
- describe "#convert" do
5
- let(:text) { Html2Text.convert(html) }
6
-
7
- examples = Dir[File.dirname(__FILE__) + "/examples/*.html"]
8
-
9
- examples.each do |filename|
10
- context "#{filename}" do
11
- let(:html) { File.read(filename) }
12
- let(:text_file) { filename.sub(".html", ".txt") }
13
- let(:expected) { Html2Text.fix_newlines(File.read(text_file)) }
14
-
15
- it "has an expected output" do
16
- expect(File.exist?(text_file)).to eq(true), "'#{text_file}' did not exist"
17
- end
18
-
19
- it "converts to text" do
20
- # Write the output if it failed, for easier comparison
21
- if !text.eql?(expected)
22
- File.open(filename.sub(".html", ".output"), 'w') do |fp|
23
- fp.write(text)
24
- end
25
- end
26
-
27
- # Quick check, don't try to generate a 500kb+ diff,
28
- # which can halt the rspec for minutes+
29
- expect(text.length).to eq expected.length if text.length > 10000
30
-
31
- # More complete check
32
- expect(text).to eq expected
33
- end
34
- end
35
- end
36
-
37
- it "has examples to test" do
38
- expect(examples.size).to_not eq(0)
39
- end
40
- end
41
- end
@@ -1,58 +0,0 @@
1
- require "spec_helper"
2
-
3
- describe Html2Text do
4
- describe "#convert" do
5
- let(:text) { Html2Text.convert(html) }
6
-
7
- context "an empty line" do
8
- let(:html) { "" }
9
-
10
- it "is an empty line" do
11
- expect(text).to eq("")
12
- end
13
- end
14
-
15
- context "a simple string" do
16
- let(:html) { "hello world" }
17
-
18
- it "is an empty line" do
19
- expect(text).to eq("hello world")
20
- end
21
- end
22
-
23
- context "input value is non-string" do
24
- let(:html) { nil }
25
- it '(nil)' do
26
- expect(text).to eq("")
27
- end
28
- end
29
-
30
- context "input value is non-string" do
31
- let(:html) { 1234 }
32
- it "(number)" do
33
- expect(text).to eq("1234")
34
- end
35
- end
36
-
37
- context "input value is non-string" do
38
- let(:html) { 1234.5600 }
39
- it "(float number)" do
40
- expect(text).to eq("1234.56")
41
- end
42
- end
43
- end
44
-
45
- describe "#remove_leading_and_trailing_whitespace" do
46
- let(:subject) { Html2Text.new(nil).remove_leading_and_trailing_whitespace(input) }
47
-
48
- context "an empty string" do
49
- let(:input) { "" }
50
- it { is_expected.to eq("") }
51
- end
52
-
53
- context "many new lines" do
54
- let(:input) { "hello\n world \n yes" }
55
- it { is_expected.to eq("hello\nworld\nyes") }
56
- end
57
- end
58
- end
data/spec/spec_helper.rb DELETED
@@ -1,4 +0,0 @@
1
- require "rspec"
2
- require "rspec/collection_matchers"
3
-
4
- require File.join(File.dirname(__FILE__), "..", "lib", "html2text")