html2text 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
+ <b>Hello &nbsnbsp; world</b>
2
+ <div class=">
3
+ Error
4
+ </div>
@@ -0,0 +1 @@
1
+ Hello &nbsnbsp; world
@@ -0,0 +1 @@
1
+ <html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=us-ascii"><meta name=Generator content="Microsoft Word 15 (filtered medium)"><style><!-- /* Font Definitions */ @font-face {font-family:"Cambria Math"; panose-1:2 4 5 3 5 4 6 3 2 4;} @font-face {font-family:Calibri; panose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal {margin:0cm; margin-bottom:.0001pt; font-size:11.0pt; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} a:link, span.MsoHyperlink {mso-style-priority:99; color:#0563C1; text-decoration:underline;} a:visited, span.MsoHyperlinkFollowed {mso-style-priority:99; color:#954F72; text-decoration:underline;} span.EmailStyle17 {mso-style-type:personal-compose; font-family:"Calibri",sans-serif; color:windowtext;} .MsoChpDefault {mso-style-type:export-only; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} @page WordSection1 {size:612.0pt 792.0pt; margin:72.0pt 72.0pt 72.0pt 72.0pt;} div.WordSection1 {page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext="edit" spidmax="1026" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext="edit"> <o:idmap v:ext="edit" data="1" /> </o:shapelayout></xml><![endif]--></head><body lang=EN-GB link="#0563C1" vlink="#954F72"><div class=WordSection1><p class=MsoNormal>Dear html2text,<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>This is an example email that can be used to test html2text conversion of outlook / exchange emails.<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>The addition of &lt;o:p&gt; tags is very annoying!<o:p></o:p></p><p class=MsoNormal>This is a single line return<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal><b>This is bold<o:p></o:p></b></p><p class=MsoNormal><i>This is italic<o:p></o:p></i></p><p class=MsoNormal><u>This is underline<o:p></o:p></u></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>Andrew<o:p></o:p></p></div></body></html>
@@ -0,0 +1,12 @@
1
+ Dear html2text,
2
+
3
+ This is an example email that can be used to test html2text conversion of outlook / exchange emails.
4
+
5
+ The addition of <o:p> tags is very annoying!
6
+ This is a single line return
7
+
8
+ This is bold
9
+ This is italic
10
+ This is underline
11
+
12
+ Andrew
@@ -0,0 +1,17 @@
1
+ <html>
2
+ <body>
3
+ <div>
4
+ Just two divs
5
+ </div>
6
+ <div>
7
+ Hanging out
8
+ </div>
9
+ <div><div><div>Nested divs and line breaks</div></div><br></div>
10
+ <div><div>Nested divs and line breaks</div>More text<br></div>
11
+ <div><br></div>
12
+ <div>Just text</div>
13
+ <div>Just text<br></div>
14
+ <div>Just text<br><br></div>
15
+ This is the end!
16
+ </body>
17
+ </html>
@@ -0,0 +1,12 @@
1
+ Just two divs
2
+ Hanging out
3
+ Nested divs and line breaks
4
+
5
+ Nested divs and line breaks
6
+ More text
7
+
8
+ Just text
9
+ Just text
10
+ Just text
11
+
12
+ This is the end!
@@ -0,0 +1,50 @@
1
+ <html>
2
+ <body>
3
+ <div>
4
+ Hello
5
+ <br>
6
+ </div>
7
+ <div>
8
+ How are you?
9
+ <br>
10
+ </div>
11
+
12
+ <p>
13
+ How are you?
14
+ <br>
15
+ </p>
16
+
17
+ <p>
18
+ How are you?
19
+ <br>
20
+ </p>
21
+
22
+ <div>
23
+ Just two divs
24
+ </div>
25
+ <div>
26
+ Hanging out
27
+ </div>
28
+
29
+ This is not the end!
30
+ <div>
31
+ How are you again?
32
+ <br>
33
+ </div>
34
+ This is the end!
35
+ <br>
36
+ Just kidding
37
+ <h1>Header 1</h1>
38
+ Some text
39
+ <hr>
40
+ Some more text
41
+ <p>Paragraph tag!</p>
42
+ <h2>Header 2</h2>
43
+ <hr>
44
+ <h3>Header 3</h3>
45
+ Some text
46
+ <h4>Header 4</h4>
47
+ <p>Paragraph tag!</p>
48
+ Final line
49
+ </body>
50
+ </html>
@@ -0,0 +1,35 @@
1
+ Hello
2
+ How are you?
3
+
4
+ How are you?
5
+
6
+ How are you?
7
+
8
+ Just two divs
9
+ Hanging out
10
+ This is not the end!
11
+ How are you again?
12
+ This is the end!
13
+ Just kidding
14
+
15
+ Header 1
16
+
17
+ Some text
18
+ ---------------------------------------------------------------
19
+ Some more text
20
+
21
+ Paragraph tag!
22
+
23
+ Header 2
24
+
25
+ ---------------------------------------------------------------
26
+
27
+ Header 3
28
+
29
+ Some text
30
+
31
+ Header 4
32
+
33
+ Paragraph tag!
34
+
35
+ Final line
@@ -0,0 +1,10 @@
1
+ Here is the code
2
+ <pre>
3
+ #include &lt;stdlib.h&gt;
4
+ #include &lt;stdio.h&gt;
5
+
6
+ int main(){
7
+ return 0;
8
+ };
9
+
10
+ </pre>
@@ -0,0 +1,8 @@
1
+ Here is the code
2
+
3
+ #include <stdlib.h>
4
+ #include <stdio.h>
5
+
6
+ int main(){
7
+ return 0;
8
+ };
@@ -1 +1 @@
1
- 1<br />2<br />3<br />4<br />5 6
1
+ 1<br />2<br />3<br />4<br />5 &lt; 6
@@ -1,5 +1,5 @@
1
- 1
2
- 2
3
- 3
4
- 4
5
- 5 6
1
+ 1
2
+ 2
3
+ 3
4
+ 4
5
+ 5 < 6
@@ -0,0 +1,4 @@
1
+ <ul>
2
+ <li>ÅÄÖ</li>
3
+ <li>åäö</li>
4
+ </ul>
@@ -0,0 +1,2 @@
1
+ - ÅÄÖ
2
+ - åäö
@@ -0,0 +1,4 @@
1
+ <ul>
2
+ <li>���</li>
3
+ <li>���</li>
4
+ </ul>
@@ -0,0 +1,2 @@
1
+ - ÅÄÖ
2
+ - åäö
@@ -0,0 +1 @@
1
+ <p>foo&zwnj;bar</p>
@@ -0,0 +1 @@
1
+ foobar
@@ -17,7 +17,19 @@ describe Html2Text do
17
17
  end
18
18
 
19
19
  it "converts to text" do
20
- expect(text).to eq(expected)
20
+ # Write the output if it failed, for easier comparison
21
+ if !text.eql?(expected)
22
+ File.open(filename.sub(".html", ".output"), 'w') do |fp|
23
+ fp.write(text)
24
+ end
25
+ end
26
+
27
+ # Quick check, don't try to generate a 500kb+ diff,
28
+ # which can halt the rspec for minutes+
29
+ expect(text.length).to eq expected.length if text.length > 10000
30
+
31
+ # More complete check
32
+ expect(text).to eq expected
21
33
  end
22
34
  end
23
35
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2text
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jevon Wright
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-09-28 00:00:00.000000000 Z
11
+ date: 2019-02-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.6'
19
+ version: 1.8.5
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.6'
26
+ version: 1.8.5
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rspec
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -82,11 +82,12 @@ dependencies:
82
82
  version: '0'
83
83
  description: A Ruby component to convert HTML into a plain text format.
84
84
  email:
85
- - jevon@powershop.co.nz
85
+ - jevon@jevon.org
86
86
  executables: []
87
87
  extensions: []
88
88
  extra_rdoc_files: []
89
89
  files:
90
+ - CHANGELOG.md
90
91
  - LICENSE.md
91
92
  - README.md
92
93
  - lib/html2text.rb
@@ -95,24 +96,46 @@ files:
95
96
  - spec/examples/anchors.txt
96
97
  - spec/examples/basic.html
97
98
  - spec/examples/basic.txt
99
+ - spec/examples/dom-processing.html
100
+ - spec/examples/dom-processing.txt
101
+ - spec/examples/empty.html
102
+ - spec/examples/empty.txt
98
103
  - spec/examples/full_email.html
99
104
  - spec/examples/full_email.txt
105
+ - spec/examples/huge-msoffice.html
106
+ - spec/examples/huge-msoffice.txt
100
107
  - spec/examples/images.html
101
108
  - spec/examples/images.txt
109
+ - spec/examples/invalid.html
110
+ - spec/examples/invalid.txt
102
111
  - spec/examples/lists.html
103
112
  - spec/examples/lists.txt
104
113
  - spec/examples/more-anchors.html
105
114
  - spec/examples/more-anchors.txt
115
+ - spec/examples/msoffice.html
116
+ - spec/examples/msoffice.txt
106
117
  - spec/examples/nbsp.html
107
118
  - spec/examples/nbsp.txt
119
+ - spec/examples/nested-divs.html
120
+ - spec/examples/nested-divs.txt
121
+ - spec/examples/newlines.html
122
+ - spec/examples/newlines.txt
108
123
  - spec/examples/non-breaking-spaces.html
109
124
  - spec/examples/non-breaking-spaces.txt
125
+ - spec/examples/pre.html
126
+ - spec/examples/pre.txt
110
127
  - spec/examples/table.html
111
128
  - spec/examples/table.txt
112
129
  - spec/examples/test3.html
113
130
  - spec/examples/test3.txt
114
131
  - spec/examples/test4.html
115
132
  - spec/examples/test4.txt
133
+ - spec/examples/utf8-example.html
134
+ - spec/examples/utf8-example.txt
135
+ - spec/examples/windows-1252-example.html
136
+ - spec/examples/windows-1252-example.txt
137
+ - spec/examples/zero-width-non-joiners.html
138
+ - spec/examples/zero-width-non-joiners.txt
116
139
  - spec/examples_spec.rb
117
140
  - spec/html2text_spec.rb
118
141
  - spec/spec_helper.rb
@@ -135,34 +158,55 @@ required_rubygems_version: !ruby/object:Gem::Requirement
135
158
  - !ruby/object:Gem::Version
136
159
  version: '0'
137
160
  requirements: []
138
- rubyforge_project:
139
- rubygems_version: 2.6.13
161
+ rubygems_version: 3.0.2
140
162
  signing_key:
141
163
  specification_version: 4
142
164
  summary: Convert HTML into plain text.
143
165
  test_files:
144
- - spec/examples/anchors.html
145
- - spec/examples/anchors.txt
166
+ - spec/examples/nested-divs.html
167
+ - spec/examples/pre.html
168
+ - spec/examples/invalid.html
169
+ - spec/examples/empty.html
170
+ - spec/examples/table.html
146
171
  - spec/examples/basic.html
147
- - spec/examples/basic.txt
148
- - spec/examples/full_email.html
149
- - spec/examples/full_email.txt
150
- - spec/examples/images.html
151
- - spec/examples/images.txt
152
- - spec/examples/lists.html
153
- - spec/examples/lists.txt
154
- - spec/examples/more-anchors.html
155
- - spec/examples/more-anchors.txt
156
172
  - spec/examples/nbsp.html
173
+ - spec/examples/utf8-example.html
174
+ - spec/examples/newlines.txt
175
+ - spec/examples/full_email.txt
176
+ - spec/examples/msoffice.html
177
+ - spec/examples/zero-width-non-joiners.txt
178
+ - spec/examples/anchors.html
157
179
  - spec/examples/nbsp.txt
180
+ - spec/examples/zero-width-non-joiners.html
181
+ - spec/examples/test3.html
182
+ - spec/examples/test4.txt
183
+ - spec/examples/huge-msoffice.txt
184
+ - spec/examples/full_email.html
185
+ - spec/examples/utf8-example.txt
186
+ - spec/examples/table.txt
187
+ - spec/examples/huge-msoffice.html
188
+ - spec/examples/more-anchors.txt
189
+ - spec/examples/newlines.html
190
+ - spec/examples/test4.html
191
+ - spec/examples/basic.txt
192
+ - spec/examples/lists.html
193
+ - spec/examples/nested-divs.txt
158
194
  - spec/examples/non-breaking-spaces.html
195
+ - spec/examples/invalid.txt
196
+ - spec/examples/empty.txt
197
+ - spec/examples/images.txt
159
198
  - spec/examples/non-breaking-spaces.txt
160
- - spec/examples/table.html
161
- - spec/examples/table.txt
162
- - spec/examples/test3.html
199
+ - spec/examples/dom-processing.txt
163
200
  - spec/examples/test3.txt
164
- - spec/examples/test4.html
165
- - spec/examples/test4.txt
201
+ - spec/examples/dom-processing.html
202
+ - spec/examples/lists.txt
203
+ - spec/examples/pre.txt
204
+ - spec/examples/anchors.txt
205
+ - spec/examples/more-anchors.html
206
+ - spec/examples/windows-1252-example.txt
207
+ - spec/examples/images.html
208
+ - spec/examples/msoffice.txt
209
+ - spec/examples/windows-1252-example.html
166
210
  - spec/examples_spec.rb
167
211
  - spec/html2text_spec.rb
168
212
  - spec/spec_helper.rb