html2text 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ <b>Hello &nbsnbsp; world</b>
2
+ <div class=">
3
+ Error
4
+ </div>
@@ -0,0 +1 @@
1
+ Hello &nbsnbsp; world
@@ -0,0 +1 @@
1
+ <html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=us-ascii"><meta name=Generator content="Microsoft Word 15 (filtered medium)"><style><!-- /* Font Definitions */ @font-face {font-family:"Cambria Math"; panose-1:2 4 5 3 5 4 6 3 2 4;} @font-face {font-family:Calibri; panose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal {margin:0cm; margin-bottom:.0001pt; font-size:11.0pt; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} a:link, span.MsoHyperlink {mso-style-priority:99; color:#0563C1; text-decoration:underline;} a:visited, span.MsoHyperlinkFollowed {mso-style-priority:99; color:#954F72; text-decoration:underline;} span.EmailStyle17 {mso-style-type:personal-compose; font-family:"Calibri",sans-serif; color:windowtext;} .MsoChpDefault {mso-style-type:export-only; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} @page WordSection1 {size:612.0pt 792.0pt; margin:72.0pt 72.0pt 72.0pt 72.0pt;} div.WordSection1 {page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext="edit" spidmax="1026" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext="edit"> <o:idmap v:ext="edit" data="1" /> </o:shapelayout></xml><![endif]--></head><body lang=EN-GB link="#0563C1" vlink="#954F72"><div class=WordSection1><p class=MsoNormal>Dear html2text,<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>This is an example email that can be used to test html2text conversion of outlook / exchange emails.<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>The addition of &lt;o:p&gt; tags is very annoying!<o:p></o:p></p><p class=MsoNormal>This is a 
single line return<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal><b>This is bold<o:p></o:p></b></p><p class=MsoNormal><i>This is italic<o:p></o:p></i></p><p class=MsoNormal><u>This is underline<o:p></o:p></u></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>Andrew<o:p></o:p></p></div></body></html>
@@ -0,0 +1,12 @@
1
+ Dear html2text,
2
+
3
+ This is an example email that can be used to test html2text conversion of outlook / exchange emails.
4
+
5
+ The addition of <o:p> tags is very annoying!
6
+ This is a single line return
7
+
8
+ This is bold
9
+ This is italic
10
+ This is underline
11
+
12
+ Andrew
@@ -0,0 +1,17 @@
1
+ <html>
2
+ <body>
3
+ <div>
4
+ Just two divs
5
+ </div>
6
+ <div>
7
+ Hanging out
8
+ </div>
9
+ <div><div><div>Nested divs and line breaks</div></div><br></div>
10
+ <div><div>Nested divs and line breaks</div>More text<br></div>
11
+ <div><br></div>
12
+ <div>Just text</div>
13
+ <div>Just text<br></div>
14
+ <div>Just text<br><br></div>
15
+ This is the end!
16
+ </body>
17
+ </html>
@@ -0,0 +1,12 @@
1
+ Just two divs
2
+ Hanging out
3
+ Nested divs and line breaks
4
+
5
+ Nested divs and line breaks
6
+ More text
7
+
8
+ Just text
9
+ Just text
10
+ Just text
11
+
12
+ This is the end!
@@ -0,0 +1,50 @@
1
+ <html>
2
+ <body>
3
+ <div>
4
+ Hello
5
+ <br>
6
+ </div>
7
+ <div>
8
+ How are you?
9
+ <br>
10
+ </div>
11
+
12
+ <p>
13
+ How are you?
14
+ <br>
15
+ </p>
16
+
17
+ <p>
18
+ How are you?
19
+ <br>
20
+ </p>
21
+
22
+ <div>
23
+ Just two divs
24
+ </div>
25
+ <div>
26
+ Hanging out
27
+ </div>
28
+
29
+ This is not the end!
30
+ <div>
31
+ How are you again?
32
+ <br>
33
+ </div>
34
+ This is the end!
35
+ <br>
36
+ Just kidding
37
+ <h1>Header 1</h1>
38
+ Some text
39
+ <hr>
40
+ Some more text
41
+ <p>Paragraph tag!</p>
42
+ <h2>Header 2</h2>
43
+ <hr>
44
+ <h3>Header 3</h3>
45
+ Some text
46
+ <h4>Header 4</h4>
47
+ <p>Paragraph tag!</p>
48
+ Final line
49
+ </body>
50
+ </html>
@@ -0,0 +1,35 @@
1
+ Hello
2
+ How are you?
3
+
4
+ How are you?
5
+
6
+ How are you?
7
+
8
+ Just two divs
9
+ Hanging out
10
+ This is not the end!
11
+ How are you again?
12
+ This is the end!
13
+ Just kidding
14
+
15
+ Header 1
16
+
17
+ Some text
18
+ ---------------------------------------------------------------
19
+ Some more text
20
+
21
+ Paragraph tag!
22
+
23
+ Header 2
24
+
25
+ ---------------------------------------------------------------
26
+
27
+ Header 3
28
+
29
+ Some text
30
+
31
+ Header 4
32
+
33
+ Paragraph tag!
34
+
35
+ Final line
@@ -0,0 +1,10 @@
1
+ Here is the code
2
+ <pre>
3
+ #include &lt;stdlib.h&gt;
4
+ #include &lt;stdio.h&gt;
5
+
6
+ int main(){
7
+ return 0;
8
+ };
9
+
10
+ </pre>
@@ -0,0 +1,8 @@
1
+ Here is the code
2
+
3
+ #include <stdlib.h>
4
+ #include <stdio.h>
5
+
6
+ int main(){
7
+ return 0;
8
+ };
@@ -1 +1 @@
1
- 1<br />2<br />3<br />4<br />5 6
1
+ 1<br />2<br />3<br />4<br />5 &lt; 6
@@ -1,5 +1,5 @@
1
- 1
2
- 2
3
- 3
4
- 4
5
- 5 6
1
+ 1
2
+ 2
3
+ 3
4
+ 4
5
+ 5 < 6
@@ -0,0 +1,4 @@
1
+ <ul>
2
+ <li>ÅÄÖ</li>
3
+ <li>åäö</li>
4
+ </ul>
@@ -0,0 +1,2 @@
1
+ - ÅÄÖ
2
+ - åäö
@@ -0,0 +1,4 @@
1
+ <ul>
2
+ <li>ÅÄÖ</li>
3
+ <li>åäö</li>
4
+ </ul>
@@ -0,0 +1,2 @@
1
+ - ÅÄÖ
2
+ - åäö
@@ -0,0 +1 @@
1
+ <p>foo&zwnj;bar</p>
@@ -0,0 +1 @@
1
+ foobar
@@ -17,7 +17,19 @@ describe Html2Text do
17
17
  end
18
18
 
19
19
  it "converts to text" do
20
- expect(text).to eq(expected)
20
+ # Write the output if it failed, for easier comparison
21
+ if !text.eql?(expected)
22
+ File.open(filename.sub(".html", ".output"), 'w') do |fp|
23
+ fp.write(text)
24
+ end
25
+ end
26
+
27
+ # Quick check, don't try to generate a 500kb+ diff,
28
+ # which can halt the rspec for minutes+
29
+ expect(text.length).to eq expected.length if text.length > 10000
30
+
31
+ # More complete check
32
+ expect(text).to eq expected
21
33
  end
22
34
  end
23
35
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2text
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jevon Wright
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-09-28 00:00:00.000000000 Z
11
+ date: 2019-02-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.6'
19
+ version: 1.8.5
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.6'
26
+ version: 1.8.5
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rspec
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -82,11 +82,12 @@ dependencies:
82
82
  version: '0'
83
83
  description: A Ruby component to convert HTML into a plain text format.
84
84
  email:
85
- - jevon@powershop.co.nz
85
+ - jevon@jevon.org
86
86
  executables: []
87
87
  extensions: []
88
88
  extra_rdoc_files: []
89
89
  files:
90
+ - CHANGELOG.md
90
91
  - LICENSE.md
91
92
  - README.md
92
93
  - lib/html2text.rb
@@ -95,24 +96,46 @@ files:
95
96
  - spec/examples/anchors.txt
96
97
  - spec/examples/basic.html
97
98
  - spec/examples/basic.txt
99
+ - spec/examples/dom-processing.html
100
+ - spec/examples/dom-processing.txt
101
+ - spec/examples/empty.html
102
+ - spec/examples/empty.txt
98
103
  - spec/examples/full_email.html
99
104
  - spec/examples/full_email.txt
105
+ - spec/examples/huge-msoffice.html
106
+ - spec/examples/huge-msoffice.txt
100
107
  - spec/examples/images.html
101
108
  - spec/examples/images.txt
109
+ - spec/examples/invalid.html
110
+ - spec/examples/invalid.txt
102
111
  - spec/examples/lists.html
103
112
  - spec/examples/lists.txt
104
113
  - spec/examples/more-anchors.html
105
114
  - spec/examples/more-anchors.txt
115
+ - spec/examples/msoffice.html
116
+ - spec/examples/msoffice.txt
106
117
  - spec/examples/nbsp.html
107
118
  - spec/examples/nbsp.txt
119
+ - spec/examples/nested-divs.html
120
+ - spec/examples/nested-divs.txt
121
+ - spec/examples/newlines.html
122
+ - spec/examples/newlines.txt
108
123
  - spec/examples/non-breaking-spaces.html
109
124
  - spec/examples/non-breaking-spaces.txt
125
+ - spec/examples/pre.html
126
+ - spec/examples/pre.txt
110
127
  - spec/examples/table.html
111
128
  - spec/examples/table.txt
112
129
  - spec/examples/test3.html
113
130
  - spec/examples/test3.txt
114
131
  - spec/examples/test4.html
115
132
  - spec/examples/test4.txt
133
+ - spec/examples/utf8-example.html
134
+ - spec/examples/utf8-example.txt
135
+ - spec/examples/windows-1252-example.html
136
+ - spec/examples/windows-1252-example.txt
137
+ - spec/examples/zero-width-non-joiners.html
138
+ - spec/examples/zero-width-non-joiners.txt
116
139
  - spec/examples_spec.rb
117
140
  - spec/html2text_spec.rb
118
141
  - spec/spec_helper.rb
@@ -135,34 +158,55 @@ required_rubygems_version: !ruby/object:Gem::Requirement
135
158
  - !ruby/object:Gem::Version
136
159
  version: '0'
137
160
  requirements: []
138
- rubyforge_project:
139
- rubygems_version: 2.6.13
161
+ rubygems_version: 3.0.2
140
162
  signing_key:
141
163
  specification_version: 4
142
164
  summary: Convert HTML into plain text.
143
165
  test_files:
144
- - spec/examples/anchors.html
145
- - spec/examples/anchors.txt
166
+ - spec/examples/nested-divs.html
167
+ - spec/examples/pre.html
168
+ - spec/examples/invalid.html
169
+ - spec/examples/empty.html
170
+ - spec/examples/table.html
146
171
  - spec/examples/basic.html
147
- - spec/examples/basic.txt
148
- - spec/examples/full_email.html
149
- - spec/examples/full_email.txt
150
- - spec/examples/images.html
151
- - spec/examples/images.txt
152
- - spec/examples/lists.html
153
- - spec/examples/lists.txt
154
- - spec/examples/more-anchors.html
155
- - spec/examples/more-anchors.txt
156
172
  - spec/examples/nbsp.html
173
+ - spec/examples/utf8-example.html
174
+ - spec/examples/newlines.txt
175
+ - spec/examples/full_email.txt
176
+ - spec/examples/msoffice.html
177
+ - spec/examples/zero-width-non-joiners.txt
178
+ - spec/examples/anchors.html
157
179
  - spec/examples/nbsp.txt
180
+ - spec/examples/zero-width-non-joiners.html
181
+ - spec/examples/test3.html
182
+ - spec/examples/test4.txt
183
+ - spec/examples/huge-msoffice.txt
184
+ - spec/examples/full_email.html
185
+ - spec/examples/utf8-example.txt
186
+ - spec/examples/table.txt
187
+ - spec/examples/huge-msoffice.html
188
+ - spec/examples/more-anchors.txt
189
+ - spec/examples/newlines.html
190
+ - spec/examples/test4.html
191
+ - spec/examples/basic.txt
192
+ - spec/examples/lists.html
193
+ - spec/examples/nested-divs.txt
158
194
  - spec/examples/non-breaking-spaces.html
195
+ - spec/examples/invalid.txt
196
+ - spec/examples/empty.txt
197
+ - spec/examples/images.txt
159
198
  - spec/examples/non-breaking-spaces.txt
160
- - spec/examples/table.html
161
- - spec/examples/table.txt
162
- - spec/examples/test3.html
199
+ - spec/examples/dom-processing.txt
163
200
  - spec/examples/test3.txt
164
- - spec/examples/test4.html
165
- - spec/examples/test4.txt
201
+ - spec/examples/dom-processing.html
202
+ - spec/examples/lists.txt
203
+ - spec/examples/pre.txt
204
+ - spec/examples/anchors.txt
205
+ - spec/examples/more-anchors.html
206
+ - spec/examples/windows-1252-example.txt
207
+ - spec/examples/images.html
208
+ - spec/examples/msoffice.txt
209
+ - spec/examples/windows-1252-example.html
166
210
  - spec/examples_spec.rb
167
211
  - spec/html2text_spec.rb
168
212
  - spec/spec_helper.rb