html2text 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7c84c460e75e64099fa12a010871f9859ab48b9f
4
+ data.tar.gz: ea56a52568f22804cdcbc44b5f35e6b99164ea6c
5
+ SHA512:
6
+ metadata.gz: a3833c4546b86912872d777fc57be15cc0fac89e273e5ad65b6714a0b723f4815a81a3865e9ee0b05746ef7dee356baf5824ace242ab914d26eb79bf3aa6bf65
7
+ data.tar.gz: 737d869f81c782f93d520e935bb5b26a0a88798f940b60856519a084eabd1dfca84171d673f3abd5e73ecf0f84917909573cd6d92a67510fcdfcc075c4a676ed
@@ -0,0 +1,20 @@
1
+ Copyright 2015 Jevon Wright
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,70 @@
1
+ html2text [![Build Status](https://travis-ci.org/soundasleep/html2text_ruby.svg?branch=master)](https://travis-ci.org/soundasleep/html2text_ruby)
2
+ ==============
3
+
4
+ `html2text` is a very simple script that uses Nokogiri to parse HTML from a string, and then iterates over the resulting DOM to correctly output plain text. For example:
5
+
6
+ ```html
7
+ <html>
8
+ <title>Ignored Title</title>
9
+ <body>
10
+ <h1>Hello, World!</h1>
11
+
12
+ <p>This is some e-mail content.
13
+ Even though it has whitespace and newlines, the e-mail converter
14
+ will handle it correctly.
15
+
16
+ <p>Even mismatched tags.</p>
17
+
18
+ <div>A div</div>
19
+ <div>Another div</div>
20
+ <div>A div<div>within a div</div></div>
21
+
22
+ <a href="http://foo.com">A link</a>
23
+
24
+ </body>
25
+ </html>
26
+ ```
27
+
28
+ Will be converted into:
29
+
30
+ ```text
31
+ Hello, World!
32
+
33
+ This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.
34
+
35
+ Even mismatched tags.
36
+ A div
37
+ Another div
38
+ A div
39
+ within a div
40
+ [A link](http://foo.com)
41
+ ```
42
+
43
+ See the [original blog post](http://journals.jevon.org/users/jevon-phd/entry/19818) or the related [StackOverflow answer](http://stackoverflow.com/a/2564472/39531).
44
+
45
+ ## Installing
46
+
47
+ Install the gem (`gem install html2text`), then you can:
48
+
49
+ ```ruby
50
+ require 'html2text'
51
+
52
+ text = Html2Text.convert(html)
53
+ ```
54
+
55
+ ## Tests
56
+
57
+ See all of the test cases defined in [spec/examples/](spec/examples/). These can be run with:
58
+
59
+ ```
60
+ bundle install
61
+ rspec
62
+ ```
63
+
64
+ ## License
65
+
66
+ `html2text` is licensed under MIT.
67
+
68
+ ## Other versions
69
+
70
+ Also see [html2text](https://github.com/soundasleep/html2text), the original PHP implementation.
@@ -0,0 +1,138 @@
1
+ require 'nokogiri'
2
+
3
# Converts an HTML document into plain text by walking its parsed node tree.
#
# Usage: Html2Text.convert("<p>Hello</p>") # => "Hello"
class Html2Text
  # Elements whose whole subtree is omitted from the text output.
  IGNORED_TAGS = %w[style head title meta script].freeze

  # Heading element names; they get a blank line after them, and a link
  # directly followed by one gets a trailing newline.
  HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze

  # The parsed document (or any node) that #convert walks.
  attr_reader :doc

  # doc - a parsed node tree. The walker only relies on the node responding
  #       to name, text?, text, element?, children, next_sibling and attribute.
  def initialize(doc)
    @doc = doc
  end

  # Parses the given HTML string with Nokogiri and returns its plain-text form.
  #
  # nil and empty input yield "" without invoking the parser; previously nil
  # raised NoMethodError from the first gsub.
  def self.convert(html)
    return "" if html.nil? || html.empty?

    normalized = fix_newlines(replace_entities(html))
    new(Nokogiri::HTML(normalized)).convert
  end

  # Normalizes "\r\n" and bare "\r" line endings to "\n".
  def self.fix_newlines(text)
    text.gsub(/\r\n?/, "\n")
  end

  # Replaces the literal (lowercase) "&nbsp;" entity with a plain space before
  # parsing. Other entities are left for the parser.
  def self.replace_entities(text)
    text.gsub("&nbsp;", " ")
  end

  # Returns the text rendering of #doc with per-line and overall surrounding
  # whitespace removed.
  def convert
    remove_leading_and_trailing_whitespace(iterate_over(doc)).strip
  end

  # Strips spaces and tabs that touch either side of a newline.
  def remove_leading_and_trailing_whitespace(text)
    text.gsub(/[ \t]*\n[ \t]*/, "\n")
  end

  # Collapses each run of tab, newline, form feed, carriage return or space
  # into a single space.
  def trimmed_whitespace(text)
    text.gsub(/[\t\n\f\r ]+/, " ")
  end

  # Returns the downcased name of the next sibling that is an element,
  # skipping non-element siblings, or nil when there is none.
  def next_node_name(node)
    sibling = node.next_sibling
    sibling = sibling.next_sibling until sibling.nil? || sibling.element?

    sibling.name.downcase if sibling
  end

  # Recursively renders node and its children to a string.
  def iterate_over(node)
    return trimmed_whitespace(node.text) if node.text?

    tag = node.name.downcase
    return "" if IGNORED_TAGS.include?(tag)

    parts = [prefix_whitespace(node)]
    parts.concat(node.children.map { |child| iterate_over(child) })
    parts << suffix_whitespace(node)

    # prefix/suffix may be nil when the element needs no extra whitespace
    output = parts.compact.join

    tag == "a" ? wrap_link(node, output) : output
  end

  # Text emitted before an element's content, or nil for none.
  def prefix_whitespace(node)
    case node.name.downcase
    when "hr"
      "------\n"
    when *HEADING_TAGS, "ol", "ul", "tr", "p", "div"
      "\n"
    when "td", "th"
      "\t"
    when "li"
      "- "
    end
  end

  # Text emitted after an element's content, or nil for none.
  def suffix_whitespace(node)
    case node.name.downcase
    when *HEADING_TAGS
      # add another line
      "\n"
    when "p", "br"
      "\n" if next_node_name(node) != "div"
    when "li"
      "\n"
    when "div"
      # add one line only if there is a following element and it isn't a div
      following = next_node_name(node)
      "\n" unless following.nil? || following == "div"
    end
  end

  # Links are returned in [text](link) format. A link whose text already
  # equals its target (optionally minus a mailto:/http:///https:// prefix) is
  # left as plain text; a named anchor without href becomes [text].
  def wrap_link(node, output)
    href = node.attribute("href")
    name = node.attribute("name")

    if href.nil?
      output = "[#{output}]" unless name.nil?
    else
      href = href.to_s
      redundant_targets = [output, "mailto:#{output}", "http://#{output}", "https://#{output}"]
      output = "[#{output}](#{href})" unless redundant_targets.include?(href)
    end

    # separate the link from a heading that immediately follows it
    output += "\n" if HEADING_TAGS.include?(next_node_name(node))

    output
  end
end
@@ -0,0 +1,3 @@
1
class Html2Text
  # Released gem version. Frozen so the shared constant string cannot be
  # mutated in place by callers.
  VERSION = "0.1.1".freeze
end
@@ -0,0 +1,12 @@
1
+ A document without any HTML open/closing tags.
2
+
3
+ <hr>
4
+
5
+ We try and use the representation given by common browsers of the
6
+ HTML document, so that it looks similar when converted to plain text.
7
+
8
+ <a href="http://foo.com">visit foo.com</a> - or <a href="http://www.foo.com">http://www.foo.com</a>
9
+
10
+ <a href="http://foo.com" title="a link with a title">link</a>
11
+
12
+ <h2><a name="anchor">An anchor which will not appear</a></h2>
@@ -0,0 +1,5 @@
1
+ A document without any HTML open/closing tags.
2
+ ------
3
+ We try and use the representation given by common browsers of the HTML document, so that it looks similar when converted to plain text. [visit foo.com](http://foo.com) - or http://www.foo.com [link](http://foo.com)
4
+
5
+ [An anchor which will not appear]
@@ -0,0 +1,21 @@
1
+ <html>
2
+ <title>Ignored Title</title>
3
+ <body>
4
+ <h1>Hello, World!</h1>
5
+
6
+ <p>This is some e-mail content.
7
+ Even though it has whitespace and newlines, the e-mail converter
8
+ will handle it correctly.
9
+
10
+ <p>Even mismatched tags.</p>
11
+
12
+ <div>A div</div>
13
+ <div>Another div</div>
14
+ <div>A div<div>within a div</div></div>
15
+
16
+ <p>Another line<br />Yet another line</p>
17
+
18
+ <a href="http://foo.com">A link</a>
19
+
20
+ </body>
21
+ </html>
@@ -0,0 +1,13 @@
1
+ Hello, World!
2
+
3
+ This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.
4
+
5
+ Even mismatched tags.
6
+ A div
7
+ Another div
8
+ A div
9
+ within a div
10
+
11
+ Another line
12
+ Yet another line
13
+ [A link](http://foo.com)
@@ -0,0 +1,24 @@
1
+ <h1>List tests</h1>
2
+
3
+ <p>
4
+ Add some lists.
5
+ </p>
6
+
7
+ <ol>
8
+ <li>one</li>
9
+ <li>two
10
+ <li>three
11
+ </ol>
12
+
13
+ <h2>An unordered list</h2>
14
+
15
+ <ul>
16
+ <li>one
17
+ <li>two</li>
18
+ <li>three</li>
19
+ </ul>
20
+ <ul>
21
+ <li>one
22
+ <li>two</li>
23
+ <li>three</li>
24
+ </ul>
@@ -0,0 +1,17 @@
1
+ List tests
2
+
3
+ Add some lists.
4
+
5
+ - one
6
+ - two
7
+ - three
8
+
9
+ An unordered list
10
+
11
+ - one
12
+ - two
13
+ - three
14
+
15
+ - one
16
+ - two
17
+ - three
@@ -0,0 +1,14 @@
1
+ <h1>Anchor tests</h1>
2
+
3
+ <p>
4
+ Visit http://openiaml.org or <a href="http://openiaml.org">openiaml.org</a> or <a href="http://openiaml.org">http://openiaml.org</a>.
5
+ </p>
6
+
7
+ <p>
8
+ To visit with SSL, visit https://openiaml.org or <a href="https://openiaml.org">openiaml.org</a> or <a href="https://openiaml.org">https://openiaml.org</a>.
9
+ </p>
10
+
11
+ <p>
12
+ To mail, email support@openiaml.org or mailto:support@openiaml.org
13
+ or <a href="mailto:support@openiaml.org">support@openiaml.org</a> or <a href="mailto:support@openiaml.org">mailto:support@openiaml.org</a>.
14
+ </p>
@@ -0,0 +1,7 @@
1
+ Anchor tests
2
+
3
+ Visit http://openiaml.org or openiaml.org or http://openiaml.org.
4
+
5
+ To visit with SSL, visit https://openiaml.org or openiaml.org or https://openiaml.org.
6
+
7
+ To mail, email support@openiaml.org or mailto:support@openiaml.org or support@openiaml.org or mailto:support@openiaml.org.
@@ -0,0 +1 @@
1
+ hello &nbsp; world &amp; people &lt; &gt; &NBSP;
@@ -0,0 +1 @@
1
+ hello world & people < > &NBSP;
@@ -0,0 +1,53 @@
1
+ <html>
2
+ <title>Ignored Title</title>
3
+ <body>
4
+ <h1>Hello, World!</h1>
5
+ <table>
6
+ <thead>
7
+ <tr>
8
+ <th>Col A</th>
9
+ <th>Col B</th>
10
+ </tr>
11
+ </thead>
12
+ <tbody>
13
+ <tr>
14
+ <td>
15
+ Data A1
16
+ </td>
17
+ <td>
18
+ Data B1
19
+ </td>
20
+ </tr>
21
+ <tr>
22
+ <td>
23
+ Data A2
24
+ </td>
25
+ <td>
26
+ Data B2
27
+ </td>
28
+ </tr>
29
+ <tr>
30
+ <td>
31
+ Data A3
32
+ </td>
33
+ <td>
34
+ Data B4
35
+ </td>
36
+ </tr>
37
+ </tbody>
38
+ <tfoot>
39
+ <tr>
40
+ <td>
41
+ Total A
42
+ </td>
43
+ <td>
44
+ Total B
45
+ </td>
46
+ </tr>
47
+
48
+ </tfoot>
49
+
50
+ </table>
51
+
52
+ </body>
53
+ </html>
@@ -0,0 +1,7 @@
1
+ Hello, World!
2
+
3
+ Col A Col B
4
+ Data A1 Data B1
5
+ Data A2 Data B2
6
+ Data A3 Data B4
7
+ Total A Total B
@@ -0,0 +1 @@
1
+ test one<br />test two
@@ -0,0 +1,2 @@
1
+ test one
2
+ test two
@@ -0,0 +1 @@
1
+ 1<br />2<br />3<br />4<br />5 6
@@ -0,0 +1,5 @@
1
+ 1
2
+ 2
3
+ 3
4
+ 4
5
+ 5 6
@@ -0,0 +1,25 @@
1
+ require "spec_helper"
2
+
3
# Fixture-driven specs: every spec/examples/NAME.html is converted and
# compared against the expected text stored in spec/examples/NAME.txt.
describe Html2Text do
  describe ".convert" do
    let(:text) { Html2Text.convert(html) }

    # Sorted so examples run in the same order on every filesystem.
    examples = Dir[File.join(File.dirname(__FILE__), "examples", "*.html")].sort

    examples.each do |filename|
      context filename do
        let(:html) { File.read(filename) }
        # Swap only the trailing extension; a plain sub(".html", ...) would
        # rewrite the first ".html" found anywhere in the path.
        let(:text_file) { filename.sub(/\.html\z/, ".txt") }
        let(:expected) { Html2Text.fix_newlines(File.read(text_file)) }

        it "converts to text" do
          expect(text).to eq(expected)
        end
      end
    end

    # Guards against the glob silently matching nothing.
    it "has examples to test" do
      expect(examples.size).to_not eq(0)
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require "spec_helper"
2
+
3
# Unit specs for Html2Text that do not rely on the fixture files.
describe Html2Text do
  describe ".convert" do
    let(:text) { Html2Text.convert(html) }

    context "an empty line" do
      let(:html) { "" }

      it "is an empty line" do
        expect(text).to eq("")
      end
    end

    context "a simple string" do
      let(:html) { "hello world" }

      it "is returned unchanged" do
        expect(text).to eq("hello world")
      end
    end
  end

  describe "#remove_leading_and_trailing_whitespace" do
    # The method does not read the document, so a nil doc is sufficient here.
    subject { Html2Text.new(nil).remove_leading_and_trailing_whitespace(input) }

    context "an empty string" do
      let(:input) { "" }
      it { is_expected.to eq("") }
    end

    context "many new lines" do
      let(:input) { "hello\n world \n yes" }
      it { is_expected.to eq("hello\nworld\nyes") }
    end
  end
end
@@ -0,0 +1,4 @@
1
+ require "rspec"
2
+ require "rspec/collection_matchers"
3
+
4
+ require File.join(File.dirname(__FILE__), "..", "lib", "html2text")
metadata ADDED
@@ -0,0 +1,156 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html2text
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Jevon Wright
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec-collection_matchers
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: colorize
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rake
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: A Ruby component to convert HTML into a plain text format.
84
+ email:
85
+ - jevon@powershop.co.nz
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - MIT-LICENSE
91
+ - README.md
92
+ - lib/html2text.rb
93
+ - lib/html2text/version.rb
94
+ - spec/examples/anchors.html
95
+ - spec/examples/anchors.txt
96
+ - spec/examples/basic.html
97
+ - spec/examples/basic.txt
98
+ - spec/examples/lists.html
99
+ - spec/examples/lists.txt
100
+ - spec/examples/more-anchors.html
101
+ - spec/examples/more-anchors.txt
102
+ - spec/examples/nbsp.html
103
+ - spec/examples/nbsp.txt
104
+ - spec/examples/table.html
105
+ - spec/examples/table.txt
106
+ - spec/examples/test3.html
107
+ - spec/examples/test3.txt
108
+ - spec/examples/test4.html
109
+ - spec/examples/test4.txt
110
+ - spec/examples_spec.rb
111
+ - spec/html2text_spec.rb
112
+ - spec/spec_helper.rb
113
+ homepage: https://github.com/soundasleep/html2text_ruby
114
+ licenses:
115
+ - MIT
116
+ metadata: {}
117
+ post_install_message:
118
+ rdoc_options: []
119
+ require_paths:
120
+ - lib
121
+ required_ruby_version: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - ">="
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ required_rubygems_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ requirements: []
132
+ rubyforge_project:
133
+ rubygems_version: 2.4.5
134
+ signing_key:
135
+ specification_version: 4
136
+ summary: Convert HTML into plain text.
137
+ test_files:
138
+ - spec/examples/anchors.html
139
+ - spec/examples/anchors.txt
140
+ - spec/examples/basic.html
141
+ - spec/examples/basic.txt
142
+ - spec/examples/lists.html
143
+ - spec/examples/lists.txt
144
+ - spec/examples/more-anchors.html
145
+ - spec/examples/more-anchors.txt
146
+ - spec/examples/nbsp.html
147
+ - spec/examples/nbsp.txt
148
+ - spec/examples/table.html
149
+ - spec/examples/table.txt
150
+ - spec/examples/test3.html
151
+ - spec/examples/test3.txt
152
+ - spec/examples/test4.html
153
+ - spec/examples/test4.txt
154
+ - spec/examples_spec.rb
155
+ - spec/html2text_spec.rb
156
+ - spec/spec_helper.rb