html2text 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7c84c460e75e64099fa12a010871f9859ab48b9f
4
+ data.tar.gz: ea56a52568f22804cdcbc44b5f35e6b99164ea6c
5
+ SHA512:
6
+ metadata.gz: a3833c4546b86912872d777fc57be15cc0fac89e273e5ad65b6714a0b723f4815a81a3865e9ee0b05746ef7dee356baf5824ace242ab914d26eb79bf3aa6bf65
7
+ data.tar.gz: 737d869f81c782f93d520e935bb5b26a0a88798f940b60856519a084eabd1dfca84171d673f3abd5e73ecf0f84917909573cd6d92a67510fcdfcc075c4a676ed
@@ -0,0 +1,20 @@
1
+ Copyright 2015 Jevon Wright
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,70 @@
1
+ html2text [![Build Status](https://travis-ci.org/soundasleep/html2text_ruby.svg?branch=master)](https://travis-ci.org/soundasleep/html2text_ruby)
2
+ ==============
3
+
4
+ `html2text` is a very simple script that uses Nokogiri's DOM methods to load HTML from a string, and then iterates over the resulting DOM to correctly output plain text. For example:
5
+
6
+ ```html
7
+ <html>
8
+ <title>Ignored Title</title>
9
+ <body>
10
+ <h1>Hello, World!</h1>
11
+
12
+ <p>This is some e-mail content.
13
+ Even though it has whitespace and newlines, the e-mail converter
14
+ will handle it correctly.
15
+
16
+ <p>Even mismatched tags.</p>
17
+
18
+ <div>A div</div>
19
+ <div>Another div</div>
20
+ <div>A div<div>within a div</div></div>
21
+
22
+ <a href="http://foo.com">A link</a>
23
+
24
+ </body>
25
+ </html>
26
+ ```
27
+
28
+ Will be converted into:
29
+
30
+ ```text
31
+ Hello, World!
32
+
33
+ This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.
34
+
35
+ Even mismatched tags.
36
+ A div
37
+ Another div
38
+ A div
39
+ within a div
40
+ [A link](http://foo.com)
41
+ ```
42
+
43
+ See the [original blog post](http://journals.jevon.org/users/jevon-phd/entry/19818) or the related [StackOverflow answer](http://stackoverflow.com/a/2564472/39531).
44
+
45
+ ## Installing
46
+
47
+ Install the gem (`gem install html2text`), then you can:
48
+
49
+ ```ruby
50
+ require 'html2text'
51
+
52
+ text = Html2Text.convert(html)
53
+ ```
54
+
55
+ ## Tests
56
+
57
+ See all of the test cases defined in [spec/examples/](spec/examples/). These can be run with:
58
+
59
+ ```
60
+ bundle install
61
+ rspec
62
+ ```
63
+
64
+ ## License
65
+
66
+ `html2text` is licensed under MIT.
67
+
68
+ ## Other versions
69
+
70
+ Also see [html2text](https://github.com/soundasleep/html2text), the original PHP implementation.
@@ -0,0 +1,138 @@
1
+ require 'nokogiri'
2
+
3
# Converts an HTML document into plain text by walking its parsed DOM.
#
# Block-level elements are turned into line breaks, list items into "- "
# bullets, table cells into tab-separated columns and links into
# "[text](href)". Typical use:
#
#   text = Html2Text.convert(html)
class Html2Text
  # Elements whose whole subtree is omitted from the output.
  IGNORED_TAGS = %w[style head title meta script].freeze

  # Heading elements; they get a blank line after them.
  HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze

  # The parsed document (or node) this converter walks.
  attr_reader :doc

  # @param doc the parsed document to convert; it must answer the node
  #   methods used below (text?, text, name, children, next_sibling,
  #   element?, attribute).
  def initialize(doc)
    @doc = doc
  end

  # Parses +html+ with Nokogiri and returns its plain-text rendering.
  #
  # @param html [String, nil] HTML source; nil is treated as an empty string
  # @return [String] the converted text, stripped of surrounding whitespace
  def self.convert(html)
    html = fix_newlines(replace_entities(html.to_s))
    doc = Nokogiri::HTML(html)

    Html2Text.new(doc).convert
  end

  # Normalises Windows ("\r\n") and bare "\r" line endings to "\n".
  #
  # @param text [String]
  # @return [String]
  def self.fix_newlines(text)
    text.gsub(/\r\n?/, "\n")
  end

  # Replaces the literal "&nbsp;" entity with a plain space before parsing.
  # The match is case-sensitive, so "&NBSP;" is left untouched.
  #
  # @param text [String]
  # @return [String]
  def self.replace_entities(text)
    text.gsub("&nbsp;", " ")
  end

  # Renders the whole document as plain text.
  #
  # @return [String]
  def convert
    remove_leading_and_trailing_whitespace(iterate_over(doc)).strip
  end

  # Removes spaces and tabs that sit directly before or after a newline.
  #
  # @param text [String]
  # @return [String]
  def remove_leading_and_trailing_whitespace(text)
    text.gsub(/[ \t]*\n[ \t]*/, "\n")
  end

  # Collapses each run of tabs, newlines, form feeds, carriage returns and
  # spaces into a single space.
  #
  # @param text [String]
  # @return [String]
  def trimmed_whitespace(text)
    text.gsub(/[\t\n\f\r ]+/, " ")
  end

  # Finds the next sibling of +node+ that is an element, skipping any
  # non-element siblings in between.
  #
  # @return [String, nil] the lower-cased tag name, or nil when no element
  #   sibling follows
  def next_node_name(node)
    sibling = node.next_sibling
    sibling = sibling.next_sibling until sibling.nil? || sibling.element?

    sibling.name.downcase if sibling
  end

  # Recursively renders +node+ and its children.
  #
  # @return [String] the text for this subtree ("" for ignored elements)
  def iterate_over(node)
    return trimmed_whitespace(node.text) if node.text?
    return "" if IGNORED_TAGS.include?(node.name.downcase)

    parts = [prefix_whitespace(node)]
    parts.concat(node.children.map { |child| iterate_over(child) })
    parts << suffix_whitespace(node)

    # prefix/suffix are nil for elements that need no extra whitespace
    output = parts.compact.join
    output = wrap_link(node, output) if node.name.downcase == "a"
    output
  end

  # Text emitted before an element's content.
  #
  # @return [String, nil] nil when the element needs no prefix
  def prefix_whitespace(node)
    case node.name.downcase
    when "hr"
      "------\n"
    when *HEADING_TAGS, "ol", "ul", "tr", "p", "div"
      "\n"
    when "td", "th"
      "\t"
    when "li"
      "- "
    end
  end

  # Text emitted after an element's content.
  #
  # @return [String, nil] nil when the element needs no suffix
  def suffix_whitespace(node)
    case node.name.downcase
    when *HEADING_TAGS, "li"
      "\n"
    when "p", "br"
      # a following div supplies its own leading newline
      "\n" if next_node_name(node) != "div"
    when "div"
      # add one line only if another, non-div element follows
      following = next_node_name(node)
      "\n" unless following.nil? || following == "div"
    end
  end

  # Formats the rendered content of an <a> element.
  #
  # * With an href, the result is "[text](href)", unless the text already is
  #   the href (optionally without its "mailto:", "http://" or "https://"
  #   prefix), in which case the text is returned as-is.
  # * With an href but blank text, the href itself is returned, instead of an
  #   invisible "[](href)".
  # * With only a name attribute, the result is "[text]".
  # * With neither attribute, the text is returned unchanged.
  #
  # A newline is appended when the next element sibling is a heading.
  #
  # @param node the <a> element
  # @param output [String] the already-rendered content of the link
  # @return [String]
  def wrap_link(node, output)
    href = node.attribute("href")
    name = node.attribute("name")

    if href.nil?
      output = "[#{output}]" unless name.nil?
    else
      href = href.to_s
      bare_forms = [output, "mailto:#{output}", "http://#{output}", "https://#{output}"]

      if output.strip.empty?
        output = href
      elsif !bare_forms.include?(href)
        output = "[#{output}](#{href})"
      end
    end

    output += "\n" if HEADING_TAGS.include?(next_node_name(node))
    output
  end
end
@@ -0,0 +1,3 @@
1
class Html2Text
  # Version of the html2text gem; frozen so it cannot be mutated at runtime.
  VERSION = "0.1.1".freeze
end
@@ -0,0 +1,12 @@
1
+ A document without any HTML open/closing tags.
2
+
3
+ <hr>
4
+
5
+ We try and use the representation given by common browsers of the
6
+ HTML document, so that it looks similar when converted to plain text.
7
+
8
+ <a href="http://foo.com">visit foo.com</a> - or <a href="http://www.foo.com">http://www.foo.com</a>
9
+
10
+ <a href="http://foo.com" title="a link with a title">link</a>
11
+
12
+ <h2><a name="anchor">An anchor which will not appear</a></h2>
@@ -0,0 +1,5 @@
1
+ A document without any HTML open/closing tags.
2
+ ------
3
+ We try and use the representation given by common browsers of the HTML document, so that it looks similar when converted to plain text. [visit foo.com](http://foo.com) - or http://www.foo.com [link](http://foo.com)
4
+
5
+ [An anchor which will not appear]
@@ -0,0 +1,21 @@
1
+ <html>
2
+ <title>Ignored Title</title>
3
+ <body>
4
+ <h1>Hello, World!</h1>
5
+
6
+ <p>This is some e-mail content.
7
+ Even though it has whitespace and newlines, the e-mail converter
8
+ will handle it correctly.
9
+
10
+ <p>Even mismatched tags.</p>
11
+
12
+ <div>A div</div>
13
+ <div>Another div</div>
14
+ <div>A div<div>within a div</div></div>
15
+
16
+ <p>Another line<br />Yet another line</p>
17
+
18
+ <a href="http://foo.com">A link</a>
19
+
20
+ </body>
21
+ </html>
@@ -0,0 +1,13 @@
1
+ Hello, World!
2
+
3
+ This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.
4
+
5
+ Even mismatched tags.
6
+ A div
7
+ Another div
8
+ A div
9
+ within a div
10
+
11
+ Another line
12
+ Yet another line
13
+ [A link](http://foo.com)
@@ -0,0 +1,24 @@
1
+ <h1>List tests</h1>
2
+
3
+ <p>
4
+ Add some lists.
5
+ </p>
6
+
7
+ <ol>
8
+ <li>one</li>
9
+ <li>two
10
+ <li>three
11
+ </ol>
12
+
13
+ <h2>An unordered list</h2>
14
+
15
+ <ul>
16
+ <li>one
17
+ <li>two</li>
18
+ <li>three</li>
19
+ </ul>
20
+ <ul>
21
+ <li>one
22
+ <li>two</li>
23
+ <li>three</li>
24
+ </ul>
@@ -0,0 +1,17 @@
1
+ List tests
2
+
3
+ Add some lists.
4
+
5
+ - one
6
+ - two
7
+ - three
8
+
9
+ An unordered list
10
+
11
+ - one
12
+ - two
13
+ - three
14
+
15
+ - one
16
+ - two
17
+ - three
@@ -0,0 +1,14 @@
1
+ <h1>Anchor tests</h1>
2
+
3
+ <p>
4
+ Visit http://openiaml.org or <a href="http://openiaml.org">openiaml.org</a> or <a href="http://openiaml.org">http://openiaml.org</a>.
5
+ </p>
6
+
7
+ <p>
8
+ To visit with SSL, visit https://openiaml.org or <a href="https://openiaml.org">openiaml.org</a> or <a href="https://openiaml.org">https://openiaml.org</a>.
9
+ </p>
10
+
11
+ <p>
12
+ To mail, email support@openiaml.org or mailto:support@openiaml.org
13
+ or <a href="mailto:support@openiaml.org">support@openiaml.org</a> or <a href="mailto:support@openiaml.org">mailto:support@openiaml.org</a>.
14
+ </p>
@@ -0,0 +1,7 @@
1
+ Anchor tests
2
+
3
+ Visit http://openiaml.org or openiaml.org or http://openiaml.org.
4
+
5
+ To visit with SSL, visit https://openiaml.org or openiaml.org or https://openiaml.org.
6
+
7
+ To mail, email support@openiaml.org or mailto:support@openiaml.org or support@openiaml.org or mailto:support@openiaml.org.
@@ -0,0 +1 @@
1
+ hello &nbsp; world &amp; people &lt; &gt; &NBSP;
@@ -0,0 +1 @@
1
+ hello world & people < > &NBSP;
@@ -0,0 +1,53 @@
1
+ <html>
2
+ <title>Ignored Title</title>
3
+ <body>
4
+ <h1>Hello, World!</h1>
5
+ <table>
6
+ <thead>
7
+ <tr>
8
+ <th>Col A</th>
9
+ <th>Col B</th>
10
+ </tr>
11
+ </thead>
12
+ <tbody>
13
+ <tr>
14
+ <td>
15
+ Data A1
16
+ </td>
17
+ <td>
18
+ Data B1
19
+ </td>
20
+ </tr>
21
+ <tr>
22
+ <td>
23
+ Data A2
24
+ </td>
25
+ <td>
26
+ Data B2
27
+ </td>
28
+ </tr>
29
+ <tr>
30
+ <td>
31
+ Data A3
32
+ </td>
33
+ <td>
34
+ Data B4
35
+ </td>
36
+ </tr>
37
+ </tbody>
38
+ <tfoot>
39
+ <tr>
40
+ <td>
41
+ Total A
42
+ </td>
43
+ <td>
44
+ Total B
45
+ </td>
46
+ </tr>
47
+
48
+ </tfoot>
49
+
50
+ </table>
51
+
52
+ </body>
53
+ </html>
@@ -0,0 +1,7 @@
1
+ Hello, World!
2
+
3
+ Col A Col B
4
+ Data A1 Data B1
5
+ Data A2 Data B2
6
+ Data A3 Data B4
7
+ Total A Total B
@@ -0,0 +1 @@
1
+ test one<br />test two
@@ -0,0 +1,2 @@
1
+ test one
2
+ test two
@@ -0,0 +1 @@
1
+ 1<br />2<br />3<br />4<br />5 6
@@ -0,0 +1,5 @@
1
+ 1
2
+ 2
3
+ 3
4
+ 4
5
+ 5 6
@@ -0,0 +1,25 @@
1
+ require "spec_helper"
2
+
3
# Fixture-driven specs: every spec/examples/NAME.html file is converted and
# compared against the expected text stored next to it in NAME.txt.
describe Html2Text do
  describe "#convert" do
    let(:text) { Html2Text.convert(html) }

    # Sorted so the examples run in the same order on every platform.
    examples = Dir[File.join(File.dirname(__FILE__), "examples", "*.html")].sort

    examples.each do |filename|
      context filename do
        let(:html) { File.read(filename) }
        # Swap only the trailing extension; a ".html" elsewhere in the path
        # must be left alone.
        let(:text_file) { filename.sub(/\.html\z/, ".txt") }
        let(:expected) { Html2Text.fix_newlines(File.read(text_file)) }

        it "converts to text" do
          expect(text).to eq(expected)
        end
      end
    end

    # Guards against the glob silently matching nothing.
    it "has examples to test" do
      expect(examples.size).to_not eq(0)
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require "spec_helper"
2
+
3
# Unit specs for Html2Text that do not rely on the fixture files.
describe Html2Text do
  describe "#convert" do
    let(:text) { Html2Text.convert(html) }

    context "an empty line" do
      let(:html) { "" }

      it "is an empty line" do
        expect(text).to eq("")
      end
    end

    context "a simple string" do
      let(:html) { "hello world" }

      it "is returned unchanged" do
        expect(text).to eq("hello world")
      end
    end
  end

  describe "#remove_leading_and_trailing_whitespace" do
    # The method does not touch the document, so a nil doc is sufficient.
    subject { Html2Text.new(nil).remove_leading_and_trailing_whitespace(input) }

    context "an empty string" do
      let(:input) { "" }
      it { is_expected.to eq("") }
    end

    context "many new lines" do
      let(:input) { "hello\n world \n yes" }
      it { is_expected.to eq("hello\nworld\nyes") }
    end
  end
end
@@ -0,0 +1,4 @@
1
+ require "rspec"
2
+ require "rspec/collection_matchers"
3
+
4
+ require File.join(File.dirname(__FILE__), "..", "lib", "html2text")
metadata ADDED
@@ -0,0 +1,156 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html2text
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Jevon Wright
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec-collection_matchers
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: colorize
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rake
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: A Ruby component to convert HTML into a plain text format.
84
+ email:
85
+ - jevon@powershop.co.nz
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - MIT-LICENSE
91
+ - README.md
92
+ - lib/html2text.rb
93
+ - lib/html2text/version.rb
94
+ - spec/examples/anchors.html
95
+ - spec/examples/anchors.txt
96
+ - spec/examples/basic.html
97
+ - spec/examples/basic.txt
98
+ - spec/examples/lists.html
99
+ - spec/examples/lists.txt
100
+ - spec/examples/more-anchors.html
101
+ - spec/examples/more-anchors.txt
102
+ - spec/examples/nbsp.html
103
+ - spec/examples/nbsp.txt
104
+ - spec/examples/table.html
105
+ - spec/examples/table.txt
106
+ - spec/examples/test3.html
107
+ - spec/examples/test3.txt
108
+ - spec/examples/test4.html
109
+ - spec/examples/test4.txt
110
+ - spec/examples_spec.rb
111
+ - spec/html2text_spec.rb
112
+ - spec/spec_helper.rb
113
+ homepage: https://github.com/soundasleep/html2text_ruby
114
+ licenses:
115
+ - MIT
116
+ metadata: {}
117
+ post_install_message:
118
+ rdoc_options: []
119
+ require_paths:
120
+ - lib
121
+ required_ruby_version: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - ">="
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ required_rubygems_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ requirements: []
132
+ rubyforge_project:
133
+ rubygems_version: 2.4.5
134
+ signing_key:
135
+ specification_version: 4
136
+ summary: Convert HTML into plain text.
137
+ test_files:
138
+ - spec/examples/anchors.html
139
+ - spec/examples/anchors.txt
140
+ - spec/examples/basic.html
141
+ - spec/examples/basic.txt
142
+ - spec/examples/lists.html
143
+ - spec/examples/lists.txt
144
+ - spec/examples/more-anchors.html
145
+ - spec/examples/more-anchors.txt
146
+ - spec/examples/nbsp.html
147
+ - spec/examples/nbsp.txt
148
+ - spec/examples/table.html
149
+ - spec/examples/table.txt
150
+ - spec/examples/test3.html
151
+ - spec/examples/test3.txt
152
+ - spec/examples/test4.html
153
+ - spec/examples/test4.txt
154
+ - spec/examples_spec.rb
155
+ - spec/html2text_spec.rb
156
+ - spec/spec_helper.rb