html_to_plain_text 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/MIT_LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Brian Durand
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,18 @@
1
+ = HTML To Plain Text
2
+
3
+ A simple gem that provides code to convert HTML into a plain text alternative. Line breaks from HTML block level elements will be maintained. Lists and tables will also maintain a little bit of formatting.
4
+
5
+ * Line breaks will be approximated using the generally established default margins for HTML tags (e.g. a <p>
6
+ tag generates two line breaks, <div> generates one)
7
+ * List items will be numbered or bulleted with an asterisk
8
+ * <br> tags will add line breaks
9
+ * <hr> tags will add a string of hyphens to serve as a horizontal rule
10
+ * <table> elements will be enclosed in "|" delimiters
11
+ * <a> tags will have the href URL appended to the text in parentheses
12
+ * Formatting tags like <strong> or <b> will be stripped
13
+ * Formatting inside <pre> or <plaintext> elements will be honored
14
+ * Code-like tags like <script> or <style> will be stripped
15
+
16
+ == Usage
17
+
18
+ HtmlToPlainText.plain_text(html)
data/Rakefile ADDED
@@ -0,0 +1,29 @@
1
# Build script for the html_to_plain_text gem: wires up the test tasks and,
# when the gemspec is present next to this file, the gem packaging tasks.
require 'rubygems'
require 'rubygems/package_task'
require 'rake'

desc 'Default: run unit tests.'
task :default => :test

desc 'RVM likes to call it tests'
task :tests => :test

# RSpec is a development-only dependency. When it is not installed, define a
# stub :test task that explains what is missing instead of failing at load time.
begin
  require 'rspec'
  require 'rspec/core/rake_task'
  desc 'Run the unit tests'
  RSpec::Core::RakeTask.new(:test)
rescue LoadError
  task :test do
    STDERR.puts "You must have rspec 2.0 installed to run the tests"
  end
end

spec_file = File.expand_path('../html_to_plain_text.gemspec', __FILE__)
if File.exist?(spec_file)
  # Gem::Specification.load is the supported way to read a gemspec; it replaces
  # a raw eval of the file contents. It returns nil when the file cannot be
  # loaded, so the packaging tasks are only defined for a usable spec.
  spec = Gem::Specification.load(spec_file)

  if spec
    Gem::PackageTask.new(spec) do |p|
      p.gem_spec = spec
    end
  end
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
@@ -0,0 +1,131 @@
1
+ require 'nokogiri'
2
+
3
+ # The main method on this module +plain_text+ will convert a string of HTML to a plain text approximation.
4
# Converts a string of HTML into a plain text approximation. The main entry
# point is HtmlToPlainText.plain_text; the module can also be mixed in to get
# an instance level +plain_text+ helper.
module HtmlToPlainText
  # Tag name lookup tables (tag name => true).
  # Elements whose entire content is dropped from the output.
  IGNORE_TAGS = %w(script style object applet iframe).inject({}){|h, t| h[t] = true; h}.freeze
  # Elements separated from their surroundings by a blank line.
  PARAGRAPH_TAGS = %w(p h1 h2 h3 h4 h5 h6 table ol ul dl dd blockquote dialog figure aside section).inject({}){|h, t| h[t] = true; h}.freeze
  # Elements separated from their surroundings by a single line break.
  BLOCK_TAGS = %w(div address li dt center del article header footer nav pre legend tr).inject({}){|h, t| h[t] = true; h}.freeze
  WHITESPACE = [" ", "\n", "\r"].freeze
  PLAINTEXT = "plaintext".freeze
  PRE = "pre".freeze
  BR = "br".freeze
  HR = "hr".freeze
  TD = "td".freeze
  TH = "th".freeze
  TR = "tr".freeze
  OL = "ol".freeze
  UL = "ul".freeze
  LI = "li".freeze
  # First label of an ordered list, alternating by nesting depth ("1", "a", "1", ...).
  # Frozen because the values are only ever used as seeds for String#next.
  NUMBERS = ["1".freeze, "a".freeze].freeze
  # Anchored with \A so the scheme must start the whole href; ^ would also
  # match at the beginning of any later line inside the attribute value.
  ABSOLUTE_URL_PATTERN = /\A[a-z]+:\/\/[a-z0-9]/i

  # Helper instance method for converting HTML into plain text. This method simply calls HtmlToPlainText.plain_text.
  def plain_text(html)
    HtmlToPlainText.plain_text(html)
  end

  class << self
    # Convert some HTML into a plain text approximation.
    #
    # html:: String of HTML markup.
    #
    # Returns the plain text String, or nil when +html+ is nil or empty or
    # when the parsed document has no body element.
    def plain_text(html)
      return if html.nil? || html.empty?
      body = Nokogiri::HTML::Document.parse(html).css("body").first
      return unless body
      # Normalize CR and CRLF line endings to LF.
      convert_node_to_plain_text(body).strip.gsub(/\r(\n?)/, "\n")
    end

    private

    # Convert an HTML node to plain text. This method is called recursively with the
    # shared output buffer and the formatting options for special tags.
    #
    # parent::  node whose children are rendered.
    # out::     String buffer that is appended to in place.
    # options:: Hash of formatting state (:pre, :list, :ul, :ol, :number).
    #
    # Returns +out+.
    def convert_node_to_plain_text(parent, out = "".dup, options = {})
      append_breaks_for(parent.name, out)
      format_list_item(out, options) if parent.name == LI
      out << "| " if parent.name == TR

      parent.children.each do |node|
        if node.text? || node.cdata?
          append_text(node, out, options)
        elsif node.name == PLAINTEXT
          out << node.text
        elsif node.element? && !IGNORE_TAGS.include?(node.name)
          convert_node_to_plain_text(node, out, child_options(node, options))
          append_element_suffix(node, out)
        end
      end
      out
    end

    # Append the content of a text or CDATA node. Outside of <pre> blocks runs of
    # whitespace collapse to single spaces and leading whitespace is dropped when
    # the buffer already ends with whitespace.
    def append_text(node, out, options)
      text = node.text
      unless options[:pre]
        text = text.gsub(/[\n\r]/, " ").squeeze(" ")
        text.lstrip! if WHITESPACE.include?(out[-1, 1])
      end
      out << text
    end

    # Append whatever follows an element once its children have been rendered:
    # a line break, a horizontal rule, a table cell delimiter, a link URL or the
    # closing paragraph/block breaks.
    def append_element_suffix(node, out)
      case node.name
      when BR
        out << "\n"
      when HR
        out << "\n" unless out.end_with?("\n")
        out << "-------------------------------\n"
      when TD, TH
        out << " | "
      when "a"
        href = node["href"]
        # Only absolute URLs on links that have visible text are shown.
        if href && href =~ ABSOLUTE_URL_PATTERN && node.text =~ /\S/
          out << " (#{href}) "
        end
      else
        append_breaks_for(node.name, out)
      end
    end

    # Append paragraph or block breaks when the tag name calls for them.
    def append_breaks_for(tag_name, out)
      if PARAGRAPH_TAGS.include?(tag_name)
        append_paragraph_breaks(out)
      elsif BLOCK_TAGS.include?(tag_name)
        append_block_breaks(out)
      end
    end

    # Set formatting options that will be passed to child elements for a tag.
    # List and <pre> elements get a new Hash; every other element passes the
    # same Hash through, which lets sibling <li> elements share a counter.
    def child_options(node, options)
      if node.name == UL
        level = options[:ul] || -1
        level += 1
        options.merge(:list => :ul, :ul => level)
      elsif node.name == OL
        level = options[:ol] || -1
        level += 1
        options.merge(:list => :ol, :ol => level, :number => NUMBERS[level % 2])
      elsif node.name == PRE
        options.merge(:pre => true)
      else
        options
      end
    end

    # Add double line breaks between paragraph elements. If line breaks already exist,
    # new ones will only be added to get to two.
    def append_paragraph_breaks(out)
      out.chomp!(" ")
      if out.end_with?("\n")
        out << "\n" unless out.end_with?("\n\n")
      else
        out << "\n\n"
      end
    end

    # Add a single line break between block elements. If a line break already exists,
    # none will be added.
    def append_block_breaks(out)
      out.chomp!(" ")
      out << "\n" unless out.end_with?("\n")
    end

    # Add an appropriate bullet or number to a list element. For ordered lists the
    # counter in +options+ is advanced in place for the next sibling item.
    def format_list_item(out, options)
      if options[:list] == :ul
        out << "#{'*' * (options[:ul] + 1)} "
      elsif options[:list] == :ol
        number = options[:number]
        options[:number] = number.next
        out << "#{number}. "
      end
    end
  end
end
@@ -0,0 +1,93 @@
1
+ require 'spec_helper'
2
+
3
# Behavior examples for HtmlToPlainText.plain_text. Each example feeds a small
# HTML fragment through the converter and compares the complete text output.
describe HtmlToPlainText do
  # Paragraph level elements are separated by a blank line.
  it "should format paragraph tags" do
    markup = "<h1>Test</h1><h2>More Test</h2><p>This is a test</p>"
    expected = "Test\n\nMore Test\n\nThis is a test"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  # Block level elements are separated by a single line break.
  it "should format block tags" do
    markup = "<div>Test</div><div>More Test<div>This is a test</div></div>"
    expected = "Test\nMore Test\nThis is a test"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  it "should format <br> tags" do
    markup = "<div>Test</div><br><div>More Test<br />This is a test"
    expected = "Test\n\nMore Test\nThis is a test"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  it "should format <hr> tags" do
    markup = "<div>Test</div><hr><div>More Test<hr />This is a test"
    expected = "Test\n-------------------------------\nMore Test\n-------------------------------\nThis is a test"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  # Whitespace collapses everywhere except inside <pre>.
  it "should keep text formatting in <pre> tag blocks" do
    markup = "<div>This \n is a \ntest</div><pre>with\n pre tags</pre>end"
    expected = "This is a test\nwith\n pre tags\nend"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  it "should remove inline formatting tags" do
    markup = "This is <strong>so</strong> cool. I<em> mean <em>it."
    expected = "This is so cool. I mean it."
    HtmlToPlainText.plain_text(markup).should == expected
  end

  it "should remove script, style, object, applet, and iframe tags" do
    markup = "script <script>do_something</script> style <style>css</style> object <object>config</object> applet <applet>config</applet> iframe <iframe>config</iframe>"
    expected = "script style object applet iframe"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  it "should handle plaintext tags" do
    markup = "<div>my\nhtml</div><plaintext>my\n text"
    expected = "my html\nmy\n text"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  it "should not add extraneous spaces or line breaks" do
    markup = "this<p><p> is \n \n pretty bad lo<em>oking htm</em>l!"
    expected = "this\n\nis pretty bad looking html!"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  # Nested bullet lists gain one asterisk per level.
  it "should format bullet lists" do
    markup = "List<ul><li>one</li><li>two<ul><li>a</li><li>b</li></ul></li><li>three</li></ul>"
    expected = "List\n\n* one\n* two\n\n** a\n** b\n\n* three"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  # Nested ordered lists alternate between numbers and letters.
  it "should format numbered lists" do
    markup = "List<ol><li>one</li><li>two<ol><li>a</li><li>b</li></ol></li><li>three</li></ol>"
    expected = "List\n\n1. one\n2. two\n\na. a\nb. b\n\n3. three"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  it "should format a table" do
    markup = "Table<table><tr><th>Col 1</th><th>Col 2</th></tr><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>"
    expected = "Table\n\n| Col 1 | Col 2 |\n| 1 | 2 |\n| 3 | 4 |"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  it "should ignore inline tags without bodies" do
    markup = "This is an <img src=\"/image\"> image"
    expected = "This is an image"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  it "should ignore comments" do
    markup = "This is <!-- html comment here --> html"
    expected = "This is html"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  it "should unencode entities" do
    markup = "High &amp; Low"
    expected = "High & Low"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  # CR and CRLF line endings both come out as LF.
  it "should normalize the line breaks" do
    markup = "<pre>These are\rreturn\r\nlines</pre>"
    expected = "These are\nreturn\nlines"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  # Only absolute hrefs on links with visible text are appended.
  it "should include absolute link URLs" do
    markup = "<a name='links'>Links</a> <a href='/test'>partial</a> <a href='http://example.com/test'>full</a> test<a href='http://example.com/test2'> <img src='test'> </a>"
    expected = "Links partial full (http://example.com/test) test"
    HtmlToPlainText.plain_text(markup).should == expected
  end

  it "should unescape entities" do
    markup = "This &amp; th&#97;t"
    expected = "This & that"
    HtmlToPlainText.plain_text(markup).should == expected
  end
end
@@ -0,0 +1 @@
1
+ require File.expand_path("../../lib/html_to_plain_text.rb", __FILE__)
metadata ADDED
@@ -0,0 +1,107 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html_to_plain_text
3
+ version: !ruby/object:Gem::Version
4
+ hash: 23
5
+ prerelease:
6
+ segments:
7
+ - 1
8
+ - 0
9
+ - 0
10
+ version: 1.0.0
11
+ platform: ruby
12
+ authors:
13
+ - Brian Durand
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-06-08 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: nokogiri
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 7
30
+ segments:
31
+ - 1
32
+ - 4
33
+ - 0
34
+ version: 1.4.0
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: rspec
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">"
44
+ - !ruby/object:Gem::Version
45
+ hash: 15
46
+ segments:
47
+ - 2
48
+ - 0
49
+ - 0
50
+ version: 2.0.0
51
+ type: :development
52
+ version_requirements: *id002
53
+ description: A simple library for converting HTML into an approximation in plain text.
54
+ email:
55
+ - bdurand@embellishedvisions.com
56
+ executables: []
57
+
58
+ extensions: []
59
+
60
+ extra_rdoc_files:
61
+ - README.rdoc
62
+ files:
63
+ - README.rdoc
64
+ - VERSION
65
+ - Rakefile
66
+ - MIT_LICENSE
67
+ - lib/html_to_plain_text.rb
68
+ - spec/html_to_plain_text_spec.rb
69
+ - spec/spec_helper.rb
70
+ has_rdoc: true
71
+ homepage: http://github.com/bdurand/html_to_plain_text
72
+ licenses: []
73
+
74
+ post_install_message:
75
+ rdoc_options:
76
+ - --charset=UTF-8
77
+ - --main
78
+ - README.rdoc
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ none: false
83
+ requirements:
84
+ - - ">="
85
+ - !ruby/object:Gem::Version
86
+ hash: 3
87
+ segments:
88
+ - 0
89
+ version: "0"
90
+ required_rubygems_version: !ruby/object:Gem::Requirement
91
+ none: false
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ hash: 3
96
+ segments:
97
+ - 0
98
+ version: "0"
99
+ requirements: []
100
+
101
+ rubyforge_project:
102
+ rubygems_version: 1.5.2
103
+ signing_key:
104
+ specification_version: 3
105
+ summary: A simple library for converting HTML into plain text.
106
+ test_files: []
107
+