html_to_plain_text 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/MIT_LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Brian Durand
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,18 @@
1
+ = HTML To Plain Text
2
+
3
+ A simple gem that provides code to convert HTML into a plain text alternative. Line breaks from HTML block level elements will be maintained. Lists and tables will also maintain a little bit of formatting.
4
+
5
+ * Line breaks will be approximated using the generally established default margins for HTML tags (e.g. <p>
6
+ tag generates two line breaks, <div> generates one)
7
+ * List items will be numbered or bulleted with an asterisk
8
+ * <br> tags will add line breaks
9
+ * <hr> tags will add a string of hyphens to serve as a horizontal rule
10
+ * <table> elements will be enclosed in "|" delimiters
11
+ * <a> tags will have the href URL appended to the text in parentheses
12
+ * Formatting tags like <strong> or <b> will be stripped
13
+ * Formatting inside <pre> or <plaintext> elements will be honored
14
+ * Code-like tags like <script> or <style> will be stripped
15
+
16
+ == Usage
17
+
18
+ HtmlToPlainText.plain_text(html)
data/Rakefile ADDED
@@ -0,0 +1,29 @@
1
+ require 'rubygems'
2
+ require 'rubygems/package_task'
3
+ require 'rake'
4
+
5
desc 'Default: run unit tests.'
task :default => :test

desc 'RVM likes to call it tests'
task :tests => :test

# Define the :test task with RSpec when it is available. When it is not,
# define a stand-in task that fails loudly so a missing test framework is
# never mistaken for a passing test run.
begin
  require 'rspec'
  require 'rspec/core/rake_task'
  desc 'Run the unit tests'
  RSpec::Core::RakeTask.new(:test)
rescue LoadError
  task :test do
    # abort writes the message to STDERR and exits with a non-zero status.
    abort "You must have rspec 2.0 installed to run the tests"
  end
end

# Packaging tasks are only defined when the gemspec sits next to this Rakefile
# and can be loaded. Gem::Specification.load returns nil when the file cannot
# be turned into a specification, in which case no packaging tasks are defined.
spec_file = File.expand_path('../html_to_plain_text.gemspec', __FILE__)
spec = File.exist?(spec_file) ? Gem::Specification.load(spec_file) : nil
if spec
  # The block must be given: Gem::PackageTask only defines its tasks when
  # it is constructed with a block.
  Gem::PackageTask.new(spec) do |p|
    p.gem_spec = spec
  end
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
@@ -0,0 +1,131 @@
1
+ require 'nokogiri'
2
+
3
# The main method on this module +plain_text+ will convert a string of HTML to a plain text approximation.
module HtmlToPlainText
  # Elements whose content is left out of the output entirely.
  IGNORE_TAGS = %w(script style object applet iframe).inject({}){|h, t| h[t] = true; h}.freeze
  # Elements that are separated from their surroundings by two line breaks.
  PARAGRAPH_TAGS = %w(p h1 h2 h3 h4 h5 h6 table ol ul dl dd blockquote dialog figure aside section).inject({}){|h, t| h[t] = true; h}.freeze
  # Elements that are separated from their surroundings by a single line break.
  BLOCK_TAGS = %w(div address li dt center del article header footer nav pre legend tr).inject({}){|h, t| h[t] = true; h}.freeze
  # When the output already ends with one of these characters, leading
  # whitespace of the next piece of text is dropped.
  WHITESPACE = [" ", "\t", "\n", "\r"].freeze
  PLAINTEXT = "plaintext".freeze
  PRE = "pre".freeze
  BR = "br".freeze
  HR = "hr".freeze
  TD = "td".freeze
  TH = "th".freeze
  TR = "tr".freeze
  OL = "ol".freeze
  UL = "ul".freeze
  LI = "li".freeze
  # First label of an ordered list, alternating by nesting depth ("1", "a", "1", ...).
  NUMBERS = %w(1 a).freeze
  # Matches URLs that start with a scheme followed by "://". Anchored with \A
  # so that only the very start of the href is tested; ^ would also match at
  # the start of any later line inside the attribute value.
  ABSOLUTE_URL_PATTERN = /\A[a-z]+:\/\/[a-z0-9]/i

  # Helper instance method for converting HTML into plain text. This method simply calls HtmlToPlainText.plain_text.
  def plain_text(html)
    HtmlToPlainText.plain_text(html)
  end

  class << self
    # Convert some HTML into a plain text approximation.
    #
    # Returns nil when +html+ is nil or empty, or when the parsed document has
    # no body element. Otherwise returns the converted text with surrounding
    # whitespace stripped and "\r" / "\r\n" line endings normalized to "\n".
    def plain_text(html)
      return if html.nil? || html.empty?
      body = Nokogiri::HTML::Document.parse(html).css("body").first
      return unless body
      convert_node_to_plain_text(body).strip.gsub(/\r(\n?)/, "\n")
    end

    private

    # Convert an HTML node to plain text. This method is called recursively with the output and
    # formatting options for special tags.
    #
    # +parent+ is the node whose children are rendered, +out+ is the string
    # buffer that is appended to in place (and returned), and +options+ carries
    # the list / pre state set up by child_options.
    def convert_node_to_plain_text(parent, out = "".dup, options = {})
      # The default buffer is a dup so it stays appendable even if string
      # literals are frozen.
      append_tag_breaks(parent.name, out)

      format_list_item(out, options) if parent.name == LI
      out << "| " if parent.name == TR

      parent.children.each do |node|
        if node.text? || node.cdata?
          append_text(node.text, out, options)
        elsif node.name == PLAINTEXT
          # Text inside <plaintext> is copied through untouched.
          out << node.text
        elsif node.element? && !IGNORE_TAGS.include?(node.name)
          convert_node_to_plain_text(node, out, child_options(node, options))
          append_element_suffix(node, out)
        end
      end
      out
    end

    # Append the text of a text or CDATA node. Outside of <pre>, tabs and line
    # breaks are turned into spaces, runs of spaces are squeezed to one, and
    # leading whitespace is dropped when the output already ends in whitespace.
    def append_text(text, out, options)
      unless options[:pre]
        text = text.gsub(/[\t\n\r]/, " ").squeeze(" ")
        text.lstrip! if WHITESPACE.include?(out[-1, 1])
      end
      out << text
    end

    # Append whatever follows an element once its children have been rendered:
    # a line break for <br>, a rule for <hr>, a cell delimiter for <td>/<th>,
    # the URL for absolute links, or the closing paragraph / block breaks.
    def append_element_suffix(node, out)
      case node.name
      when BR
        out << "\n"
      when HR
        out << "\n" unless out.end_with?("\n")
        out << "-------------------------------\n"
      when TD, TH
        out << " | "
      when "a"
        href = node["href"]
        # Only absolute URLs on links that have visible text are shown.
        if href && href =~ ABSOLUTE_URL_PATTERN && node.text =~ /\S/
          out << " (#{href}) "
        end
      else
        append_tag_breaks(node.name, out)
      end
    end

    # Append paragraph or block line breaks for the named tag; tags that are in
    # neither PARAGRAPH_TAGS nor BLOCK_TAGS leave the output untouched.
    def append_tag_breaks(name, out)
      if PARAGRAPH_TAGS.include?(name)
        append_paragraph_breaks(out)
      elsif BLOCK_TAGS.include?(name)
        append_block_breaks(out)
      end
    end

    # Set formatting options that will be passed to child elements for a tag.
    #
    # <ul>, <ol> and <pre> get a new hash built with merge; every other tag
    # gets the very same +options+ object back. format_list_item relies on
    # that: all <li> children of one <ol> share the hash holding the counter.
    def child_options(node, options)
      case node.name
      when UL
        level = (options[:ul] || -1) + 1
        options.merge(:list => :ul, :ul => level)
      when OL
        level = (options[:ol] || -1) + 1
        options.merge(:list => :ol, :ol => level, :number => NUMBERS[level % 2])
      when PRE
        options.merge(:pre => true)
      else
        options
      end
    end

    # Add double line breaks between paragraph elements. If line breaks already exist,
    # new ones will only be added to get to two.
    def append_paragraph_breaks(out)
      out.chomp!(" ")
      if out.end_with?("\n")
        out << "\n" unless out.end_with?("\n\n")
      else
        out << "\n\n"
      end
    end

    # Add a single line break between block elements. If a line break already exists,
    # none will be added.
    def append_block_breaks(out)
      out.chomp!(" ")
      out << "\n" unless out.end_with?("\n")
    end

    # Add an appropriate bullet or number to a list element.
    #
    # For ordered lists this advances options[:number] in place, so the next
    # <li> that receives the same options hash gets the following label.
    def format_list_item(out, options)
      if options[:list] == :ul
        out << "#{'*' * (options[:ul] + 1)} "
      elsif options[:list] == :ol
        number = options[:number]
        options[:number] = number.next
        out << "#{number}. "
      end
    end
  end
end
@@ -0,0 +1,93 @@
1
+ require 'spec_helper'
2
+
3
# Examples for HtmlToPlainText.plain_text: each one feeds a small HTML snippet
# through the converter and compares the result with the exact expected text.
describe HtmlToPlainText do
  # Shorthand for the call under test.
  def convert(markup)
    HtmlToPlainText.plain_text(markup)
  end

  it "should format paragraph tags" do
    convert("<h1>Test</h1><h2>More Test</h2><p>This is a test</p>").should == "Test\n\nMore Test\n\nThis is a test"
  end

  it "should format block tags" do
    convert("<div>Test</div><div>More Test<div>This is a test</div></div>").should == "Test\nMore Test\nThis is a test"
  end

  it "should format <br> tags" do
    convert("<div>Test</div><br><div>More Test<br />This is a test").should == "Test\n\nMore Test\nThis is a test"
  end

  it "should format <hr> tags" do
    convert("<div>Test</div><hr><div>More Test<hr />This is a test").should == "Test\n-------------------------------\nMore Test\n-------------------------------\nThis is a test"
  end

  it "should keep text formatting in <pre> tag blocks" do
    convert("<div>This \n is a \ntest</div><pre>with\n pre tags</pre>end").should == "This is a test\nwith\n pre tags\nend"
  end

  it "should remove inline formatting tags" do
    convert("This is <strong>so</strong> cool. I<em> mean <em>it.").should == "This is so cool. I mean it."
  end

  it "should remove script, style, object, applet, and iframe tags" do
    convert("script <script>do_something</script> style <style>css</style> object <object>config</object> applet <applet>config</applet> iframe <iframe>config</iframe>").should == "script style object applet iframe"
  end

  it "should handle plaintext tags" do
    convert("<div>my\nhtml</div><plaintext>my\n text").should == "my html\nmy\n text"
  end

  it "should not add extraneous spaces or line breaks" do
    convert("this<p><p> is \n \n pretty bad lo<em>oking htm</em>l!").should == "this\n\nis pretty bad looking html!"
  end

  it "should format bullet lists" do
    convert("List<ul><li>one</li><li>two<ul><li>a</li><li>b</li></ul></li><li>three</li></ul>").should == "List\n\n* one\n* two\n\n** a\n** b\n\n* three"
  end

  it "should format numbered lists" do
    convert("List<ol><li>one</li><li>two<ol><li>a</li><li>b</li></ol></li><li>three</li></ol>").should == "List\n\n1. one\n2. two\n\na. a\nb. b\n\n3. three"
  end

  it "should format a table" do
    convert("Table<table><tr><th>Col 1</th><th>Col 2</th></tr><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>").should == "Table\n\n| Col 1 | Col 2 |\n| 1 | 2 |\n| 3 | 4 |"
  end

  it "should ignore inline tags without bodies" do
    convert("This is an <img src=\"/image\"> image").should == "This is an image"
  end

  it "should ignore comments" do
    convert("This is <!-- html comment here --> html").should == "This is html"
  end

  it "should unencode entities" do
    convert("High &amp; Low").should == "High & Low"
  end

  it "should normalize the line breaks" do
    convert("<pre>These are\rreturn\r\nlines</pre>").should == "These are\nreturn\nlines"
  end

  it "should include absolute link URLs" do
    convert("<a name='links'>Links</a> <a href='/test'>partial</a> <a href='http://example.com/test'>full</a> test<a href='http://example.com/test2'> <img src='test'> </a>").should == "Links partial full (http://example.com/test) test"
  end

  it "should unescape entities" do
    convert("This &amp; th&#97;t").should == "This & that"
  end
end
@@ -0,0 +1 @@
1
+ require File.expand_path("../../lib/html_to_plain_text.rb", __FILE__)
metadata ADDED
@@ -0,0 +1,107 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html_to_plain_text
3
+ version: !ruby/object:Gem::Version
4
+ hash: 23
5
+ prerelease:
6
+ segments:
7
+ - 1
8
+ - 0
9
+ - 0
10
+ version: 1.0.0
11
+ platform: ruby
12
+ authors:
13
+ - Brian Durand
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-06-08 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: nokogiri
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 7
30
+ segments:
31
+ - 1
32
+ - 4
33
+ - 0
34
+ version: 1.4.0
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: rspec
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">"
44
+ - !ruby/object:Gem::Version
45
+ hash: 15
46
+ segments:
47
+ - 2
48
+ - 0
49
+ - 0
50
+ version: 2.0.0
51
+ type: :development
52
+ version_requirements: *id002
53
+ description: A simple library for converting HTML into an approximation in plain text.
54
+ email:
55
+ - bdurand@embellishedvisions.com
56
+ executables: []
57
+
58
+ extensions: []
59
+
60
+ extra_rdoc_files:
61
+ - README.rdoc
62
+ files:
63
+ - README.rdoc
64
+ - VERSION
65
+ - Rakefile
66
+ - MIT_LICENSE
67
+ - lib/html_to_plain_text.rb
68
+ - spec/html_to_plain_text_spec.rb
69
+ - spec/spec_helper.rb
70
+ has_rdoc: true
71
+ homepage: http://github.com/bdurand/html_to_plain_text
72
+ licenses: []
73
+
74
+ post_install_message:
75
+ rdoc_options:
76
+ - --charset=UTF-8
77
+ - --main
78
+ - README.rdoc
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ none: false
83
+ requirements:
84
+ - - ">="
85
+ - !ruby/object:Gem::Version
86
+ hash: 3
87
+ segments:
88
+ - 0
89
+ version: "0"
90
+ required_rubygems_version: !ruby/object:Gem::Requirement
91
+ none: false
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ hash: 3
96
+ segments:
97
+ - 0
98
+ version: "0"
99
+ requirements: []
100
+
101
+ rubyforge_project:
102
+ rubygems_version: 1.5.2
103
+ signing_key:
104
+ specification_version: 3
105
+ summary: A simple library for converting HTML into plain text.
106
+ test_files: []
107
+