html_to_plain_text 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT_LICENSE +20 -0
- data/README.rdoc +18 -0
- data/Rakefile +29 -0
- data/VERSION +1 -0
- data/lib/html_to_plain_text.rb +131 -0
- data/spec/html_to_plain_text_spec.rb +93 -0
- data/spec/spec_helper.rb +1 -0
- metadata +107 -0
data/MIT_LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011 Brian Durand
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
= HTML To Plain Text
|
2
|
+
|
3
|
+
A simple gem that provides code to convert HTML into a plain text alternative. Line breaks from HTML block level elements will be maintained. Lists and tables will also maintain a little bit of formatting.
|
4
|
+
|
5
|
+
* Line breaks will be approximated using the generally established default margins for HTML tags (i.e. <p>
|
6
|
+
tag generates two line breaks, <div> generates one)
|
7
|
+
* List items will be numbered or bulleted with an asterisk
|
8
|
+
* <br> tags will add line breaks
|
9
|
+
* <hr> tags will add a string of hyphens to serve as a horizontal rule
|
10
|
+
* <table> elements will be enclosed in "|" delimiters
|
11
|
+
* <a> tags will have the href URL appended to the text in parentheses
|
12
|
+
* Formatting tags like <strong> or <b> will be stripped
|
13
|
+
* Formatting inside <pre> or <plaintext> elements will be honored
|
14
|
+
* Code-like tags like <script> or <style> will be stripped
|
15
|
+
|
16
|
+
== Usage
|
17
|
+
|
18
|
+
HtmlToPlainText.plain_text(html)
|
data/Rakefile
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rubygems/package_task'
|
3
|
+
require 'rake'
|
4
|
+
|
5
|
+
# `rake` with no arguments runs the test suite.
desc 'Default: run unit tests.'
task :default => :test

# Alias kept so that `rake tests` also runs the suite.
desc 'RVM likes to call it tests'
task :tests => :test

# Define the :test task through RSpec when it is available; otherwise define a
# placeholder task that only reports the missing dependency.
begin
  require 'rspec'
  require 'rspec/core/rake_task'
  desc 'Run the unit tests'
  RSpec::Core::RakeTask.new(:test)
rescue LoadError
  task(:test) { STDERR.puts "You must have rspec 2.0 installed to run the tests" }
end

# Packaging tasks are only defined when a gemspec sits next to this Rakefile.
gemspec_path = File.expand_path('../html_to_plain_text.gemspec', __FILE__)
if File.exist?(gemspec_path)
  # NOTE(review): the gemspec is evaluated as Ruby code; this is only safe
  # because it is a file from this project, never external input.
  gemspec = eval(File.read(gemspec_path))

  Gem::PackageTask.new(gemspec) { |pkg| pkg.gem_spec = gemspec }
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.0.0
|
@@ -0,0 +1,131 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
# The main method on this module +plain_text+ will convert a string of HTML to a plain text approximation.
|
4
|
+
module HtmlToPlainText
  # Tags whose content is dropped from the output entirely.
  IGNORE_TAGS = %w(script style object applet iframe).inject({}) { |h, t| h[t] = true; h }.freeze
  # Tags that are separated from surrounding content by a blank line.
  PARAGRAPH_TAGS = %w(p h1 h2 h3 h4 h5 h6 table ol ul dl dd blockquote dialog figure aside section).inject({}) { |h, t| h[t] = true; h }.freeze
  # Tags that are separated from surrounding content by a single line break.
  BLOCK_TAGS = %w(div address li dt center del article header footer nav pre legend tr).inject({}) { |h, t| h[t] = true; h }.freeze
  # Characters after which leading whitespace of the next text node is dropped.
  WHITESPACE = [" ", "\n", "\r"].freeze
  PLAINTEXT = "plaintext".freeze
  PRE = "pre".freeze
  BR = "br".freeze
  HR = "hr".freeze
  TD = "td".freeze
  TH = "th".freeze
  TR = "tr".freeze
  OL = "ol".freeze
  UL = "ul".freeze
  LI = "li".freeze
  # Starting labels for ordered lists; nesting levels alternate between them.
  NUMBERS = ["1".freeze, "a".freeze].freeze
  # Matches URLs that start with a scheme (e.g. http://). Anchored with \A so
  # that only the very beginning of the href is considered, not the start of
  # any line inside it.
  ABSOLUTE_URL_PATTERN = /\A[a-z]+:\/\/[a-z0-9]/i

  # Helper instance method for converting HTML into plain text. This method simply calls HtmlToPlainText.plain_text.
  def plain_text(html)
    HtmlToPlainText.plain_text(html)
  end

  class << self
    # Convert some HTML into a plain text approximation.
    #
    # html - String of HTML markup.
    #
    # Returns the plain text String with surrounding whitespace stripped and
    # "\r" / "\r\n" line endings normalized to "\n". Returns nil when +html+ is
    # nil or empty, or when the parsed document has no <body> element.
    def plain_text(html)
      return if html.nil? || html.empty?
      body = Nokogiri::HTML::Document.parse(html).css("body").first
      return unless body
      convert_node_to_plain_text(body).strip.gsub(/\r\n?/, "\n")
    end

    private

    # Convert an HTML node to plain text. This method is called recursively with the output and
    # formatting options for special tags.
    #
    # parent  - node whose children are rendered; must respond to +name+ and +children+.
    # out     - mutable String the text is appended to. The default is a fresh
    #           unfrozen String so it can be appended to even when string
    #           literals are frozen.
    # options - Hash of formatting state (:pre, :list, :ul, :ol, :number).
    #
    # Returns +out+.
    def convert_node_to_plain_text(parent, out = "".dup, options = {})
      # Opening break for the element itself.
      if PARAGRAPH_TAGS.include?(parent.name)
        append_paragraph_breaks(out)
      elsif BLOCK_TAGS.include?(parent.name)
        append_block_breaks(out)
      end

      format_list_item(out, options) if parent.name == LI
      out << "| " if parent.name == TR

      parent.children.each do |node|
        if node.text? || node.cdata?
          text = node.text
          unless options[:pre]
            # Outside of <pre>, line breaks are plain whitespace and runs of
            # spaces collapse to one.
            text = text.gsub(/[\n\r]/, " ").squeeze(" ")
            text.lstrip! if WHITESPACE.include?(out[-1, 1])
          end
          out << text
        elsif node.name == PLAINTEXT
          # <plaintext> content is emitted verbatim.
          out << node.text
        elsif node.element? && !IGNORE_TAGS.include?(node.name)
          convert_node_to_plain_text(node, out, child_options(node, options))

          # Closing decoration or break for the child element.
          if node.name == BR
            out << "\n"
          elsif node.name == HR
            out << "\n" unless out.end_with?("\n")
            out << "-------------------------------\n"
          elsif node.name == TD || node.name == TH
            out << " | "
          elsif node.name == "a"
            href = node["href"]
            # Only absolute URLs on links with visible text are appended.
            if href && href.match(ABSOLUTE_URL_PATTERN) && node.text.match(/\S/)
              out << " (#{href}) "
            end
          elsif PARAGRAPH_TAGS.include?(node.name)
            append_paragraph_breaks(out)
          elsif BLOCK_TAGS.include?(node.name)
            append_block_breaks(out)
          end
        end
      end
      out
    end

    # Set formatting options that will be passed to child elements for a tag.
    #
    # Returns a new Hash for <ul>, <ol> and <pre>; for any other tag the same
    # +options+ object is returned, so siblings share it.
    def child_options(node, options)
      if node.name == UL
        level = options[:ul] || -1
        level += 1
        options.merge(:list => :ul, :ul => level)
      elsif node.name == OL
        level = options[:ol] || -1
        level += 1
        options.merge(:list => :ol, :ol => level, :number => NUMBERS[level % 2])
      elsif node.name == PRE
        options.merge(:pre => true)
      else
        options
      end
    end

    # Add double line breaks between paragraph elements. If line breaks already exist,
    # new ones will only be added to get to two. A single trailing space is removed first.
    def append_paragraph_breaks(out)
      out.chomp!(" ")
      if out.end_with?("\n")
        out << "\n" unless out.end_with?("\n\n")
      else
        out << "\n\n"
      end
    end

    # Add a single line break between block elements. If a line break already exists,
    # none will be added. A single trailing space is removed first.
    def append_block_breaks(out)
      out.chomp!(" ")
      out << "\n" unless out.end_with?("\n")
    end

    # Add an appropriate bullet or number to a list element.
    #
    # For ordered lists the next label is stored back into +options+, which is
    # shared by the sibling <li> elements, so each item gets the following label.
    def format_list_item(out, options)
      if options[:list] == :ul
        out << "#{'*' * (options[:ul] + 1)} "
      elsif options[:list] == :ol
        number = options[:number]
        options[:number] = number.next
        out << "#{number}. "
      end
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Behavioral specs for HtmlToPlainText.plain_text. Each example feeds a small
# HTML fragment through the converter and compares the exact plain text result.
describe HtmlToPlainText do
  it "should format paragraph tags" do
    html = "<h1>Test</h1><h2>More Test</h2><p>This is a test</p>"
    HtmlToPlainText.plain_text(html).should == "Test\n\nMore Test\n\nThis is a test"
  end

  it "should format block tags" do
    html = "<div>Test</div><div>More Test<div>This is a test</div></div>"
    HtmlToPlainText.plain_text(html).should == "Test\nMore Test\nThis is a test"
  end

  it "should format <br> tags" do
    html = "<div>Test</div><br><div>More Test<br />This is a test"
    HtmlToPlainText.plain_text(html).should == "Test\n\nMore Test\nThis is a test"
  end

  it "should format <hr> tags" do
    html = "<div>Test</div><hr><div>More Test<hr />This is a test"
    HtmlToPlainText.plain_text(html).should == "Test\n-------------------------------\nMore Test\n-------------------------------\nThis is a test"
  end

  it "should keep text formatting in <pre> tag blocks" do
    html = "<div>This \n is a \ntest</div><pre>with\n pre tags</pre>end"
    HtmlToPlainText.plain_text(html).should == "This is a test\nwith\n pre tags\nend"
  end

  it "should remove inline formatting tags" do
    html = "This is <strong>so</strong> cool. I<em> mean <em>it."
    HtmlToPlainText.plain_text(html).should == "This is so cool. I mean it."
  end

  it "should remove script, style, object, applet, and iframe tags" do
    html = "script <script>do_something</script> style <style>css</style> object <object>config</object> applet <applet>config</applet> iframe <iframe>config</iframe>"
    HtmlToPlainText.plain_text(html).should == "script style object applet iframe"
  end

  it "should handle plaintext tags" do
    html = "<div>my\nhtml</div><plaintext>my\n text"
    HtmlToPlainText.plain_text(html).should == "my html\nmy\n text"
  end

  it "should not add extraneous spaces or line breaks" do
    html = "this<p><p> is \n \n pretty bad lo<em>oking htm</em>l!"
    HtmlToPlainText.plain_text(html).should == "this\n\nis pretty bad looking html!"
  end

  it "should format bullet lists" do
    html = "List<ul><li>one</li><li>two<ul><li>a</li><li>b</li></ul></li><li>three</li></ul>"
    HtmlToPlainText.plain_text(html).should == "List\n\n* one\n* two\n\n** a\n** b\n\n* three"
  end

  it "should format numbered lists" do
    html = "List<ol><li>one</li><li>two<ol><li>a</li><li>b</li></ol></li><li>three</li></ol>"
    HtmlToPlainText.plain_text(html).should == "List\n\n1. one\n2. two\n\na. a\nb. b\n\n3. three"
  end

  it "should format a table" do
    html = "Table<table><tr><th>Col 1</th><th>Col 2</th></tr><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>"
    HtmlToPlainText.plain_text(html).should == "Table\n\n| Col 1 | Col 2 |\n| 1 | 2 |\n| 3 | 4 |"
  end

  it "should ignore inline tags without bodies" do
    html = "This is an <img src=\"/image\"> image"
    HtmlToPlainText.plain_text(html).should == "This is an image"
  end

  it "should ignore comments" do
    html = "This is <!-- html comment here --> html"
    HtmlToPlainText.plain_text(html).should == "This is html"
  end

  # The input must contain an actual HTML entity for this example to exercise
  # entity decoding.
  it "should unencode entities" do
    html = "High &amp; Low"
    HtmlToPlainText.plain_text(html).should == "High & Low"
  end

  it "should normalize the line breaks" do
    html = "<pre>These are\rreturn\r\nlines</pre>"
    HtmlToPlainText.plain_text(html).should == "These are\nreturn\nlines"
  end

  it "should include absolute link URLs" do
    html = "<a name='links'>Links</a> <a href='/test'>partial</a> <a href='http://example.com/test'>full</a> test<a href='http://example.com/test2'> <img src='test'> </a>"
    HtmlToPlainText.plain_text(html).should == "Links partial full (http://example.com/test) test"
  end

  # The input must contain an actual HTML entity for this example to exercise
  # entity decoding.
  it "should unescape entities" do
    html = "This &amp; that"
    HtmlToPlainText.plain_text(html).should == "This & that"
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require File.expand_path("../../lib/html_to_plain_text.rb", __FILE__)
|
metadata
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: html_to_plain_text
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
- 0
|
10
|
+
version: 1.0.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Brian Durand
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-06-08 00:00:00 -05:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: nokogiri
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 7
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 4
|
33
|
+
- 0
|
34
|
+
version: 1.4.0
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: rspec
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ">"
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
hash: 15
|
46
|
+
segments:
|
47
|
+
- 2
|
48
|
+
- 0
|
49
|
+
- 0
|
50
|
+
version: 2.0.0
|
51
|
+
type: :development
|
52
|
+
version_requirements: *id002
|
53
|
+
description: A simple library for converting HTML into an approximation in plain text.
|
54
|
+
email:
|
55
|
+
- bdurand@embellishedvisions.com
|
56
|
+
executables: []
|
57
|
+
|
58
|
+
extensions: []
|
59
|
+
|
60
|
+
extra_rdoc_files:
|
61
|
+
- README.rdoc
|
62
|
+
files:
|
63
|
+
- README.rdoc
|
64
|
+
- VERSION
|
65
|
+
- Rakefile
|
66
|
+
- MIT_LICENSE
|
67
|
+
- lib/html_to_plain_text.rb
|
68
|
+
- spec/html_to_plain_text_spec.rb
|
69
|
+
- spec/spec_helper.rb
|
70
|
+
has_rdoc: true
|
71
|
+
homepage: http://github.com/bdurand/html_to_plain_text
|
72
|
+
licenses: []
|
73
|
+
|
74
|
+
post_install_message:
|
75
|
+
rdoc_options:
|
76
|
+
- --charset=UTF-8
|
77
|
+
- --main
|
78
|
+
- README.rdoc
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
none: false
|
83
|
+
requirements:
|
84
|
+
- - ">="
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
hash: 3
|
87
|
+
segments:
|
88
|
+
- 0
|
89
|
+
version: "0"
|
90
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
91
|
+
none: false
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
hash: 3
|
96
|
+
segments:
|
97
|
+
- 0
|
98
|
+
version: "0"
|
99
|
+
requirements: []
|
100
|
+
|
101
|
+
rubyforge_project:
|
102
|
+
rubygems_version: 1.5.2
|
103
|
+
signing_key:
|
104
|
+
specification_version: 3
|
105
|
+
summary: A simple library for converting HTML into plain text.
|
106
|
+
test_files: []
|
107
|
+
|