html_to_plain_text 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT_LICENSE +20 -0
- data/README.rdoc +18 -0
- data/Rakefile +29 -0
- data/VERSION +1 -0
- data/lib/html_to_plain_text.rb +131 -0
- data/spec/html_to_plain_text_spec.rb +93 -0
- data/spec/spec_helper.rb +1 -0
- metadata +107 -0
data/MIT_LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011 Brian Durand
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
= HTML To Plain Text
|
2
|
+
|
3
|
+
A simple gem that provides code to convert HTML into a plain text alternative. Line breaks from HTML block level elements will be maintained. Lists and tables will also maintain a little bit of formatting.
|
4
|
+
|
5
|
+
* Line breaks will be approximated using the generally established default margins for HTML tags (i.e. <p>
|
6
|
+
tag generates two line breaks, <div> generates one)
|
7
|
+
* List items will be numbered or bulleted with an asterisk
|
8
|
+
* <br> tags will add line breaks
|
9
|
+
* <hr> tags will add a string of hyphens to serve as a horizontal rule
|
10
|
+
* <table> elements will be enclosed in "|" delimiters
|
11
|
+
* <a> tags will have the href URL appended to the text in parentheses
|
12
|
+
* Formatting tags like <strong> or <b> will be stripped
|
13
|
+
* Formatting inside <pre> or <plaintext> elements will be honored
|
14
|
+
* Code-like tags like <script> or <style> will be stripped
|
15
|
+
|
16
|
+
== Usage
|
17
|
+
|
18
|
+
HtmlToPlainText.plain_text(html)
|
data/Rakefile
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rubygems/package_task'
|
3
|
+
require 'rake'
|
4
|
+
|
5
|
+
desc 'Default: run unit tests.'
task :default => :test

desc 'RVM likes to call it tests'
task :tests => :test

# Define the :test task through RSpec when it can be loaded. If RSpec is not
# installed, define a stub :test task that only prints a hint, so that the
# :default and :tests tasks above still resolve.
begin
  require 'rspec'
  require 'rspec/core/rake_task'
  desc 'Run the unit tests'
  RSpec::Core::RakeTask.new(:test)
rescue LoadError
  task :test do
    STDERR.puts "You must have rspec 2.0 installed to run the tests"
  end
end

# Packaging tasks are only defined when a gemspec sits next to this Rakefile.
spec_file = File.expand_path('../html_to_plain_text.gemspec', __FILE__)
if File.exist?(spec_file)
  # Gem::Specification.load is used instead of a raw eval of the file contents:
  # it returns nil (rather than raising) when the gemspec cannot be loaded.
  spec = Gem::Specification.load(spec_file)

  if spec
    # The block is required: Gem::PackageTask only defines its tasks when
    # it is constructed with a block.
    Gem::PackageTask.new(spec) do |p|
      p.gem_spec = spec
    end
  end
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.0.0
|
@@ -0,0 +1,131 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
# Converts a string of HTML into a plain text approximation.
#
# The main method is +plain_text+. It is available as a module method
# (HtmlToPlainText.plain_text) and, when this module is included, as an
# instance method that delegates to the module method.
module HtmlToPlainText
  # Tags whose content is dropped entirely. Stored as a hash for fast lookup.
  IGNORE_TAGS = %w(script style object applet iframe).inject({}){|h, t| h[t] = true; h}.freeze
  # Tags that are separated from surrounding text by a blank line.
  PARAGRAPH_TAGS = %w(p h1 h2 h3 h4 h5 h6 table ol ul dl dd blockquote dialog figure aside section).inject({}){|h, t| h[t] = true; h}.freeze
  # Tags that are separated from surrounding text by a single line break.
  BLOCK_TAGS = %w(div address li dt center del article header footer nav pre legend tr).inject({}){|h, t| h[t] = true; h}.freeze
  # Characters that make leading whitespace on the next text node redundant.
  WHITESPACE = [" ", "\n", "\r"].freeze
  PLAINTEXT = "plaintext".freeze
  PRE = "pre".freeze
  BR = "br".freeze
  HR = "hr".freeze
  TD = "td".freeze
  TH = "th".freeze
  TR = "tr".freeze
  OL = "ol".freeze
  UL = "ul".freeze
  LI = "li".freeze
  # First label of an ordered list; alternates by nesting depth (1, a, 1, a, ...).
  NUMBERS = ["1", "a"].freeze
  # Matches hrefs that start with a scheme and "//". Anchored with \A (not ^)
  # so that only the very beginning of the value is considered; ^ would also
  # match at the start of any later line inside a multi-line attribute value.
  ABSOLUTE_URL_PATTERN = /\A[a-z]+:\/\/[a-z0-9]/i

  # Helper instance method for converting HTML into plain text. This method
  # simply calls HtmlToPlainText.plain_text.
  def plain_text(html)
    HtmlToPlainText.plain_text(html)
  end

  class << self
    # Convert some HTML into a plain text approximation.
    #
    # html:: a string of HTML.
    #
    # Returns the plain text with surrounding whitespace stripped and all
    # "\r" / "\r\n" sequences normalized to "\n". Returns nil when +html+ is
    # nil or empty, or when the parsed document has no body element.
    def plain_text(html)
      return if html.nil? || html.empty?
      body = Nokogiri::HTML::Document.parse(html).css("body").first
      return unless body
      convert_node_to_plain_text(body).strip.gsub(/\r(\n?)/, "\n")
    end

    private

    # Convert an HTML node to plain text, appending to +out+. This method is
    # called recursively for child elements.
    #
    # parent::  the node whose children are rendered.
    # out::     the output buffer; it is mutated in place and also returned.
    # options:: formatting state inherited from ancestor tags (see child_options).
    def convert_node_to_plain_text(parent, out = "".dup, options = {})
      # The default buffer is an explicit copy so that it stays mutable even
      # when string literals are frozen.
      if PARAGRAPH_TAGS.include?(parent.name)
        append_paragraph_breaks(out)
      elsif BLOCK_TAGS.include?(parent.name)
        append_block_breaks(out)
      end

      format_list_item(out, options) if parent.name == LI
      out << "| " if parent.name == TR

      parent.children.each do |node|
        if node.text? || node.cdata?
          append_text(node, out, options)
        elsif node.name == PLAINTEXT
          out << node.text
        elsif node.element? && !IGNORE_TAGS.include?(node.name)
          convert_node_to_plain_text(node, out, child_options(node, options))
          append_element_suffix(node, out)
        end
      end
      out
    end

    # Append the text of a text or CDATA node. Outside of <pre> blocks, runs of
    # whitespace (spaces, tabs, line breaks, form feeds) are collapsed to a
    # single space, and leading whitespace is dropped when the output already
    # ends in whitespace.
    def append_text(node, out, options)
      text = node.text
      unless options[:pre]
        text = text.gsub(/[\t\n\r\f]/, " ").squeeze(" ")
        text.lstrip! if WHITESPACE.include?(out[-1, 1])
      end
      out << text
    end

    # Append whatever follows an element once its children have been rendered:
    # a line break for <br>, a rule for <hr>, a cell delimiter for <td>/<th>,
    # the URL for absolute links, or the closing paragraph/block breaks.
    def append_element_suffix(node, out)
      name = node.name
      if name == BR
        out << "\n"
      elsif name == HR
        out << "\n" unless out.end_with?("\n")
        out << "-------------------------------\n"
      elsif name == TD || name == TH
        out << " | "
      elsif name == "a"
        href = node["href"]
        # Only absolute URLs are shown, and only for links with visible text.
        if href && href =~ ABSOLUTE_URL_PATTERN && node.text =~ /\S/
          out << " (#{href}) "
        end
      elsif PARAGRAPH_TAGS.include?(name)
        append_paragraph_breaks(out)
      elsif BLOCK_TAGS.include?(name)
        append_block_breaks(out)
      end
    end

    # Set formatting options that will be passed to child elements for a tag.
    #
    # For <ul>, <ol> and <pre> a new hash is returned (the inherited one is not
    # modified). For every other tag the very same +options+ object is
    # returned; format_list_item relies on this to share the running item
    # number between sibling <li> elements.
    def child_options(node, options)
      if node.name == UL
        level = options[:ul] || -1
        level += 1
        options.merge(:list => :ul, :ul => level)
      elsif node.name == OL
        level = options[:ol] || -1
        level += 1
        options.merge(:list => :ol, :ol => level, :number => NUMBERS[level % 2])
      elsif node.name == PRE
        options.merge(:pre => true)
      else
        options
      end
    end

    # Add double line breaks between paragraph elements. If line breaks already
    # exist, new ones will only be added to get to two. A single trailing
    # space is removed first.
    def append_paragraph_breaks(out)
      out.chomp!(" ")
      if out.end_with?("\n")
        out << "\n" unless out.end_with?("\n\n")
      else
        out << "\n\n"
      end
    end

    # Add a single line break between block elements. If a line break already
    # exists, none will be added. A single trailing space is removed first.
    def append_block_breaks(out)
      out.chomp!(" ")
      out << "\n" unless out.end_with?("\n")
    end

    # Add an appropriate bullet or number to a list element.
    #
    # Unordered items get one asterisk per nesting level. Ordered items get the
    # current label followed by a period; the label stored in +options+ is then
    # advanced in place so that the next sibling item receives the next label.
    def format_list_item(out, options)
      if options[:list] == :ul
        out << "#{'*' * (options[:ul] + 1)} "
      elsif options[:list] == :ol
        number = options[:number]
        options[:number] = number.next
        out << "#{number}. "
      end
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Behavioural examples for HtmlToPlainText.plain_text, one per supported kind of markup.
describe HtmlToPlainText do
  it "should format paragraph tags" do
    html = "<h1>Test</h1><h2>More Test</h2><p>This is a test</p>"
    HtmlToPlainText.plain_text(html).should == "Test\n\nMore Test\n\nThis is a test"
  end

  it "should format block tags" do
    html = "<div>Test</div><div>More Test<div>This is a test</div></div>"
    HtmlToPlainText.plain_text(html).should == "Test\nMore Test\nThis is a test"
  end

  it "should format <br> tags" do
    html = "<div>Test</div><br><div>More Test<br />This is a test"
    HtmlToPlainText.plain_text(html).should == "Test\n\nMore Test\nThis is a test"
  end

  it "should format <hr> tags" do
    html = "<div>Test</div><hr><div>More Test<hr />This is a test"
    HtmlToPlainText.plain_text(html).should == "Test\n-------------------------------\nMore Test\n-------------------------------\nThis is a test"
  end

  it "should keep text formatting in <pre> tag blocks" do
    html = "<div>This \n is a \ntest</div><pre>with\n pre tags</pre>end"
    HtmlToPlainText.plain_text(html).should == "This is a test\nwith\n pre tags\nend"
  end

  it "should remove inline formatting tags" do
    html = "This is <strong>so</strong> cool. I<em> mean <em>it."
    HtmlToPlainText.plain_text(html).should == "This is so cool. I mean it."
  end

  it "should remove script, style, object, applet, and iframe tags" do
    html = "script <script>do_something</script> style <style>css</style> object <object>config</object> applet <applet>config</applet> iframe <iframe>config</iframe>"
    HtmlToPlainText.plain_text(html).should == "script style object applet iframe"
  end

  it "should handle plaintext tags" do
    html = "<div>my\nhtml</div><plaintext>my\n text"
    HtmlToPlainText.plain_text(html).should == "my html\nmy\n text"
  end

  it "should not add extraneous spaces or line breaks" do
    html = "this<p><p> is \n \n pretty bad lo<em>oking htm</em>l!"
    HtmlToPlainText.plain_text(html).should == "this\n\nis pretty bad looking html!"
  end

  it "should format bullet lists" do
    html = "List<ul><li>one</li><li>two<ul><li>a</li><li>b</li></ul></li><li>three</li></ul>"
    HtmlToPlainText.plain_text(html).should == "List\n\n* one\n* two\n\n** a\n** b\n\n* three"
  end

  it "should format numbered lists" do
    html = "List<ol><li>one</li><li>two<ol><li>a</li><li>b</li></ol></li><li>three</li></ol>"
    HtmlToPlainText.plain_text(html).should == "List\n\n1. one\n2. two\n\na. a\nb. b\n\n3. three"
  end

  it "should format a table" do
    html = "Table<table><tr><th>Col 1</th><th>Col 2</th></tr><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>"
    HtmlToPlainText.plain_text(html).should == "Table\n\n| Col 1 | Col 2 |\n| 1 | 2 |\n| 3 | 4 |"
  end

  it "should ignore inline tags without bodies" do
    html = "This is an <img src=\"/image\"> image"
    HtmlToPlainText.plain_text(html).should == "This is an image"
  end

  it "should ignore comments" do
    html = "This is <!-- html comment here --> html"
    HtmlToPlainText.plain_text(html).should == "This is html"
  end

  # The input must contain an actual character entity, otherwise nothing is
  # being decoded and the example passes trivially.
  it "should unencode entities" do
    html = "High &amp; Low"
    HtmlToPlainText.plain_text(html).should == "High & Low"
  end

  it "should normalize the line breaks" do
    html = "<pre>These are\rreturn\r\nlines</pre>"
    HtmlToPlainText.plain_text(html).should == "These are\nreturn\nlines"
  end

  it "should include absolute link URLs" do
    html = "<a name='links'>Links</a> <a href='/test'>partial</a> <a href='http://example.com/test'>full</a> test<a href='http://example.com/test2'> <img src='test'> </a>"
    HtmlToPlainText.plain_text(html).should == "Links partial full (http://example.com/test) test"
  end

  # As above: the entity has to be present in the input for this to test anything.
  it "should unescape entities" do
    html = "This &amp; that"
    HtmlToPlainText.plain_text(html).should == "This & that"
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require File.expand_path("../../lib/html_to_plain_text.rb", __FILE__)
|
metadata
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: html_to_plain_text
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
- 0
|
10
|
+
version: 1.0.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Brian Durand
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-06-08 00:00:00 -05:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: nokogiri
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 7
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 4
|
33
|
+
- 0
|
34
|
+
version: 1.4.0
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: rspec
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ">"
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
hash: 15
|
46
|
+
segments:
|
47
|
+
- 2
|
48
|
+
- 0
|
49
|
+
- 0
|
50
|
+
version: 2.0.0
|
51
|
+
type: :development
|
52
|
+
version_requirements: *id002
|
53
|
+
description: A simple library for converting HTML into an approximation in plain text.
|
54
|
+
email:
|
55
|
+
- bdurand@embellishedvisions.com
|
56
|
+
executables: []
|
57
|
+
|
58
|
+
extensions: []
|
59
|
+
|
60
|
+
extra_rdoc_files:
|
61
|
+
- README.rdoc
|
62
|
+
files:
|
63
|
+
- README.rdoc
|
64
|
+
- VERSION
|
65
|
+
- Rakefile
|
66
|
+
- MIT_LICENSE
|
67
|
+
- lib/html_to_plain_text.rb
|
68
|
+
- spec/html_to_plain_text_spec.rb
|
69
|
+
- spec/spec_helper.rb
|
70
|
+
has_rdoc: true
|
71
|
+
homepage: http://github.com/bdurand/html_to_plain_text
|
72
|
+
licenses: []
|
73
|
+
|
74
|
+
post_install_message:
|
75
|
+
rdoc_options:
|
76
|
+
- --charset=UTF-8
|
77
|
+
- --main
|
78
|
+
- README.rdoc
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
none: false
|
83
|
+
requirements:
|
84
|
+
- - ">="
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
hash: 3
|
87
|
+
segments:
|
88
|
+
- 0
|
89
|
+
version: "0"
|
90
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
91
|
+
none: false
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
hash: 3
|
96
|
+
segments:
|
97
|
+
- 0
|
98
|
+
version: "0"
|
99
|
+
requirements: []
|
100
|
+
|
101
|
+
rubyforge_project:
|
102
|
+
rubygems_version: 1.5.2
|
103
|
+
signing_key:
|
104
|
+
specification_version: 3
|
105
|
+
summary: A simple library for converting HTML into plain text.
|
106
|
+
test_files: []
|
107
|
+
|