html_to_plain_text 1.0.2 → 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 119fe11f894f031f199ea631969078fefb53f417
4
+ data.tar.gz: 5882b3a0913030e44765b6506311682a2dd2e9ec
5
+ SHA512:
6
+ metadata.gz: 1cb91d616d5ebeb4a6a23f92005e8a4e7616d7f565cd1540b499e919c35106f9adb45083b6d2453629c3be73eb28d13455972ce21979cf98c21d95c81f4dd3eb
7
+ data.tar.gz: fd7aacdc78b1c2cf4ee23515a5f1ef8cf8975b9569deec96f19ca6178e6ef4fece11c79ec4a992c1452246ab6fba90fe15277b217500deecd4f63c3b67561b8b
data/README.rdoc CHANGED
@@ -1,5 +1,7 @@
1
1
  = HTML To Plain Text
2
2
 
3
+ <code>gem install html_to_plain_text</code>
4
+
3
5
  A simple gem that provides code to convert HTML into a plain text alternative. Line breaks from HTML block level elements will be maintained. Lists and tables will also maintain a little bit of formatting.
4
6
 
5
7
  * Line breaks will be approximated using the generally established default margins for HTML tags (i.e. <p>
@@ -15,4 +17,8 @@ tag generates two line breaks, <div> generates one)
15
17
 
16
18
  == Usage
17
19
 
18
- HtmlToPlainText.plain_text(html)
20
+ require 'html_to_plain_text'
21
+ html = "<h1>Hello</h1><p>world!</p>"
22
+ HtmlToPlainText.plain_text(html)
23
+ => "Hello\n\nworld!"
24
+
data/Rakefile CHANGED
@@ -1,29 +1,10 @@
1
- require 'rubygems'
2
- require 'rubygems/package_task'
3
- require 'rake'
1
+ require 'bundler/setup'
2
+ require 'bundler/gem_tasks'
3
+ require 'rspec/core/rake_task'
4
+ require 'bump/tasks'
4
5
 
5
6
  desc 'Default: run unit tests.'
6
7
  task :default => :test
7
8
 
8
- desc 'RVM likes to call it tests'
9
- task :tests => :test
10
-
11
- begin
12
- require 'rspec'
13
- require 'rspec/core/rake_task'
14
- desc 'Run the unit tests'
15
- RSpec::Core::RakeTask.new(:test)
16
- rescue LoadError
17
- task :test do
18
- STDERR.puts "You must have rspec 2.0 installed to run the tests"
19
- end
20
- end
21
-
22
- spec_file = File.expand_path('../html_to_plain_text.gemspec', __FILE__)
23
- if File.exist?(spec_file)
24
- spec = eval(File.read(spec_file))
25
-
26
- Gem::PackageTask.new(spec) do |p|
27
- p.gem_spec = spec
28
- end
29
- end
9
+ desc 'Run the unit tests'
10
+ RSpec::Core::RakeTask.new(:test)
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.2
1
+ 1.0.3
@@ -16,45 +16,56 @@ module HtmlToPlainText
16
16
  OL = "ol".freeze
17
17
  UL = "ul".freeze
18
18
  LI = "li".freeze
19
+ A = "a".freeze
19
20
  NUMBERS = ["1", "a"].freeze
20
21
  ABSOLUTE_URL_PATTERN = /^[a-z]+:\/\/[a-z0-9]/i.freeze
21
22
  HTML_PATTERN = /[<&]/.freeze
22
23
  TRAILING_WHITESPACE = /[ \t]+$/.freeze
23
-
24
+ BODY_TAG_XPATH = "/html/body".freeze
25
+ CARRIDGE_RETURN_PATTERN = /\r(\n?)/.freeze
26
+ LINE_BREAK_PATTERN = /[\n\r]/.freeze
27
+ NON_PROTOCOL_PATTERN = /:\/?\/?(.*)/.freeze
28
+ NOT_WHITESPACE_PATTERN = /\S/.freeze
29
+ SPACE = " ".freeze
30
+ EMPTY = "".freeze
31
+ NEWLINE = "\n".freeze
32
+ HREF = "href".freeze
33
+
24
34
  # Helper instance method for converting HTML into plain text. This method simply calls HtmlToPlainText.plain_text.
25
35
  def plain_text(html)
26
36
  HtmlToPlainText.plain_text(html)
27
37
  end
28
-
38
+
29
39
  class << self
30
40
  # Convert some HTML into a plain text approximation.
41
+
31
42
  def plain_text(html)
32
43
  return nil if html.nil?
33
- return html.dup unless html.match(HTML_PATTERN)
34
- body = Nokogiri::HTML::Document.parse(html).css("body").first
44
+ return html.dup unless html =~ HTML_PATTERN
45
+ body = Nokogiri::HTML::Document.parse(html).xpath(BODY_TAG_XPATH).first
35
46
  return unless body
36
- convert_node_to_plain_text(body).strip.gsub(/\r(\n?)/, "\n")
47
+ convert_node_to_plain_text(body).strip.gsub(CARRIDGE_RETURN_PATTERN, NEWLINE)
37
48
  end
38
-
49
+
39
50
  private
40
-
51
+
41
52
  # Convert an HTML node to plain text. This method is called recursively with the output and
42
53
  # formatting options for special tags.
43
- def convert_node_to_plain_text(parent, out = "", options = {})
54
+ def convert_node_to_plain_text(parent, out = '', options = {})
44
55
  if PARAGRAPH_TAGS.include?(parent.name)
45
56
  append_paragraph_breaks(out)
46
57
  elsif BLOCK_TAGS.include?(parent.name)
47
58
  append_block_breaks(out)
48
59
  end
49
-
60
+
50
61
  format_list_item(out, options) if parent.name == LI
51
62
  out << "| " if parent.name == TR
52
-
63
+
53
64
  parent.children.each do |node|
54
65
  if node.text? || node.cdata?
55
66
  text = node.text
56
67
  unless options[:pre]
57
- text = node.text.gsub(/[\n\r]/, " ").squeeze(" ")
68
+ text = node.text.gsub(LINE_BREAK_PATTERN, SPACE).squeeze(SPACE)
58
69
  text.lstrip! if WHITESPACE.include?(out[-1, 1])
59
70
  end
60
71
  out << text
@@ -62,19 +73,22 @@ module HtmlToPlainText
62
73
  out << node.text
63
74
  elsif node.element? && !IGNORE_TAGS.include?(node.name)
64
75
  convert_node_to_plain_text(node, out, child_options(node, options))
65
-
76
+
66
77
  if node.name == BR
67
- out.sub!(TRAILING_WHITESPACE, '')
68
- out << "\n"
78
+ out.sub!(TRAILING_WHITESPACE, EMPTY)
79
+ out << NEWLINE
69
80
  elsif node.name == HR
70
- out.sub!(TRAILING_WHITESPACE, '')
71
- out << "\n" unless out.end_with?("\n")
81
+ out.sub!(TRAILING_WHITESPACE, EMPTY)
82
+ out << NEWLINE unless out.end_with?(NEWLINE)
72
83
  out << "-------------------------------\n"
73
84
  elsif node.name == TD || node.name == TH
74
85
  out << " | "
75
- elsif node.name == "a"
76
- href = node["href"]
77
- if href && href.match(ABSOLUTE_URL_PATTERN) && node.text.match(/\S/)
86
+ elsif node.name == A
87
+ href = node[HREF]
88
+ if href &&
89
+ href =~ ABSOLUTE_URL_PATTERN &&
90
+ node.text =~ NOT_WHITESPACE_PATTERN &&
91
+ node.text != href[NON_PROTOCOL_PATTERN, 1] # use only text for <a href="mailto:a@b.com">a@b.com</a>
78
92
  out << " (#{href}) "
79
93
  end
80
94
  elsif PARAGRAPH_TAGS.include?(node.name)
@@ -86,7 +100,7 @@ module HtmlToPlainText
86
100
  end
87
101
  out
88
102
  end
89
-
103
+
90
104
  # Set formatting options that will be passed to child elements for a tag.
91
105
  def child_options(node, options)
92
106
  if node.name == UL
@@ -103,25 +117,25 @@ module HtmlToPlainText
103
117
  options
104
118
  end
105
119
  end
106
-
120
+
107
121
  # Add double line breaks between paragraph elements. If line breaks already exist,
108
122
  # new ones will only be added to get to two.
109
123
  def append_paragraph_breaks(out)
110
- out.sub!(TRAILING_WHITESPACE, '')
111
- if out.end_with?("\n")
112
- out << "\n" unless out.end_with?("\n\n")
124
+ out.sub!(TRAILING_WHITESPACE, EMPTY)
125
+ if out.end_with?(NEWLINE)
126
+ out << NEWLINE unless out.end_with?("\n\n")
113
127
  else
114
128
  out << "\n\n"
115
129
  end
116
130
  end
117
-
131
+
118
132
  # Add a single line break between block elements. If a line break already exists,
119
133
  # none will be added.
120
134
  def append_block_breaks(out)
121
- out.sub!(TRAILING_WHITESPACE, '')
122
- out << "\n" unless out.end_with?("\n")
135
+ out.sub!(TRAILING_WHITESPACE, EMPTY)
136
+ out << NEWLINE unless out.end_with?(NEWLINE)
123
137
  end
124
-
138
+
125
139
  # Add an appropriate bullet or number to a list element.
126
140
  def format_list_item(out, options)
127
141
  if options[:list] == :ul
metadata CHANGED
@@ -1,107 +1,108 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: html_to_plain_text
3
- version: !ruby/object:Gem::Version
4
- hash: 19
5
- prerelease:
6
- segments:
7
- - 1
8
- - 0
9
- - 2
10
- version: 1.0.2
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.3
11
5
  platform: ruby
12
- authors:
6
+ authors:
13
7
  - Brian Durand
14
8
  autorequire:
15
9
  bindir: bin
16
10
  cert_chain: []
17
-
18
- date: 2011-08-05 00:00:00 -05:00
19
- default_executable:
20
- dependencies:
21
- - !ruby/object:Gem::Dependency
11
+ date: 2015-11-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
22
14
  name: nokogiri
23
- prerelease: false
24
- requirement: &id001 !ruby/object:Gem::Requirement
25
- none: false
26
- requirements:
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
27
17
  - - ">="
28
- - !ruby/object:Gem::Version
29
- hash: 7
30
- segments:
31
- - 1
32
- - 4
33
- - 0
18
+ - !ruby/object:Gem::Version
34
19
  version: 1.4.0
35
20
  type: :runtime
36
- version_requirements: *id001
37
- - !ruby/object:Gem::Dependency
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 1.4.0
27
+ - !ruby/object:Gem::Dependency
38
28
  name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">"
32
+ - !ruby/object:Gem::Version
33
+ version: 2.6.0
34
+ type: :development
39
35
  prerelease: false
40
- requirement: &id002 !ruby/object:Gem::Requirement
41
- none: false
42
- requirements:
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
43
38
  - - ">"
44
- - !ruby/object:Gem::Version
45
- hash: 15
46
- segments:
47
- - 2
48
- - 0
49
- - 0
50
- version: 2.0.0
39
+ - !ruby/object:Gem::Version
40
+ version: 2.6.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
51
48
  type: :development
52
- version_requirements: *id002
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: bump
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
53
69
  description: A simple library for converting HTML into an approximation in plain text.
54
- email:
70
+ email:
55
71
  - bdurand@embellishedvisions.com
56
72
  executables: []
57
-
58
73
  extensions: []
59
-
60
- extra_rdoc_files:
74
+ extra_rdoc_files:
61
75
  - README.rdoc
62
- files:
76
+ files:
77
+ - MIT_LICENSE
63
78
  - README.rdoc
64
- - VERSION
65
79
  - Rakefile
66
- - MIT_LICENSE
80
+ - VERSION
67
81
  - lib/html_to_plain_text.rb
68
- - spec/html_to_plain_text_spec.rb
69
- - spec/spec_helper.rb
70
- has_rdoc: true
71
- homepage: http://github.com/bdurand/html_to_plain_text
82
+ homepage: https://github.com/bdurand/html_to_plain_text
72
83
  licenses: []
73
-
84
+ metadata: {}
74
85
  post_install_message:
75
- rdoc_options:
76
- - --charset=UTF-8
77
- - --main
86
+ rdoc_options:
87
+ - "--charset=UTF-8"
88
+ - "--main"
78
89
  - README.rdoc
79
- require_paths:
90
+ require_paths:
80
91
  - lib
81
- required_ruby_version: !ruby/object:Gem::Requirement
82
- none: false
83
- requirements:
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ requirements:
84
94
  - - ">="
85
- - !ruby/object:Gem::Version
86
- hash: 3
87
- segments:
88
- - 0
89
- version: "0"
90
- required_rubygems_version: !ruby/object:Gem::Requirement
91
- none: false
92
- requirements:
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ required_rubygems_version: !ruby/object:Gem::Requirement
98
+ requirements:
93
99
  - - ">="
94
- - !ruby/object:Gem::Version
95
- hash: 3
96
- segments:
97
- - 0
98
- version: "0"
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
99
102
  requirements: []
100
-
101
103
  rubyforge_project:
102
- rubygems_version: 1.5.2
104
+ rubygems_version: 2.4.5
103
105
  signing_key:
104
- specification_version: 3
106
+ specification_version: 4
105
107
  summary: A simple library for converting HTML into plain text.
106
108
  test_files: []
107
-
@@ -1,105 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe HtmlToPlainText do
4
- it "should format paragraph tags" do
5
- html = "<h1>Test</h1><h2>More Test</h2>\t \t<p>\n\tThis is a test\n</p>"
6
- HtmlToPlainText.plain_text(html).should == "Test\n\nMore Test\n\nThis is a test"
7
- end
8
-
9
- it "should format block tags" do
10
- html = "<div>Test</div><div>More Test<div>\t This is a test\t </div></div>"
11
- HtmlToPlainText.plain_text(html).should == "Test\nMore Test\nThis is a test"
12
- end
13
-
14
- it "should format <br> tags" do
15
- html = "<div>Test</div><br><div>More Test \t <br />This is a test"
16
- HtmlToPlainText.plain_text(html).should == "Test\n\nMore Test\nThis is a test"
17
- end
18
-
19
- it "should format <hr> tags" do
20
- html = "<div>Test</div><hr><div>More Test \t <hr />This is a test"
21
- HtmlToPlainText.plain_text(html).should == "Test\n-------------------------------\nMore Test\n-------------------------------\nThis is a test"
22
- end
23
-
24
- it "should keep text formatting in <pre> tag blocks" do
25
- html = "<div>This \n is a \ntest</div><pre>with\n pre tags</pre>end"
26
- HtmlToPlainText.plain_text(html).should == "This is a test\nwith\n pre tags\nend"
27
- end
28
-
29
- it "should remove inline formatting tags" do
30
- html = "This is <strong>so</strong> cool. I<em> mean <em>it."
31
- HtmlToPlainText.plain_text(html).should == "This is so cool. I mean it."
32
- end
33
-
34
- it "should remove script, style, object, applet, and iframe tags" do
35
- html = "script <script>do_something</script> style <style>css</style> object <object>config</object> applet <applet>config</applet> iframe <iframe>config</iframe>"
36
- HtmlToPlainText.plain_text(html).should == "script style object applet iframe"
37
- end
38
-
39
- it "should handle plaintext tags" do
40
- html = "<div>my\nhtml</div><plaintext>my\n text"
41
- HtmlToPlainText.plain_text(html).should == "my html\nmy\n text"
42
- end
43
-
44
- it "should not add extraneous spaces or line breaks" do
45
- html = "this<p><p> is \n \n pretty bad lo<em>oking htm</em>l!"
46
- HtmlToPlainText.plain_text(html).should == "this\n\nis pretty bad looking html!"
47
- end
48
-
49
- it "should format bullet lists" do
50
- html = "List<ul><li>one</li><li>two<ul><li>a</li><li>b</li></ul></li><li>three</li></ul>"
51
- HtmlToPlainText.plain_text(html).should == "List\n\n* one\n* two\n\n** a\n** b\n\n* three"
52
- end
53
-
54
- it "should format numbered lists" do
55
- html = "List<ol><li>one</li><li>two<ol><li>a</li><li>b</li></ol></li><li>three</li></ol>"
56
- HtmlToPlainText.plain_text(html).should == "List\n\n1. one\n2. two\n\na. a\nb. b\n\n3. three"
57
- end
58
-
59
- it "should format a table" do
60
- html = "Table<table><tr><th>Col 1</th><th>Col 2</th></tr><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>"
61
- HtmlToPlainText.plain_text(html).should == "Table\n\n| Col 1 | Col 2 |\n| 1 | 2 |\n| 3 | 4 |"
62
- end
63
-
64
- it "should ignore inline tags without bodies" do
65
- html = "This is an <img src=\"/image\"> image"
66
- HtmlToPlainText.plain_text(html).should == "This is an image"
67
- end
68
-
69
- it "should ignore comments" do
70
- html = "This is <!-- html comment here --> html"
71
- HtmlToPlainText.plain_text(html).should == "This is html"
72
- end
73
-
74
- it "should unencode entities" do
75
- html = "High &amp; Low"
76
- HtmlToPlainText.plain_text(html).should == "High & Low"
77
- end
78
-
79
- it "should normalize the line breaks" do
80
- html = "<pre>These are\rreturn\r\nlines</pre>"
81
- HtmlToPlainText.plain_text(html).should == "These are\nreturn\nlines"
82
- end
83
-
84
- it "should include absolute link URLs" do
85
- html = "<a name='links'>Links</a> <a href='/test'>partial</a> <a href='http://example.com/test'>full</a> test<a href='http://example.com/test2'> <img src='test'> </a>"
86
- HtmlToPlainText.plain_text(html).should == "Links partial full (http://example.com/test) test"
87
- end
88
-
89
- it "should unescape entities" do
90
- html = "This &amp; th&#97;t"
91
- HtmlToPlainText.plain_text(html).should == "This & that"
92
- end
93
-
94
- it "should handle nil" do
95
- HtmlToPlainText.plain_text(nil).should == nil
96
- end
97
-
98
- it "should handle empty text" do
99
- HtmlToPlainText.plain_text("").should == ""
100
- end
101
-
102
- it "should handle non-html text" do
103
- HtmlToPlainText.plain_text("test").should == "test"
104
- end
105
- end
data/spec/spec_helper.rb DELETED
@@ -1 +0,0 @@
1
- require File.expand_path("../../lib/html_to_plain_text.rb", __FILE__)