html_to_plain_text 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 119fe11f894f031f199ea631969078fefb53f417
4
+ data.tar.gz: 5882b3a0913030e44765b6506311682a2dd2e9ec
5
+ SHA512:
6
+ metadata.gz: 1cb91d616d5ebeb4a6a23f92005e8a4e7616d7f565cd1540b499e919c35106f9adb45083b6d2453629c3be73eb28d13455972ce21979cf98c21d95c81f4dd3eb
7
+ data.tar.gz: fd7aacdc78b1c2cf4ee23515a5f1ef8cf8975b9569deec96f19ca6178e6ef4fece11c79ec4a992c1452246ab6fba90fe15277b217500deecd4f63c3b67561b8b
data/README.rdoc CHANGED
@@ -1,5 +1,7 @@
1
1
  = HTML To Plain Text
2
2
 
3
+ <code>gem install html_to_plain_text</code>
4
+
3
5
  A simple gem that provides code to convert HTML into a plain text alternative. Line breaks from HTML block level elements will be maintained. Lists and tables will also maintain a little bit of formatting.
4
6
 
5
7
  * Line breaks will be approximated using the generally established default margins for HTML tags (i.e. <p>
@@ -15,4 +17,8 @@ tag generates two line breaks, <div> generates one)
15
17
 
16
18
  == Usage
17
19
 
18
- HtmlToPlainText.plain_text(html)
20
+ require 'html_to_plain_text'
21
+ html = "<h1>Hello</h1><p>world!</p>"
22
+ HtmlToPlainText.plain_text(html)
23
+ => "Hello\n\nworld!"
24
+
data/Rakefile CHANGED
@@ -1,29 +1,10 @@
1
- require 'rubygems'
2
- require 'rubygems/package_task'
3
- require 'rake'
1
+ require 'bundler/setup'
2
+ require 'bundler/gem_tasks'
3
+ require 'rspec/core/rake_task'
4
+ require 'bump/tasks'
4
5
 
5
6
  desc 'Default: run unit tests.'
6
7
  task :default => :test
7
8
 
8
- desc 'RVM likes to call it tests'
9
- task :tests => :test
10
-
11
- begin
12
- require 'rspec'
13
- require 'rspec/core/rake_task'
14
- desc 'Run the unit tests'
15
- RSpec::Core::RakeTask.new(:test)
16
- rescue LoadError
17
- task :test do
18
- STDERR.puts "You must have rspec 2.0 installed to run the tests"
19
- end
20
- end
21
-
22
- spec_file = File.expand_path('../html_to_plain_text.gemspec', __FILE__)
23
- if File.exist?(spec_file)
24
- spec = eval(File.read(spec_file))
25
-
26
- Gem::PackageTask.new(spec) do |p|
27
- p.gem_spec = spec
28
- end
29
- end
9
+ desc 'Run the unit tests'
10
+ RSpec::Core::RakeTask.new(:test)
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.2
1
+ 1.0.3
@@ -16,45 +16,56 @@ module HtmlToPlainText
16
16
  OL = "ol".freeze
17
17
  UL = "ul".freeze
18
18
  LI = "li".freeze
19
+ A = "a".freeze
19
20
  NUMBERS = ["1", "a"].freeze
20
21
  ABSOLUTE_URL_PATTERN = /^[a-z]+:\/\/[a-z0-9]/i.freeze
21
22
  HTML_PATTERN = /[<&]/.freeze
22
23
  TRAILING_WHITESPACE = /[ \t]+$/.freeze
23
-
24
+ BODY_TAG_XPATH = "/html/body".freeze
25
+ CARRIDGE_RETURN_PATTERN = /\r(\n?)/.freeze
26
+ LINE_BREAK_PATTERN = /[\n\r]/.freeze
27
+ NON_PROTOCOL_PATTERN = /:\/?\/?(.*)/.freeze
28
+ NOT_WHITESPACE_PATTERN = /\S/.freeze
29
+ SPACE = " ".freeze
30
+ EMPTY = "".freeze
31
+ NEWLINE = "\n".freeze
32
+ HREF = "href".freeze
33
+
24
34
  # Helper instance method for converting HTML into plain text. This method simply calls HtmlToPlainText.plain_text.
25
35
  def plain_text(html)
26
36
  HtmlToPlainText.plain_text(html)
27
37
  end
28
-
38
+
29
39
  class << self
30
40
  # Convert some HTML into a plain text approximation.
41
+
31
42
  def plain_text(html)
32
43
  return nil if html.nil?
33
- return html.dup unless html.match(HTML_PATTERN)
34
- body = Nokogiri::HTML::Document.parse(html).css("body").first
44
+ return html.dup unless html =~ HTML_PATTERN
45
+ body = Nokogiri::HTML::Document.parse(html).xpath(BODY_TAG_XPATH).first
35
46
  return unless body
36
- convert_node_to_plain_text(body).strip.gsub(/\r(\n?)/, "\n")
47
+ convert_node_to_plain_text(body).strip.gsub(CARRIDGE_RETURN_PATTERN, NEWLINE)
37
48
  end
38
-
49
+
39
50
  private
40
-
51
+
41
52
  # Convert an HTML node to plain text. This method is called recursively with the output and
42
53
  # formatting options for special tags.
43
- def convert_node_to_plain_text(parent, out = "", options = {})
54
+ def convert_node_to_plain_text(parent, out = '', options = {})
44
55
  if PARAGRAPH_TAGS.include?(parent.name)
45
56
  append_paragraph_breaks(out)
46
57
  elsif BLOCK_TAGS.include?(parent.name)
47
58
  append_block_breaks(out)
48
59
  end
49
-
60
+
50
61
  format_list_item(out, options) if parent.name == LI
51
62
  out << "| " if parent.name == TR
52
-
63
+
53
64
  parent.children.each do |node|
54
65
  if node.text? || node.cdata?
55
66
  text = node.text
56
67
  unless options[:pre]
57
- text = node.text.gsub(/[\n\r]/, " ").squeeze(" ")
68
+ text = node.text.gsub(LINE_BREAK_PATTERN, SPACE).squeeze(SPACE)
58
69
  text.lstrip! if WHITESPACE.include?(out[-1, 1])
59
70
  end
60
71
  out << text
@@ -62,19 +73,22 @@ module HtmlToPlainText
62
73
  out << node.text
63
74
  elsif node.element? && !IGNORE_TAGS.include?(node.name)
64
75
  convert_node_to_plain_text(node, out, child_options(node, options))
65
-
76
+
66
77
  if node.name == BR
67
- out.sub!(TRAILING_WHITESPACE, '')
68
- out << "\n"
78
+ out.sub!(TRAILING_WHITESPACE, EMPTY)
79
+ out << NEWLINE
69
80
  elsif node.name == HR
70
- out.sub!(TRAILING_WHITESPACE, '')
71
- out << "\n" unless out.end_with?("\n")
81
+ out.sub!(TRAILING_WHITESPACE, EMPTY)
82
+ out << NEWLINE unless out.end_with?(NEWLINE)
72
83
  out << "-------------------------------\n"
73
84
  elsif node.name == TD || node.name == TH
74
85
  out << " | "
75
- elsif node.name == "a"
76
- href = node["href"]
77
- if href && href.match(ABSOLUTE_URL_PATTERN) && node.text.match(/\S/)
86
+ elsif node.name == A
87
+ href = node[HREF]
88
+ if href &&
89
+ href =~ ABSOLUTE_URL_PATTERN &&
90
+ node.text =~ NOT_WHITESPACE_PATTERN &&
91
+ node.text != href[NON_PROTOCOL_PATTERN, 1] # use only text for <a href="mailto:a@b.com">a@b.com</a>
78
92
  out << " (#{href}) "
79
93
  end
80
94
  elsif PARAGRAPH_TAGS.include?(node.name)
@@ -86,7 +100,7 @@ module HtmlToPlainText
86
100
  end
87
101
  out
88
102
  end
89
-
103
+
90
104
  # Set formatting options that will be passed to child elements for a tag.
91
105
  def child_options(node, options)
92
106
  if node.name == UL
@@ -103,25 +117,25 @@ module HtmlToPlainText
103
117
  options
104
118
  end
105
119
  end
106
-
120
+
107
121
  # Add double line breaks between paragraph elements. If line breaks already exist,
108
122
  # new ones will only be added to get to two.
109
123
  def append_paragraph_breaks(out)
110
- out.sub!(TRAILING_WHITESPACE, '')
111
- if out.end_with?("\n")
112
- out << "\n" unless out.end_with?("\n\n")
124
+ out.sub!(TRAILING_WHITESPACE, EMPTY)
125
+ if out.end_with?(NEWLINE)
126
+ out << NEWLINE unless out.end_with?("\n\n")
113
127
  else
114
128
  out << "\n\n"
115
129
  end
116
130
  end
117
-
131
+
118
132
  # Add a single line break between block elements. If a line break already exists,
119
133
  # none will be added.
120
134
  def append_block_breaks(out)
121
- out.sub!(TRAILING_WHITESPACE, '')
122
- out << "\n" unless out.end_with?("\n")
135
+ out.sub!(TRAILING_WHITESPACE, EMPTY)
136
+ out << NEWLINE unless out.end_with?(NEWLINE)
123
137
  end
124
-
138
+
125
139
  # Add an appropriate bullet or number to a list element.
126
140
  def format_list_item(out, options)
127
141
  if options[:list] == :ul
metadata CHANGED
@@ -1,107 +1,108 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: html_to_plain_text
3
- version: !ruby/object:Gem::Version
4
- hash: 19
5
- prerelease:
6
- segments:
7
- - 1
8
- - 0
9
- - 2
10
- version: 1.0.2
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.3
11
5
  platform: ruby
12
- authors:
6
+ authors:
13
7
  - Brian Durand
14
8
  autorequire:
15
9
  bindir: bin
16
10
  cert_chain: []
17
-
18
- date: 2011-08-05 00:00:00 -05:00
19
- default_executable:
20
- dependencies:
21
- - !ruby/object:Gem::Dependency
11
+ date: 2015-11-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
22
14
  name: nokogiri
23
- prerelease: false
24
- requirement: &id001 !ruby/object:Gem::Requirement
25
- none: false
26
- requirements:
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
27
17
  - - ">="
28
- - !ruby/object:Gem::Version
29
- hash: 7
30
- segments:
31
- - 1
32
- - 4
33
- - 0
18
+ - !ruby/object:Gem::Version
34
19
  version: 1.4.0
35
20
  type: :runtime
36
- version_requirements: *id001
37
- - !ruby/object:Gem::Dependency
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 1.4.0
27
+ - !ruby/object:Gem::Dependency
38
28
  name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">"
32
+ - !ruby/object:Gem::Version
33
+ version: 2.6.0
34
+ type: :development
39
35
  prerelease: false
40
- requirement: &id002 !ruby/object:Gem::Requirement
41
- none: false
42
- requirements:
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
43
38
  - - ">"
44
- - !ruby/object:Gem::Version
45
- hash: 15
46
- segments:
47
- - 2
48
- - 0
49
- - 0
50
- version: 2.0.0
39
+ - !ruby/object:Gem::Version
40
+ version: 2.6.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
51
48
  type: :development
52
- version_requirements: *id002
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: bump
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
53
69
  description: A simple library for converting HTML into an approximation in plain text.
54
- email:
70
+ email:
55
71
  - bdurand@embellishedvisions.com
56
72
  executables: []
57
-
58
73
  extensions: []
59
-
60
- extra_rdoc_files:
74
+ extra_rdoc_files:
61
75
  - README.rdoc
62
- files:
76
+ files:
77
+ - MIT_LICENSE
63
78
  - README.rdoc
64
- - VERSION
65
79
  - Rakefile
66
- - MIT_LICENSE
80
+ - VERSION
67
81
  - lib/html_to_plain_text.rb
68
- - spec/html_to_plain_text_spec.rb
69
- - spec/spec_helper.rb
70
- has_rdoc: true
71
- homepage: http://github.com/bdurand/html_to_plain_text
82
+ homepage: https://github.com/bdurand/html_to_plain_text
72
83
  licenses: []
73
-
84
+ metadata: {}
74
85
  post_install_message:
75
- rdoc_options:
76
- - --charset=UTF-8
77
- - --main
86
+ rdoc_options:
87
+ - "--charset=UTF-8"
88
+ - "--main"
78
89
  - README.rdoc
79
- require_paths:
90
+ require_paths:
80
91
  - lib
81
- required_ruby_version: !ruby/object:Gem::Requirement
82
- none: false
83
- requirements:
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ requirements:
84
94
  - - ">="
85
- - !ruby/object:Gem::Version
86
- hash: 3
87
- segments:
88
- - 0
89
- version: "0"
90
- required_rubygems_version: !ruby/object:Gem::Requirement
91
- none: false
92
- requirements:
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ required_rubygems_version: !ruby/object:Gem::Requirement
98
+ requirements:
93
99
  - - ">="
94
- - !ruby/object:Gem::Version
95
- hash: 3
96
- segments:
97
- - 0
98
- version: "0"
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
99
102
  requirements: []
100
-
101
103
  rubyforge_project:
102
- rubygems_version: 1.5.2
104
+ rubygems_version: 2.4.5
103
105
  signing_key:
104
- specification_version: 3
106
+ specification_version: 4
105
107
  summary: A simple library for converting HTML into plain text.
106
108
  test_files: []
107
-
@@ -1,105 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe HtmlToPlainText do
4
- it "should format paragraph tags" do
5
- html = "<h1>Test</h1><h2>More Test</h2>\t \t<p>\n\tThis is a test\n</p>"
6
- HtmlToPlainText.plain_text(html).should == "Test\n\nMore Test\n\nThis is a test"
7
- end
8
-
9
- it "should format block tags" do
10
- html = "<div>Test</div><div>More Test<div>\t This is a test\t </div></div>"
11
- HtmlToPlainText.plain_text(html).should == "Test\nMore Test\nThis is a test"
12
- end
13
-
14
- it "should format <br> tags" do
15
- html = "<div>Test</div><br><div>More Test \t <br />This is a test"
16
- HtmlToPlainText.plain_text(html).should == "Test\n\nMore Test\nThis is a test"
17
- end
18
-
19
- it "should format <hr> tags" do
20
- html = "<div>Test</div><hr><div>More Test \t <hr />This is a test"
21
- HtmlToPlainText.plain_text(html).should == "Test\n-------------------------------\nMore Test\n-------------------------------\nThis is a test"
22
- end
23
-
24
- it "should keep text formatting in <pre> tag blocks" do
25
- html = "<div>This \n is a \ntest</div><pre>with\n pre tags</pre>end"
26
- HtmlToPlainText.plain_text(html).should == "This is a test\nwith\n pre tags\nend"
27
- end
28
-
29
- it "should remove inline formatting tags" do
30
- html = "This is <strong>so</strong> cool. I<em> mean <em>it."
31
- HtmlToPlainText.plain_text(html).should == "This is so cool. I mean it."
32
- end
33
-
34
- it "should remove script, style, object, applet, and iframe tags" do
35
- html = "script <script>do_something</script> style <style>css</style> object <object>config</object> applet <applet>config</applet> iframe <iframe>config</iframe>"
36
- HtmlToPlainText.plain_text(html).should == "script style object applet iframe"
37
- end
38
-
39
- it "should handle plaintext tags" do
40
- html = "<div>my\nhtml</div><plaintext>my\n text"
41
- HtmlToPlainText.plain_text(html).should == "my html\nmy\n text"
42
- end
43
-
44
- it "should not add extraneous spaces or line breaks" do
45
- html = "this<p><p> is \n \n pretty bad lo<em>oking htm</em>l!"
46
- HtmlToPlainText.plain_text(html).should == "this\n\nis pretty bad looking html!"
47
- end
48
-
49
- it "should format bullet lists" do
50
- html = "List<ul><li>one</li><li>two<ul><li>a</li><li>b</li></ul></li><li>three</li></ul>"
51
- HtmlToPlainText.plain_text(html).should == "List\n\n* one\n* two\n\n** a\n** b\n\n* three"
52
- end
53
-
54
- it "should format numbered lists" do
55
- html = "List<ol><li>one</li><li>two<ol><li>a</li><li>b</li></ol></li><li>three</li></ol>"
56
- HtmlToPlainText.plain_text(html).should == "List\n\n1. one\n2. two\n\na. a\nb. b\n\n3. three"
57
- end
58
-
59
- it "should format a table" do
60
- html = "Table<table><tr><th>Col 1</th><th>Col 2</th></tr><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>"
61
- HtmlToPlainText.plain_text(html).should == "Table\n\n| Col 1 | Col 2 |\n| 1 | 2 |\n| 3 | 4 |"
62
- end
63
-
64
- it "should ignore inline tags without bodies" do
65
- html = "This is an <img src=\"/image\"> image"
66
- HtmlToPlainText.plain_text(html).should == "This is an image"
67
- end
68
-
69
- it "should ignore comments" do
70
- html = "This is <!-- html comment here --> html"
71
- HtmlToPlainText.plain_text(html).should == "This is html"
72
- end
73
-
74
- it "should unencode entities" do
75
- html = "High &amp; Low"
76
- HtmlToPlainText.plain_text(html).should == "High & Low"
77
- end
78
-
79
- it "should normalize the line breaks" do
80
- html = "<pre>These are\rreturn\r\nlines</pre>"
81
- HtmlToPlainText.plain_text(html).should == "These are\nreturn\nlines"
82
- end
83
-
84
- it "should include absolute link URLs" do
85
- html = "<a name='links'>Links</a> <a href='/test'>partial</a> <a href='http://example.com/test'>full</a> test<a href='http://example.com/test2'> <img src='test'> </a>"
86
- HtmlToPlainText.plain_text(html).should == "Links partial full (http://example.com/test) test"
87
- end
88
-
89
- it "should unescape entities" do
90
- html = "This &amp; th&#97;t"
91
- HtmlToPlainText.plain_text(html).should == "This & that"
92
- end
93
-
94
- it "should handle nil" do
95
- HtmlToPlainText.plain_text(nil).should == nil
96
- end
97
-
98
- it "should handle empty text" do
99
- HtmlToPlainText.plain_text("").should == ""
100
- end
101
-
102
- it "should handle non-html text" do
103
- HtmlToPlainText.plain_text("test").should == "test"
104
- end
105
- end
data/spec/spec_helper.rb DELETED
@@ -1 +0,0 @@
1
- require File.expand_path("../../lib/html_to_plain_text.rb", __FILE__)