RubyGems - html_to_plain_text - Versions diffs - 1.0.2 → 1.0.3 - Mend

html_to_plain_text 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +7 -0
data/README.rdoc +7 -1
data/Rakefile +6 -25
data/VERSION +1 -1
data/lib/html_to_plain_text.rb +42 -28
metadata +75 -74
data/spec/html_to_plain_text_spec.rb +0 -105
data/spec/spec_helper.rb +0 -1

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 119fe11f894f031f199ea631969078fefb53f417
+  data.tar.gz: 5882b3a0913030e44765b6506311682a2dd2e9ec
+SHA512:
+  metadata.gz: 1cb91d616d5ebeb4a6a23f92005e8a4e7616d7f565cd1540b499e919c35106f9adb45083b6d2453629c3be73eb28d13455972ce21979cf98c21d95c81f4dd3eb
+  data.tar.gz: fd7aacdc78b1c2cf4ee23515a5f1ef8cf8975b9569deec96f19ca6178e6ef4fece11c79ec4a992c1452246ab6fba90fe15277b217500deecd4f63c3b67561b8b

data/README.rdoc CHANGED Viewed

@@ -1,5 +1,7 @@
 = HTML To Plain Text
+<code>gem install html_to_plain_text</code>
 A simple gem that provides code to convert HTML into a plain text alternative. Line breaks from HTML block level elements will be maintained. Lists and tables will also maintain a little bit of formatting.
 * Line breaks will be approximated using the generally established default margins for HTML tags (i.e. <p>
@@ -15,4 +17,8 @@ tag generates two line breaks, <div> generates one)
 == Usage
-HtmlToPlainText.plain_text(html)
+    require 'html_to_plain_text'
+    html = "<h1>Hello</h1><p>world!</p>"
+    HtmlToPlainText.plain_text(html)
+    => "Hello\n\nworld!"

data/Rakefile CHANGED Viewed

@@ -1,29 +1,10 @@
-require 'rubygems'
-require 'rubygems/package_task'
-require 'rake'
+require 'bundler/setup'
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
+require 'bump/tasks'
 desc 'Default: run unit tests.'
 task :default => :test
-desc 'RVM likes to call it tests'
-task :tests => :test
-begin
-  require 'rspec'
-  require 'rspec/core/rake_task'
-  desc 'Run the unit tests'
-  RSpec::Core::RakeTask.new(:test)
-rescue LoadError
-  task :test do
-    STDERR.puts "You must have rspec 2.0 installed to run the tests"
-  end
-end
-spec_file = File.expand_path('../html_to_plain_text.gemspec', __FILE__)
-if File.exist?(spec_file)
-  spec = eval(File.read(spec_file))
-  Gem::PackageTask.new(spec) do |p|
-    p.gem_spec = spec
-  end
-end
+desc 'Run the unit tests'
+RSpec::Core::RakeTask.new(:test)

data/VERSION CHANGED Viewed

@@ -1 +1 @@
-1.0.2
+1.0.3

data/lib/html_to_plain_text.rb CHANGED Viewed

@@ -16,45 +16,56 @@ module HtmlToPlainText
   OL = "ol".freeze
   UL = "ul".freeze
   LI = "li".freeze
+  A = "a".freeze
   NUMBERS = ["1", "a"].freeze
   ABSOLUTE_URL_PATTERN = /^[a-z]+:\/\/[a-z0-9]/i.freeze
   HTML_PATTERN = /[<&]/.freeze
   TRAILING_WHITESPACE = /[ \t]+$/.freeze
+  BODY_TAG_XPATH = "/html/body".freeze
+  CARRIDGE_RETURN_PATTERN = /\r(\n?)/.freeze
+  LINE_BREAK_PATTERN = /[\n\r]/.freeze
+  NON_PROTOCOL_PATTERN = /:\/?\/?(.*)/.freeze
+  NOT_WHITESPACE_PATTERN = /\S/.freeze
+  SPACE = " ".freeze
+  EMPTY = "".freeze
+  NEWLINE = "\n".freeze
+  HREF = "href".freeze
   # Helper instance method for converting HTML into plain text. This method simply calls HtmlToPlainText.plain_text.
   def plain_text(html)
     HtmlToPlainText.plain_text(html)
   end
   class << self
     # Convert some HTML into a plain text approximation.
     def plain_text(html)
       return nil if html.nil?
-      return html.dup unless html.match(HTML_PATTERN)
-      body = Nokogiri::HTML::Document.parse(html).css("body").first
+      return html.dup unless html =~ HTML_PATTERN
+      body = Nokogiri::HTML::Document.parse(html).xpath(BODY_TAG_XPATH).first
       return unless body
-      convert_node_to_plain_text(body).strip.gsub(/\r(\n?)/, "\n")
+      convert_node_to_plain_text(body).strip.gsub(CARRIDGE_RETURN_PATTERN, NEWLINE)
     end
     private
     # Convert an HTML node to plain text. This method is called recursively with the output and
     # formatting options for special tags.
-    def convert_node_to_plain_text(parent, out = "", options = {})
+    def convert_node_to_plain_text(parent, out = '', options = {})
       if PARAGRAPH_TAGS.include?(parent.name)
         append_paragraph_breaks(out)
       elsif BLOCK_TAGS.include?(parent.name)
         append_block_breaks(out)
       end
       format_list_item(out, options) if parent.name == LI
       out << "| " if parent.name == TR
       parent.children.each do |node|
         if node.text? || node.cdata?
           text = node.text
           unless options[:pre]
-            text = node.text.gsub(/[\n\r]/, " ").squeeze(" ")
+            text = node.text.gsub(LINE_BREAK_PATTERN, SPACE).squeeze(SPACE)
             text.lstrip! if WHITESPACE.include?(out[-1, 1])
           end
           out << text
@@ -62,19 +73,22 @@ module HtmlToPlainText
           out << node.text
         elsif node.element? && !IGNORE_TAGS.include?(node.name)
           convert_node_to_plain_text(node, out, child_options(node, options))
           if node.name == BR
-            out.sub!(TRAILING_WHITESPACE, '')
-            out << "\n"
+            out.sub!(TRAILING_WHITESPACE, EMPTY)
+            out << NEWLINE
           elsif node.name == HR
-            out.sub!(TRAILING_WHITESPACE, '')
-            out << "\n" unless out.end_with?("\n")
+            out.sub!(TRAILING_WHITESPACE, EMPTY)
+            out << NEWLINE unless out.end_with?(NEWLINE)
             out << "-------------------------------\n"
           elsif node.name == TD || node.name == TH
             out << " | "
-          elsif node.name == "a"
-            href = node["href"]
-            if href && href.match(ABSOLUTE_URL_PATTERN) && node.text.match(/\S/)
+          elsif node.name == A
+            href = node[HREF]
+            if href &&
+                href =~ ABSOLUTE_URL_PATTERN &&
+                node.text =~ NOT_WHITESPACE_PATTERN &&
+                node.text != href[NON_PROTOCOL_PATTERN, 1] # use only text for <a href="mailto:a@b.com">a@b.com</a>
               out << " (#{href}) "
             end
           elsif PARAGRAPH_TAGS.include?(node.name)
@@ -86,7 +100,7 @@ module HtmlToPlainText
       end
       out
     end
     # Set formatting options that will be passed to child elements for a tag.
     def child_options(node, options)
       if node.name == UL
@@ -103,25 +117,25 @@ module HtmlToPlainText
         options
       end
     end
     # Add double line breaks between paragraph elements. If line breaks already exist,
     # new ones will only be added to get to two.
     def append_paragraph_breaks(out)
-      out.sub!(TRAILING_WHITESPACE, '')
-      if out.end_with?("\n")
-        out << "\n" unless out.end_with?("\n\n")
+      out.sub!(TRAILING_WHITESPACE, EMPTY)
+      if out.end_with?(NEWLINE)
+        out << NEWLINE unless out.end_with?("\n\n")
       else
         out << "\n\n"
       end
     end
     # Add a single line break between block elements. If a line break already exists,
     # none will be added.
     def append_block_breaks(out)
-      out.sub!(TRAILING_WHITESPACE, '')
-      out << "\n" unless out.end_with?("\n")
+      out.sub!(TRAILING_WHITESPACE, EMPTY)
+      out << NEWLINE unless out.end_with?(NEWLINE)
     end
     # Add an appropriate bullet or number to a list element.
     def format_list_item(out, options)
       if options[:list] == :ul

metadata CHANGED Viewed

@@ -1,107 +1,108 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: html_to_plain_text
-version: !ruby/object:Gem::Version
-  hash: 19
-  prerelease:
-  segments:
-  - 1
-  - 0
-  - 2
-  version: 1.0.2
+version: !ruby/object:Gem::Version
+  version: 1.0.3
 platform: ruby
-authors:
+authors:
 - Brian Durand
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-08-05 00:00:00 -05:00
-default_executable:
-dependencies:
-- !ruby/object:Gem::Dependency
+date: 2015-11-11 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
   name: nokogiri
-  prerelease: false
-  requirement: &id001 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
     - - ">="
-      - !ruby/object:Gem::Version
-        hash: 7
-        segments:
-        - 1
-        - 4
-        - 0
+      - !ruby/object:Gem::Version
         version: 1.4.0
   type: :runtime
-  version_requirements: *id001
-- !ruby/object:Gem::Dependency
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.4.0
+- !ruby/object:Gem::Dependency
   name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">"
+      - !ruby/object:Gem::Version
+        version: 2.6.0
+  type: :development
   prerelease: false
-  requirement: &id002 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
     - - ">"
-      - !ruby/object:Gem::Version
-        hash: 15
-        segments:
-        - 2
-        - 0
-        - 0
-        version: 2.0.0
+      - !ruby/object:Gem::Version
+        version: 2.6.0
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
   type: :development
-  version_requirements: *id002
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bump
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: A simple library for converting HTML into an approximation in plain text.
-email:
+email:
 - bdurand@embellishedvisions.com
 executables: []
 extensions: []
-extra_rdoc_files:
+extra_rdoc_files:
 - README.rdoc
-files:
+files:
+- MIT_LICENSE
 - README.rdoc
-- VERSION
 - Rakefile
-- MIT_LICENSE
+- VERSION
 - lib/html_to_plain_text.rb
-- spec/html_to_plain_text_spec.rb
-- spec/spec_helper.rb
-has_rdoc: true
-homepage: http://github.com/bdurand/html_to_plain_text
+homepage: https://github.com/bdurand/html_to_plain_text
 licenses: []
+metadata: {}
 post_install_message:
-rdoc_options:
-- --charset=UTF-8
-- --main
+rdoc_options:
+- "--charset=UTF-8"
+- "--main"
 - README.rdoc
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
-  requirements:
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
   - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
-  requirements:
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
   - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.5.2
+rubygems_version: 2.4.5
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: A simple library for converting HTML into plain text.
 test_files: []

data/spec/html_to_plain_text_spec.rb DELETED Viewed

@@ -1,105 +0,0 @@
-require 'spec_helper'
-describe HtmlToPlainText do
-  it "should format paragraph tags" do
-    html = "<h1>Test</h1><h2>More Test</h2>\t \t<p>\n\tThis is a test\n</p>"
-    HtmlToPlainText.plain_text(html).should == "Test\n\nMore Test\n\nThis is a test"
-  end
-  it "should format block tags" do
-    html = "<div>Test</div><div>More Test<div>\t This is a test\t </div></div>"
-    HtmlToPlainText.plain_text(html).should == "Test\nMore Test\nThis is a test"
-  end
-  it "should format <br> tags" do
-    html = "<div>Test</div><br><div>More Test \t <br />This is a test"
-    HtmlToPlainText.plain_text(html).should == "Test\n\nMore Test\nThis is a test"
-  end
-  it "should format <hr> tags" do
-    html = "<div>Test</div><hr><div>More Test \t <hr />This is a test"
-    HtmlToPlainText.plain_text(html).should == "Test\n-------------------------------\nMore Test\n-------------------------------\nThis is a test"
-  end
-  it "should keep text formatting in <pre> tag blocks" do
-    html = "<div>This \n is a \ntest</div><pre>with\n  pre tags</pre>end"
-    HtmlToPlainText.plain_text(html).should == "This is a test\nwith\n  pre tags\nend"
-  end
-  it "should remove inline formatting tags" do
-    html = "This is <strong>so</strong> cool. I<em> mean <em>it."
-    HtmlToPlainText.plain_text(html).should == "This is so cool. I mean it."
-  end
-  it "should remove script, style, object, applet, and iframe tags" do
-    html = "script <script>do_something</script> style <style>css</style> object <object>config</object> applet <applet>config</applet> iframe <iframe>config</iframe>"
-    HtmlToPlainText.plain_text(html).should == "script style object applet iframe"
-  end
-  it "should handle plaintext tags" do
-    html = "<div>my\nhtml</div><plaintext>my\n text"
-    HtmlToPlainText.plain_text(html).should == "my html\nmy\n text"
-  end
-  it "should not add extraneous spaces or line breaks" do
-    html = "this<p><p>  is   \n    \n pretty bad lo<em>oking htm</em>l!"
-    HtmlToPlainText.plain_text(html).should == "this\n\nis pretty bad looking html!"
-  end
-  it "should format bullet lists" do
-    html = "List<ul><li>one</li><li>two<ul><li>a</li><li>b</li></ul></li><li>three</li></ul>"
-    HtmlToPlainText.plain_text(html).should == "List\n\n* one\n* two\n\n** a\n** b\n\n* three"
-  end
-  it "should format numbered lists" do
-    html = "List<ol><li>one</li><li>two<ol><li>a</li><li>b</li></ol></li><li>three</li></ol>"
-    HtmlToPlainText.plain_text(html).should == "List\n\n1. one\n2. two\n\na. a\nb. b\n\n3. three"
-  end
-  it "should format a table" do
-    html = "Table<table><tr><th>Col 1</th><th>Col 2</th></tr><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>"
-    HtmlToPlainText.plain_text(html).should == "Table\n\n| Col 1 | Col 2 |\n| 1 | 2 |\n| 3 | 4 |"
-  end
-  it "should ignore inline tags without bodies" do
-    html = "This is an <img src=\"/image\"> image"
-    HtmlToPlainText.plain_text(html).should == "This is an image"
-  end
-  it "should ignore comments" do
-    html = "This is <!-- html comment here --> html"
-    HtmlToPlainText.plain_text(html).should == "This is html"
-  end
-  it "should unencode entities" do
-    html = "High &amp; Low"
-    HtmlToPlainText.plain_text(html).should == "High & Low"
-  end
-  it "should normalize the line breaks" do
-    html = "<pre>These are\rreturn\r\nlines</pre>"
-    HtmlToPlainText.plain_text(html).should == "These are\nreturn\nlines"
-  end
-  it "should include absolute link URLs" do
-    html = "<a name='links'>Links</a> <a href='/test'>partial</a> <a href='http://example.com/test'>full</a> test<a href='http://example.com/test2'> <img src='test'> </a>"
-    HtmlToPlainText.plain_text(html).should == "Links partial full (http://example.com/test) test"
-  end
-  it "should unescape entities" do
-    html = "This &amp; th&#97;t"
-    HtmlToPlainText.plain_text(html).should == "This & that"
-  end
-  it "should handle nil" do
-    HtmlToPlainText.plain_text(nil).should == nil
-  end
-  it "should handle empty text" do
-    HtmlToPlainText.plain_text("").should == ""
-  end
-  it "should handle non-html text" do
-    HtmlToPlainText.plain_text("test").should == "test"
-  end
-end

data/spec/spec_helper.rb DELETED Viewed

@@ -1 +0,0 @@
-require File.expand_path("../../lib/html_to_plain_text.rb", __FILE__)