RubyGems - html_to_plain_text - Versions diffs - 1.0.2 → 1.0.3 - Mend

html_to_plain_text 1.0.2 → 1.0.3

Files changed (8) hide show

checksums.yaml +7 -0
data/README.rdoc +7 -1
data/Rakefile +6 -25
data/VERSION +1 -1
data/lib/html_to_plain_text.rb +42 -28
metadata +75 -74
data/spec/html_to_plain_text_spec.rb +0 -105
data/spec/spec_helper.rb +0 -1

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 119fe11f894f031f199ea631969078fefb53f417
+  data.tar.gz: 5882b3a0913030e44765b6506311682a2dd2e9ec
+SHA512:
+  metadata.gz: 1cb91d616d5ebeb4a6a23f92005e8a4e7616d7f565cd1540b499e919c35106f9adb45083b6d2453629c3be73eb28d13455972ce21979cf98c21d95c81f4dd3eb
+  data.tar.gz: fd7aacdc78b1c2cf4ee23515a5f1ef8cf8975b9569deec96f19ca6178e6ef4fece11c79ec4a992c1452246ab6fba90fe15277b217500deecd4f63c3b67561b8b

data/README.rdoc CHANGED Viewed

@@ -1,5 +1,7 @@
 = HTML To Plain Text
+<code>gem install html_to_plain_text</code>
 A simple gem that provides code to convert HTML into a plain text alternative. Line breaks from HTML block level elements will be maintained. Lists and tables will also maintain a little bit of formatting.
 * Line breaks will be approximated using the generally established default margins for HTML tags (i.e. <p>
@@ -15,4 +17,8 @@ tag generates two line breaks, <div> generates one)
 == Usage
-HtmlToPlainText.plain_text(html)
+    require 'html_to_plain_text'
+    html = "<h1>Hello</h1><p>world!</p>"
+    HtmlToPlainText.plain_text(html)
+    => "Hello\n\nworld!"

data/Rakefile CHANGED Viewed

@@ -1,29 +1,10 @@
-require 'rubygems'
-require 'rubygems/package_task'
-require 'rake'
+require 'bundler/setup'
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
+require 'bump/tasks'
 desc 'Default: run unit tests.'
 task :default => :test
-desc 'RVM likes to call it tests'
-task :tests => :test
-begin
-  require 'rspec'
-  require 'rspec/core/rake_task'
-  desc 'Run the unit tests'
-  RSpec::Core::RakeTask.new(:test)
-rescue LoadError
-  task :test do
-    STDERR.puts "You must have rspec 2.0 installed to run the tests"
-  end
-end
-spec_file = File.expand_path('../html_to_plain_text.gemspec', __FILE__)
-if File.exist?(spec_file)
-  spec = eval(File.read(spec_file))
-  Gem::PackageTask.new(spec) do |p|
-    p.gem_spec = spec
-  end
-end
+desc 'Run the unit tests'
+RSpec::Core::RakeTask.new(:test)

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 1.0.2
1	+ 1.0.3

data/lib/html_to_plain_text.rb CHANGED Viewed

@@ -16,45 +16,56 @@ module HtmlToPlainText
   OL = "ol".freeze
   UL = "ul".freeze
   LI = "li".freeze
+  A = "a".freeze
   NUMBERS = ["1", "a"].freeze
   ABSOLUTE_URL_PATTERN = /^[a-z]+:\/\/[a-z0-9]/i.freeze
   HTML_PATTERN = /[<&]/.freeze
   TRAILING_WHITESPACE = /[ \t]+$/.freeze
+  BODY_TAG_XPATH = "/html/body".freeze
+  CARRIDGE_RETURN_PATTERN = /\r(\n?)/.freeze
+  LINE_BREAK_PATTERN = /[\n\r]/.freeze
+  NON_PROTOCOL_PATTERN = /:\/?\/?(.*)/.freeze
+  NOT_WHITESPACE_PATTERN = /\S/.freeze
+  SPACE = " ".freeze
+  EMPTY = "".freeze
+  NEWLINE = "\n".freeze
+  HREF = "href".freeze
   # Helper instance method for converting HTML into plain text. This method simply calls HtmlToPlainText.plain_text.
   def plain_text(html)
     HtmlToPlainText.plain_text(html)
   end
   class << self
     # Convert some HTML into a plain text approximation.
     def plain_text(html)
       return nil if html.nil?
-      return html.dup unless html.match(HTML_PATTERN)
-      body = Nokogiri::HTML::Document.parse(html).css("body").first
+      return html.dup unless html =~ HTML_PATTERN
+      body = Nokogiri::HTML::Document.parse(html).xpath(BODY_TAG_XPATH).first
       return unless body
-      convert_node_to_plain_text(body).strip.gsub(/\r(\n?)/, "\n")
+      convert_node_to_plain_text(body).strip.gsub(CARRIDGE_RETURN_PATTERN, NEWLINE)
     end
     private
     # Convert an HTML node to plain text. This method is called recursively with the output and
     # formatting options for special tags.
-    def convert_node_to_plain_text(parent, out = "", options = {})
+    def convert_node_to_plain_text(parent, out = '', options = {})
       if PARAGRAPH_TAGS.include?(parent.name)
         append_paragraph_breaks(out)
       elsif BLOCK_TAGS.include?(parent.name)
         append_block_breaks(out)
       end
       format_list_item(out, options) if parent.name == LI
       out << "| " if parent.name == TR
       parent.children.each do |node|
         if node.text? || node.cdata?
           text = node.text
           unless options[:pre]
-            text = node.text.gsub(/[\n\r]/, " ").squeeze(" ")
+            text = node.text.gsub(LINE_BREAK_PATTERN, SPACE).squeeze(SPACE)
             text.lstrip! if WHITESPACE.include?(out[-1, 1])
           end
           out << text
@@ -62,19 +73,22 @@ module HtmlToPlainText
           out << node.text
         elsif node.element? && !IGNORE_TAGS.include?(node.name)
           convert_node_to_plain_text(node, out, child_options(node, options))
           if node.name == BR
-            out.sub!(TRAILING_WHITESPACE, '')
-            out << "\n"
+            out.sub!(TRAILING_WHITESPACE, EMPTY)
+            out << NEWLINE
           elsif node.name == HR
-            out.sub!(TRAILING_WHITESPACE, '')
-            out << "\n" unless out.end_with?("\n")
+            out.sub!(TRAILING_WHITESPACE, EMPTY)
+            out << NEWLINE unless out.end_with?(NEWLINE)
             out << "-------------------------------\n"
           elsif node.name == TD || node.name == TH
             out << " | "
-          elsif node.name == "a"
-            href = node["href"]
-            if href && href.match(ABSOLUTE_URL_PATTERN) && node.text.match(/\S/)
+          elsif node.name == A
+            href = node[HREF]
+            if href &&
+                href =~ ABSOLUTE_URL_PATTERN &&
+                node.text =~ NOT_WHITESPACE_PATTERN &&
+                node.text != href[NON_PROTOCOL_PATTERN, 1] # use only text for <a href="mailto:a@b.com">a@b.com</a>
               out << " (#{href}) "
             end
           elsif PARAGRAPH_TAGS.include?(node.name)
@@ -86,7 +100,7 @@ module HtmlToPlainText
       end
       out
     end
     # Set formatting options that will be passed to child elements for a tag.
     def child_options(node, options)
       if node.name == UL
@@ -103,25 +117,25 @@ module HtmlToPlainText
         options
       end
     end
     # Add double line breaks between paragraph elements. If line breaks already exist,
     # new ones will only be added to get to two.
     def append_paragraph_breaks(out)
-      out.sub!(TRAILING_WHITESPACE, '')
-      if out.end_with?("\n")
-        out << "\n" unless out.end_with?("\n\n")
+      out.sub!(TRAILING_WHITESPACE, EMPTY)
+      if out.end_with?(NEWLINE)
+        out << NEWLINE unless out.end_with?("\n\n")
       else
         out << "\n\n"
       end
     end
     # Add a single line break between block elements. If a line break already exists,
     # none will be added.
     def append_block_breaks(out)
-      out.sub!(TRAILING_WHITESPACE, '')
-      out << "\n" unless out.end_with?("\n")
+      out.sub!(TRAILING_WHITESPACE, EMPTY)
+      out << NEWLINE unless out.end_with?(NEWLINE)
     end
     # Add an appropriate bullet or number to a list element.
     def format_list_item(out, options)
       if options[:list] == :ul

metadata CHANGED Viewed

@@ -1,107 +1,108 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: html_to_plain_text
-version: !ruby/object:Gem::Version
-  hash: 19
-  prerelease:
-  segments:
-  - 1
-  - 0
-  - 2
-  version: 1.0.2
+version: !ruby/object:Gem::Version
+  version: 1.0.3
 platform: ruby
-authors:
+authors:
 - Brian Durand
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-08-05 00:00:00 -05:00
-default_executable:
-dependencies:
-- !ruby/object:Gem::Dependency
+date: 2015-11-11 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
   name: nokogiri
-  prerelease: false
-  requirement: &id001 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
     - - ">="
-      - !ruby/object:Gem::Version
-        hash: 7
-        segments:
-        - 1
-        - 4
-        - 0
+      - !ruby/object:Gem::Version
         version: 1.4.0
   type: :runtime
-  version_requirements: *id001
-- !ruby/object:Gem::Dependency
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.4.0
+- !ruby/object:Gem::Dependency
   name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">"
+      - !ruby/object:Gem::Version
+        version: 2.6.0
+  type: :development
   prerelease: false
-  requirement: &id002 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
     - - ">"
-      - !ruby/object:Gem::Version
-        hash: 15
-        segments:
-        - 2
-        - 0
-        - 0
-        version: 2.0.0
+      - !ruby/object:Gem::Version
+        version: 2.6.0
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
   type: :development
-  version_requirements: *id002
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bump
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: A simple library for converting HTML into an approximation in plain text.
-email:
+email:
 - bdurand@embellishedvisions.com
 executables: []
 extensions: []
-extra_rdoc_files:
+extra_rdoc_files:
 - README.rdoc
-files:
+files:
+- MIT_LICENSE
 - README.rdoc
-- VERSION
 - Rakefile
-- MIT_LICENSE
+- VERSION
 - lib/html_to_plain_text.rb
-- spec/html_to_plain_text_spec.rb
-- spec/spec_helper.rb
-has_rdoc: true
-homepage: http://github.com/bdurand/html_to_plain_text
+homepage: https://github.com/bdurand/html_to_plain_text
 licenses: []
+metadata: {}
 post_install_message:
-rdoc_options:
-- --charset=UTF-8
-- --main
+rdoc_options:
+- "--charset=UTF-8"
+- "--main"
 - README.rdoc
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
-  requirements:
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
   - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
-  requirements:
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
   - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.5.2
+rubygems_version: 2.4.5
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: A simple library for converting HTML into plain text.
 test_files: []

data/spec/html_to_plain_text_spec.rb DELETED Viewed

@@ -1,105 +0,0 @@
-require 'spec_helper'
-describe HtmlToPlainText do
-  it "should format paragraph tags" do
-    html = "<h1>Test</h1><h2>More Test</h2>\t \t<p>\n\tThis is a test\n</p>"
-    HtmlToPlainText.plain_text(html).should == "Test\n\nMore Test\n\nThis is a test"
-  end
-  it "should format block tags" do
-    html = "<div>Test</div><div>More Test<div>\t This is a test\t </div></div>"
-    HtmlToPlainText.plain_text(html).should == "Test\nMore Test\nThis is a test"
-  end
-  it "should format <br> tags" do
-    html = "<div>Test</div><br><div>More Test \t <br />This is a test"
-    HtmlToPlainText.plain_text(html).should == "Test\n\nMore Test\nThis is a test"
-  end
-  it "should format <hr> tags" do
-    html = "<div>Test</div><hr><div>More Test \t <hr />This is a test"
-    HtmlToPlainText.plain_text(html).should == "Test\n-------------------------------\nMore Test\n-------------------------------\nThis is a test"
-  end
-  it "should keep text formatting in <pre> tag blocks" do
-    html = "<div>This \n is a \ntest</div><pre>with\n  pre tags</pre>end"
-    HtmlToPlainText.plain_text(html).should == "This is a test\nwith\n  pre tags\nend"
-  end
-  it "should remove inline formatting tags" do
-    html = "This is <strong>so</strong> cool. I<em> mean <em>it."
-    HtmlToPlainText.plain_text(html).should == "This is so cool. I mean it."
-  end
-  it "should remove script, style, object, applet, and iframe tags" do
-    html = "script <script>do_something</script> style <style>css</style> object <object>config</object> applet <applet>config</applet> iframe <iframe>config</iframe>"
-    HtmlToPlainText.plain_text(html).should == "script style object applet iframe"
-  end
-  it "should handle plaintext tags" do
-    html = "<div>my\nhtml</div><plaintext>my\n text"
-    HtmlToPlainText.plain_text(html).should == "my html\nmy\n text"
-  end
-  it "should not add extraneous spaces or line breaks" do
-    html = "this<p><p>  is   \n    \n pretty bad lo<em>oking htm</em>l!"
-    HtmlToPlainText.plain_text(html).should == "this\n\nis pretty bad looking html!"
-  end
-  it "should format bullet lists" do
-    html = "List<ul><li>one</li><li>two<ul><li>a</li><li>b</li></ul></li><li>three</li></ul>"
-    HtmlToPlainText.plain_text(html).should == "List\n\n* one\n* two\n\n** a\n** b\n\n* three"
-  end
-  it "should format numbered lists" do
-    html = "List<ol><li>one</li><li>two<ol><li>a</li><li>b</li></ol></li><li>three</li></ol>"
-    HtmlToPlainText.plain_text(html).should == "List\n\n1. one\n2. two\n\na. a\nb. b\n\n3. three"
-  end
-  it "should format a table" do
-    html = "Table<table><tr><th>Col 1</th><th>Col 2</th></tr><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>"
-    HtmlToPlainText.plain_text(html).should == "Table\n\n| Col 1 | Col 2 |\n| 1 | 2 |\n| 3 | 4 |"
-  end
-  it "should ignore inline tags without bodies" do
-    html = "This is an <img src=\"/image\"> image"
-    HtmlToPlainText.plain_text(html).should == "This is an image"
-  end
-  it "should ignore comments" do
-    html = "This is <!-- html comment here --> html"
-    HtmlToPlainText.plain_text(html).should == "This is html"
-  end
-  it "should unencode entities" do
-    html = "High &amp; Low"
-    HtmlToPlainText.plain_text(html).should == "High & Low"
-  end
-  it "should normalize the line breaks" do
-    html = "<pre>These are\rreturn\r\nlines</pre>"
-    HtmlToPlainText.plain_text(html).should == "These are\nreturn\nlines"
-  end
-  it "should include absolute link URLs" do
-    html = "<a name='links'>Links</a> <a href='/test'>partial</a> <a href='http://example.com/test'>full</a> test<a href='http://example.com/test2'> <img src='test'> </a>"
-    HtmlToPlainText.plain_text(html).should == "Links partial full (http://example.com/test) test"
-  end
-  it "should unescape entities" do
-    html = "This &amp; th&#97;t"
-    HtmlToPlainText.plain_text(html).should == "This & that"
-  end
-  it "should handle nil" do
-    HtmlToPlainText.plain_text(nil).should == nil
-  end
-  it "should handle empty text" do
-    HtmlToPlainText.plain_text("").should == ""
-  end
-  it "should handle non-html text" do
-    HtmlToPlainText.plain_text("test").should == "test"
-  end
-end

data/spec/spec_helper.rb DELETED Viewed

	@@ -1 +0,0 @@
1	- require File.expand_path("../../lib/html_to_plain_text.rb", __FILE__)