RubyGems - html_to_plain_text - Versions diffs - 1.0.0 - Mend

html_to_plain_text 1.0.0

Files changed (8) hide show

data/MIT_LICENSE +20 -0
data/README.rdoc +18 -0
data/Rakefile +29 -0
data/VERSION +1 -0
data/lib/html_to_plain_text.rb +131 -0
data/spec/html_to_plain_text_spec.rb +93 -0
data/spec/spec_helper.rb +1 -0
metadata +107 -0

data/MIT_LICENSE ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2011 Brian Durand
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.rdoc ADDED Viewed

@@ -0,0 +1,18 @@
+= HTML To Plain Text
+A simple gem that provides code to convert HTML into a plain text alternative. Line breaks from HTML block level elements will be maintained. Lists and tables will also maintain a little bit of formatting.
+* Line breaks will be approximated using the generally established default margins for HTML tags (e.g. a <p> tag generates two line breaks, a <div> generates one)
+* List items will be numbered or bulleted with an asterisk
+* <br> tags will add line breaks
+* <hr> tags will add a string of hyphens to serve as a horizontal rule
+* <table> elements will be enclosed in "|" delimiters
+* <a> tags will have the href URL appended to the text in parentheses
+* Formatting tags like <strong> or <b> will be stripped
+* Formatting inside <pre> or <plaintext> elements will be honored
+* Code-like tags like <script> or <style> will be stripped
+== Usage
+HtmlToPlainText.plain_text(html)

data/Rakefile ADDED Viewed

@@ -0,0 +1,29 @@
+require 'rubygems'
+require 'rubygems/package_task'
+require 'rake'
+desc 'Default: run unit tests.'
+task :default => :test
+desc 'RVM likes to call it tests'
+task :tests => :test
+begin
+  require 'rspec'
+  require 'rspec/core/rake_task'
+  desc 'Run the unit tests'
+  RSpec::Core::RakeTask.new(:test)
+rescue LoadError
+  task :test do
+    STDERR.puts "You must have rspec 2.0 installed to run the tests"
+  end
+end
+spec_file = File.expand_path('../html_to_plain_text.gemspec', __FILE__)
+if File.exist?(spec_file)
+  spec = eval(File.read(spec_file))
+  Gem::PackageTask.new(spec) do |p|
+    p.gem_spec = spec
+  end
+end

data/VERSION ADDED Viewed

@@ -0,0 +1 @@
+1.0.0

data/lib/html_to_plain_text.rb ADDED Viewed

@@ -0,0 +1,131 @@
+require 'nokogiri'
+# The main method on this module, +plain_text+, will convert a string of HTML to a plain text approximation.
+module HtmlToPlainText
+  IGNORE_TAGS = %w(script style object applet iframe).inject({}){|h, t| h[t] = true; h}.freeze
+  PARAGRAPH_TAGS = %w(p h1 h2 h3 h4 h5 h6 table ol ul dl dd blockquote dialog figure aside section).inject({}){|h, t| h[t] = true; h}.freeze
+  BLOCK_TAGS = %w(div address li dt center del article header header footer nav pre legend tr).inject({}){|h, t| h[t] = true; h}.freeze
+  WHITESPACE = [" ", "\n", "\r"].freeze
+  PLAINTEXT = "plaintext".freeze
+  PRE = "pre".freeze
+  BR = "br".freeze
+  HR = "hr".freeze
+  TD = "td".freeze
+  TH = "th".freeze
+  TR = "tr".freeze
+  OL = "ol".freeze
+  UL = "ul".freeze
+  LI = "li".freeze
+  NUMBERS = ["1", "a"]
+  ABSOLUTE_URL_PATTERN = /^[a-z]+:\/\/[a-z0-9]/i
+  # Helper instance method for converting HTML into plain text. This method simply calls HtmlToPlainText.plain_text.
+  def plain_text(html)
+    HtmlToPlainText.plain_text(html)
+  end
+  class << self
+    # Convert some HTML into a plain text approximation.
+    def plain_text(html)
+      return if html.nil? || html.empty?
+      body = Nokogiri::HTML::Document.parse(html).css("body").first
+      return unless body
+      convert_node_to_plain_text(body).strip.gsub(/\r(\n?)/, "\n")
+    end
+    private
+    # Convert an HTML node to plain text. This method is called recursively with the output and
+    # formatting options for special tags.
+    def convert_node_to_plain_text(parent, out = "", options = {})
+      if PARAGRAPH_TAGS.include?(parent.name)
+        append_paragraph_breaks(out)
+      elsif BLOCK_TAGS.include?(parent.name)
+        append_block_breaks(out)
+      end
+      format_list_item(out, options) if parent.name == LI
+      out << "| " if parent.name == TR
+      parent.children.each do |node|
+        if node.text? || node.cdata?
+          text = node.text
+          unless options[:pre]
+            text = node.text.gsub(/[\n\r]/, " ").squeeze(" ")
+            text.lstrip! if WHITESPACE.include?(out[-1, 1])
+          end
+          out << text
+        elsif node.name == PLAINTEXT
+          out << node.text
+        elsif node.element? && !IGNORE_TAGS.include?(node.name)
+          convert_node_to_plain_text(node, out, child_options(node, options))
+          if node.name == BR
+            out << "\n"
+          elsif node.name == HR
+            out << "\n" unless out.end_with?("\n")
+            out << "-------------------------------\n"
+          elsif node.name == TD || node.name == TH
+            out << " | "
+          elsif node.name == "a"
+            href = node["href"]
+            if href && href.match(ABSOLUTE_URL_PATTERN) && node.text.match(/\S/)
+              out << " (#{href}) "
+            end
+          elsif PARAGRAPH_TAGS.include?(node.name)
+            append_paragraph_breaks(out)
+          elsif BLOCK_TAGS.include?(node.name)
+            append_block_breaks(out)
+          end
+        end
+      end
+      out
+    end
+    # Set formatting options that will be passed to child elements for a tag.
+    def child_options(node, options)
+      if node.name == UL
+        level = options[:ul] || -1
+        level += 1
+        options.merge(:list => :ul, :ul => level)
+      elsif node.name == OL
+        level = options[:ol] || -1
+        level += 1
+        options.merge(:list => :ol, :ol => level, :number => NUMBERS[level % 2])
+      elsif node.name == PRE
+        options.merge(:pre => true)
+      else
+        options
+      end
+    end
+    # Add double line breaks between paragraph elements. If line breaks already exist,
+    # new ones will only be added to get to two.
+    def append_paragraph_breaks(out)
+      out.chomp!(" ")
+      if out.end_with?("\n")
+        out << "\n" unless out.end_with?("\n\n")
+      else
+        out << "\n\n"
+      end
+    end
+    # Add a single line break between block elements. If a line break already exists,
+    # none will be added.
+    def append_block_breaks(out)
+      out.chomp!(" ")
+      out << "\n" unless out.end_with?("\n")
+    end
+    # Add an appropriate bullet or number to a list element.
+    def format_list_item(out, options)
+      if options[:list] == :ul
+        out << "#{'*' * (options[:ul] + 1)} "
+      elsif options[:list] == :ol
+        number = options[:number]
+        options[:number] = number.next
+        out << "#{number}. "
+      end
+    end
+  end
+end

data/spec/html_to_plain_text_spec.rb ADDED Viewed

@@ -0,0 +1,93 @@
+require 'spec_helper'
+describe HtmlToPlainText do
+  it "should format paragraph tags" do
+    html = "<h1>Test</h1><h2>More Test</h2><p>This is a test</p>"
+    HtmlToPlainText.plain_text(html).should == "Test\n\nMore Test\n\nThis is a test"
+  end
+  it "should format block tags" do
+    html = "<div>Test</div><div>More Test<div>This is a test</div></div>"
+    HtmlToPlainText.plain_text(html).should == "Test\nMore Test\nThis is a test"
+  end
+  it "should format <br> tags" do
+    html = "<div>Test</div><br><div>More Test<br />This is a test"
+    HtmlToPlainText.plain_text(html).should == "Test\n\nMore Test\nThis is a test"
+  end
+  it "should format <hr> tags" do
+    html = "<div>Test</div><hr><div>More Test<hr />This is a test"
+    HtmlToPlainText.plain_text(html).should == "Test\n-------------------------------\nMore Test\n-------------------------------\nThis is a test"
+  end
+  it "should keep text formatting in <pre> tag blocks" do
+    html = "<div>This \n is a \ntest</div><pre>with\n  pre tags</pre>end"
+    HtmlToPlainText.plain_text(html).should == "This is a test\nwith\n  pre tags\nend"
+  end
+  it "should remove inline formatting tags" do
+    html = "This is <strong>so</strong> cool. I<em> mean <em>it."
+    HtmlToPlainText.plain_text(html).should == "This is so cool. I mean it."
+  end
+  it "should remove script, style, object, applet, and iframe tags" do
+    html = "script <script>do_something</script> style <style>css</style> object <object>config</object> applet <applet>config</applet> iframe <iframe>config</iframe>"
+    HtmlToPlainText.plain_text(html).should == "script style object applet iframe"
+  end
+  it "should handle plaintext tags" do
+    html = "<div>my\nhtml</div><plaintext>my\n text"
+    HtmlToPlainText.plain_text(html).should == "my html\nmy\n text"
+  end
+  it "should not add extraneous spaces or line breaks" do
+    html = "this<p><p>  is   \n    \n pretty bad lo<em>oking htm</em>l!"
+    HtmlToPlainText.plain_text(html).should == "this\n\nis pretty bad looking html!"
+  end
+  it "should format bullet lists" do
+    html = "List<ul><li>one</li><li>two<ul><li>a</li><li>b</li></ul></li><li>three</li></ul>"
+    HtmlToPlainText.plain_text(html).should == "List\n\n* one\n* two\n\n** a\n** b\n\n* three"
+  end
+  it "should format numbered lists" do
+    html = "List<ol><li>one</li><li>two<ol><li>a</li><li>b</li></ol></li><li>three</li></ol>"
+    HtmlToPlainText.plain_text(html).should == "List\n\n1. one\n2. two\n\na. a\nb. b\n\n3. three"
+  end
+  it "should format a table" do
+    html = "Table<table><tr><th>Col 1</th><th>Col 2</th></tr><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>"
+    HtmlToPlainText.plain_text(html).should == "Table\n\n| Col 1 | Col 2 |\n| 1 | 2 |\n| 3 | 4 |"
+  end
+  it "should ignore inline tags without bodies" do
+    html = "This is an <img src=\"/image\"> image"
+    HtmlToPlainText.plain_text(html).should == "This is an image"
+  end
+  it "should ignore comments" do
+    html = "This is <!-- html comment here --> html"
+    HtmlToPlainText.plain_text(html).should == "This is html"
+  end
+  it "should unencode entities" do
+    html = "High &amp; Low"
+    HtmlToPlainText.plain_text(html).should == "High & Low"
+  end
+  it "should normalize the line breaks" do
+    html = "<pre>These are\rreturn\r\nlines</pre>"
+    HtmlToPlainText.plain_text(html).should == "These are\nreturn\nlines"
+  end
+  it "should include absolute link URLs" do
+    html = "<a name='links'>Links</a> <a href='/test'>partial</a> <a href='http://example.com/test'>full</a> test<a href='http://example.com/test2'> <img src='test'> </a>"
+    HtmlToPlainText.plain_text(html).should == "Links partial full (http://example.com/test) test"
+  end
+  it "should unescape entities" do
+    html = "This &amp; th&#97;t"
+    HtmlToPlainText.plain_text(html).should == "This & that"
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1 @@
+require File.expand_path("../../lib/html_to_plain_text.rb", __FILE__)

metadata ADDED Viewed

@@ -0,0 +1,107 @@
+--- !ruby/object:Gem::Specification
+name: html_to_plain_text
+version: !ruby/object:Gem::Version
+  hash: 23
+  prerelease:
+  segments:
+  - 1
+  - 0
+  - 0
+  version: 1.0.0
+platform: ruby
+authors:
+- Brian Durand
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-06-08 00:00:00 -05:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 7
+        segments:
+        - 1
+        - 4
+        - 0
+        version: 1.4.0
+  type: :runtime
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: rspec
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">"
+      - !ruby/object:Gem::Version
+        hash: 15
+        segments:
+        - 2
+        - 0
+        - 0
+        version: 2.0.0
+  type: :development
+  version_requirements: *id002
+description: A simple library for converting HTML into an approximation in plain text.
+email:
+- bdurand@embellishedvisions.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- README.rdoc
+files:
+- README.rdoc
+- VERSION
+- Rakefile
+- MIT_LICENSE
+- lib/html_to_plain_text.rb
+- spec/html_to_plain_text_spec.rb
+- spec/spec_helper.rb
+has_rdoc: true
+homepage: http://github.com/bdurand/html_to_plain_text
+licenses: []
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+- --main
+- README.rdoc
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.5.2
+signing_key:
+specification_version: 3
+summary: A simple library for converting HTML into plain text.
+test_files: []