RubyGems - html_massage - Versions diffs - 0.0.2 → 0.2.0 - Mend

html_massage 0.0.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

data/.gitignore CHANGED Viewed

@@ -2,3 +2,4 @@
 .idea
 Gemfile.lock
 pkg/*
+README-backup*

data/Gemfile CHANGED Viewed

@@ -1,4 +1,6 @@
-source "http://rubygems.org"
+source 'https://rubygems.org'
+gem 'reverse_markdown',  :git => 'git://github.com/harlantwood/reverse_markdown.git'
 # Specify your gem's dependencies in html_massage.gemspec
 gemspec

data/License-MIT ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2012 Harlan T Wood
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md CHANGED Viewed

@@ -1,18 +1,80 @@
 # html_massage
 Give your HTML a massage, in just the ways it loves:
  * Remove headers and footers and navigation, and strip to only the "content" part of the HTML
  * Sanitize tags, removing javascript and styling
  * Convert your HTML to nicely-formatted plain text
-## Usage
+## Sample Usage
+### Full Massage
-    require 'rubygems'
     require 'html_massage'
-    html = "<html><body><div id='header'>My Site</div><div>This is some great content!</div></body></html>"
-    html_massage = HtmlMassage.new( html, :ignored_selectors => [ '#header' ] )
-     # => #<HtmlMassager::HtmlMassage ... >
-    html_massage.to_html
-     # => "<div>This is some great content!</div>"
-    html_massage.to_text
-     # => "This is some great content!\n"
+    html = %{
+      <html>
+        <head>
+          <script type="text/javascript">document.write('I am a bad script');</script>
+        </head>
+        <body>
+          <div id="header">My Site</div>
+          <div>This is some great content!</div>
+          <a href ="foo/bar.html">Click this link</a>
+        </body>
+      </html>
+    }
+    puts HtmlMassage.html( html )
+    # => "<div>This is some great content!</div>"
+    puts HtmlMassage.text( html )
+    # => "This is some great content!\n"
+### Content Only
+    html_massage = HtmlMassage.new( html )
+    # => #<HtmlMassager::HtmlMassage ... >
+    puts html_massage.exclude!( [ '#header' ] )
+    # <div>This is some great content!</div>
+    # <a href="foo/bar.html">Click this link</a>
+### Sanitize HTML
+    html_massage = HtmlMassage.new( html )
+    # => #<HtmlMassager::HtmlMassage ... >
+    puts html_massage.sanitize!
+    # <html>
+    #   <head>
+    #   </head>
+    #   <body>
+    #     <div id="header">My Site</div>
+    #     <div>This is some great content!</div>
+    #   </body>
+    # </html>
+### Make Links Absolute
+    html_massage = HtmlMassage.new( html )
+    puts html_massage.absolutify_links!( 'http://example.com/joe/page1.html' )
+    # <html>
+    #   <head>
+    #     <script type="text/javascript">document.write('I am a bad script');</script>
+    #   </head>
+    #   <body>
+    #     <div id="header">My Site</div>
+    #     <div>This is some great content!</div>
+    #     <a href ="http://example.com/joe/foo/bar.html">Click this link</a>
+    #   </body>
+    # </html>
+    puts html_massage.absolutify_images!( 'http://example.com/joe/page1.html' )
+    #

data/Rakefile CHANGED Viewed

	@@ -1 +1 @@
1	- require 'bundler/gem_tasks'
1	+ require "bundler/gem_tasks"

data/bin/html_massage ADDED Viewed

@@ -0,0 +1,4 @@
+#!/usr/bin/env ruby
+require 'html_massage/cli'
+HtmlMassager::CLI.start

data/generate_readme.rb ADDED Viewed

@@ -0,0 +1,59 @@
+#!/usr/bin/env ruby
+#
+# Rebuilds README.md from itself: every indented (markdown code) chunk is
+# evaluated, and the trailing "#" comment lines of that chunk are replaced
+# with the value the code produced.  A timestamped copy of README.md is made
+# before anything is rewritten.
+#
+# NOTE(review): the README samples are run through `eval` -- only run this
+# script against a README whose contents you trust.
+#
+# IO.write is part of core Ruby from 1.9.3 on; only backfill it when missing
+# so the built-in implementation is never clobbered.
+unless IO.respond_to?( :write )
+  class IO
+    def self.write( path, content )
+      # block form closes the file even if the write raises
+      File.open( path, "w" ) { |file| file.write( content ) }
+    end
+  end
+end
+CHUNK_SEP = "\n\n"
+# A chunk counts as code when it starts with a four-space indent.
+def is_code?( markdown )
+  markdown.start_with?( '    ' )
+end
+# Prints +text+ between two rows of asterisks, after +top_newlines+ newlines.
+def header( text, top_newlines )
+  puts "\n" * top_newlines
+  puts '*'*10
+  puts text
+  puts '*'*10
+end
+system( "cp README.md README-backup-#{Time.now.to_s.gsub(/\W/, '-')}.md" )
+readme = IO.read( 'README.md' )
+code = ''
+new_chunks = readme.split( CHUNK_SEP ).map do |chunk|
+  # Prose chunks are carried over untouched; only code chunks are processed.
+  next chunk unless is_code?( chunk )
+  # Code accumulates across chunks, so each eval re-runs every earlier sample
+  # (presumably so later samples can use variables set by earlier ones).
+  code << chunk << CHUNK_SEP
+  header( 'Code', 3 )
+  puts code
+  header( 'Result', 1 )
+  result = eval( code )
+  puts result
+  unless result.nil?
+    # Separate the code lines from the trailing "#" result lines.  The last
+    # line of a chunk has no newline (the split consumed it), so the line
+    # terminator is optional at the very end.
+    _, code_sans_results = chunk.match( /\A((?:    [^#].*\r?\n)+)(?:    #.*(?:\r?\n|\z))+\z/ ).to_a
+    if code_sans_results
+      commented = result.to_s.split( "\n" ).map{ |line| "    # #{line}" }.join( "\n" )
+      chunk = code_sans_results + commented
+    end
+  end
+  header( 'Output', 1 )
+  puts chunk
+  chunk
+end
+new_readme = new_chunks.join( CHUNK_SEP )
+new_readme << "\n" unless new_readme.end_with?( "\n" )
+IO.write( 'README.md', new_readme )

data/html_massage.gemspec CHANGED Viewed

@@ -1,23 +1,28 @@
 # -*- encoding: utf-8 -*-
-$:.push File.expand_path("../lib", __FILE__)
-require "html_massage/version"
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'html_massage/version'
-Gem::Specification.new do |s|
-  s.name        = "html_massage"
-  s.version     = HtmlMassager::VERSION
-  s.authors     = ["Harlan Knight Wood"]
-  s.email       = ["code@hkw7.org"]
-  s.homepage    = "https://github.com/onesunone/html_massage"
-  s.summary     = %{Massages HTML how you want to.}
-  s.description = %{Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text.}
+Gem::Specification.new do |gem|
+  gem.name          = "html_massage"
+  gem.version       = HtmlMassager::VERSION
+  gem.authors       = ["Harlan T Wood"]
+  gem.email         = ["code@harlantwood.net"]
+  gem.homepage      = "https://github.com/harlantwood/html_massage"
+  gem.summary       = %{Massages HTML how you want to.}
+  gem.description   = %{Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text.}
-  s.rubyforge_project = "html_massage"
+  gem.files         = `git ls-files`.split($/)
+  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.require_paths = ["lib"]
-  s.add_dependency('nokogiri', ">= 1.4.4")
-  s.add_dependency('sanitize', ">= 2.0.0")
+  gem.add_dependency "nokogiri", ">= 1.4"
+  gem.add_dependency "sanitize", ">= 2.0"
+  gem.add_dependency "thor"
+  gem.add_dependency "rest-client", ">= 1.6"
+  gem.add_development_dependency "rspec", ">= 2.5"
-  s.files         = `git ls-files`.split("\n")
-  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
-  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
-  s.require_paths = ["lib"]
 end

data/lib/html_massage/cli.rb ADDED Viewed

@@ -0,0 +1,35 @@
+require 'thor'
+require 'rest_client'
+require 'html_massage'
+module HtmlMassager
+  # Thor command-line front end.  Each task downloads the page at the given
+  # URL and prints the massaged result in the requested format.
+  class CLI < Thor
+    desc :html, 'Download HTML from given URL and massage into html'
+    def html(url)
+      massaged = massage_to(:html, url)
+      STDOUT.puts(massaged)
+    end
+    desc :text, 'Download HTML from given URL and massage into plain text'
+    def text(url)
+      massaged = massage_to(:text, url)
+      STDOUT.puts(massaged)
+    end
+    desc :markdown, 'Download HTML from given URL and massage into markdown'
+    def markdown(url)
+      massaged = massage_to(:markdown, url)
+      STDOUT.puts(massaged)
+    end
+    no_tasks do
+      # Fetches +url+ with RestClient and passes the response to the
+      # HtmlMassage class method named by +output_format+, requesting
+      # absolute links and images relative to +url+.
+      def massage_to(output_format, url)
+        page = RestClient.get(url)
+        massage_options = { :source_url => url, :links => :absolute, :images => :absolute }
+        HtmlMassage.send(output_format, page, massage_options)
+      end
+    end
+  end
+end

data/lib/html_massage/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module HtmlMassager
-  VERSION = "0.0.2"
+  VERSION = "0.2.0"
 end

data/lib/html_massage.rb CHANGED Viewed

@@ -1,129 +1,281 @@
 require "cgi"
 require "nokogiri"
 require "sanitize"
+require "reverse_markdown"
 require "html_massage/version"
 module HtmlMassager
   class HtmlMassage
-    def initialize( html, options )
-      @source_url        = options[ :source_url ]
-      @ignored_selectors = options[ :ignored_selectors ]
-      @clean_html = massage_html( html )
-    end
-    def massage_html( html )
-      html = content_only( html )
-      html = sanitize_html( html )
-      html = absolutify_links( html ) if @source_url
+    INCLUDE_CONTENT_ONLY = %w[
       html
-    end
+      body
+    ]
-    def content_only( content )
-      doc = Nokogiri::HTML( content )
-      body = doc / 'html' / 'body'
+    DEFAULT_EXCLUDE_OPTIONS = [
+      # general:
+      'head',
+      'title',
+      'meta',
-      @ignored_selectors.to_a.each do |ignored_selector|
-        ( body / ignored_selector ).remove
-      end
+      'div#header',
+      'div.header',
+      'div#banner',
+      'div.banner',
+      '.footer',
+      '#footer',
+      'div#navigation',
+      'div.navigation',
+      'div#nav',
+      'div.nav',
+      'div#sidebar',
+      'div.sidebar',
+      '#breadcrumbs',
+      '.breadcrumbs',
+      '#backfornav',
+      '.backfornav',
+      'div.post-footer',
+      'div.navigation',
-      content = body / '#content'
-      content = body if content.empty?
-      content = content.inner_html
-      content
-    end
+      # wordpress:
+      'a#left_arrow',
+      'a#right_arrow',
+      'div#comments',
+      'div#comment-section',
+      'div#respond',
-    def sanitize_html(html)
-      html = html.dup
+      # typepad
+      '#pagebody > #pagebody-inner > #alpha',
+      'p.content-nav',
+      # blog widgets
+      '.widget_blog_subscription',
+      '.loggedout-follow-normal',
-      %w[ script noscript style ].each do |tag|
-        html.gsub!( %r{<#{tag}[^>]*>.*?</#{tag}>}mi, '' )
-      end
-      Sanitize.clean(
-          html,
-          {
-              :elements => [
-                  'a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
-                  'blockquote', 'br', 'button', 'caption', 'center', 'cite',
-                  'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir',
-                  'div', 'dl', 'dt', 'em', 'fieldset', 'form', 'h1',
-                  'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
-                  'img',
-                  'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu',
-                  'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
-                  'select', 'small', 'span', 'strike', 'strong', 'sub',
-                  'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
-                  'thead', 'tr', 'tt', 'u', 'ul', 'var',
+      # wikipedia
+      '#bodyContent > #siteSub',
+      '#bodyContent > #contentSub',
+      '#bodyContent > #jump-to-nav',
+      'table.metadata',
+      'table.navbox',
+      'table.toc',
+      'div#catlinks',
+      'div.printfooter',
+      'h1 > span.editsection',
+      'h2 > span.editsection',
+      'h3 > span.editsection',
+      'h4 > span.editsection',
+      'h5 > span.editsection',
+      'h6 > span.editsection',
+      # wikipedia "message boxes" -- metadata such as "requires cleanup":
+      # see http://en.wikipedia.org/wiki/Template:Ambox
+      'table.ambox',
+      'table.tmbox',
+      'table.imbox',
+      'table.cmbox',
+      'table.ombox',
+      'table.fmbox',
+      'table.dmbox',
+      # mediawiki
+      '#mw-subcategories',
+      '#mw-pages',
+      '#mw-head',
+      '#mw-panel',
+      # social media sharing:
+      'ul#sharebar',
+      'ul#sharebarx',
+      '.sharedaddy',
+      '#sharing_email',
+      # signup:
+      '#mailchimp_signup_bottom',
+    ]
+    DEFAULT_SANITIZE_OPTIONS = {
+              :elements => %w[
+                  a abbr acronym address area b big
+                  blockquote br button caption center cite
+                  code col colgroup dd del dfn dir
+                  div dl dt em fieldset form h1
+                  h2 h3 h4 h5 h6 hr i
+                  img
+                  input ins kbd label legend li map menu
+                  ol optgroup option p pre q s samp
+                  select small span strike strong sub
+                  sup table tbody td textarea tfoot th
+                  thead tr tt u ul var
               ],
               :attributes => {
-                  'a' => ['href'],
-                  'img' => ['src'],
-                  :all => ['abbr', 'accept', 'accept-charset',
-                           'accesskey', 'action', 'align', 'alt', 'axis',
-                           'border', 'cellpadding', 'cellspacing', 'char',
-                           'charoff', 'class', 'charset', 'checked', 'cite',
-                           'clear', 'cols', 'colspan', 'color',
-                           'compact', 'coords', 'datetime', 'dir',
-                           'disabled', 'enctype', 'for', 'frame',
-                           'headers', 'height', 'hreflang',
-                           'hspace', 'id', 'ismap', 'label', 'lang',
-                           'longdesc', 'maxlength', 'media', 'method',
-                           'multiple', 'name', 'nohref', 'noshade',
-                           'nowrap', 'prompt', 'readonly', 'rel', 'rev',
-                           'rows', 'rowspan', 'rules', 'scope',
-                           'selected', 'shape', 'size', 'span',
-                           'start', 'summary', 'tabindex', 'target',
-                           'title', 'type', 'usemap', 'valign', 'value',
-                           'vspace', 'width']
+                  'a' => %w[ href ],
+                  'img' => %w[ src ],
+                  :all => %w[
+                    abbr accept accept-charset
+                    accesskey action align alt axis
+                    border cellpadding cellspacing char
+                    charoff class charset checked cite
+                    clear cols colspan color
+                    compact coords datetime dir
+                    disabled enctype for frame
+                    headers height hreflang
+                    hspace id ismap label lang
+                    longdesc maxlength media method
+                    multiple name nohref noshade
+                    nowrap prompt readonly rel rev
+                    rows rowspan rules scope
+                    selected shape size span
+                    start summary tabindex target
+                    title type usemap valign value
+                    vspace width
+                  ]
               },
+              # medium permissive list:
+              #:elements => [
+              #    'a', 'b', 'blockquote', 'br', 'code', 'dd', 'del', 'dl', 'dt',
+              #    'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
+              #    'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike', 'strong', 'sub',
+              #    'sup', 'table', 'tbody', 'td', 'th',
+              #    'thead', 'tr', 'u', 'ul',
+              #],
               :protocols => {
                   'a' => {'href' => ['http', 'https', 'mailto', :relative]},
                   'img' => {'src' => ['http', 'https', :relative]}
               },
-              # consider including for deprecated/historical/or spam-suspect pages:
-              # Gollum has a nice way to add this to your config optionally, see:
-              # https://github.com/github/gollum/blob/master/lib/gollum/sanitization.rb
+              # Consider including for deprecated/historical or spam-suspect pages:
               #
               #        :add_attributes => {
               #            'a' => {'rel' => 'nofollow'}
               #        }
+              #
+              # Gollum has a nice way to add this to your config optionally, see:
+              # https://github.com/github/gollum/blob/master/lib/gollum/sanitization.rb
           }
-      )
+    DEFAULTS = {
+            :include => INCLUDE_CONTENT_ONLY,
+            :exclude => DEFAULT_EXCLUDE_OPTIONS,
+            :sanitize => DEFAULT_SANITIZE_OPTIONS,
+            :links => :unchanged,
+    }
+    def self.html( html, options={} )
+      new( html ).massage!( options ).to_html
     end
-    def absolutify_links( html )
-      match = @source_url.match( %r{(^[a-z]+://[^/]+)(/.+/)}i )
-      return html unless match
+    def self.text( html, options={} )
+      new( html ).massage!( options ).to_text
+    end
+    def self.markdown( html, options={} )
+      ReverseMarkdown.parse( self.html( html, options ) )
+    end
+    def initialize( html )
+      @html = html.dup
+    end
+    # Runs the whole clean-up pipeline on the wrapped HTML, in place.
+    #
+    # Recognised options (any other key raises a RuntimeError):
+    #   :source_url - URL the HTML came from; required when :links or
+    #                 :images is :absolute
+    #   :links      - :absolute rewrites <a href> values against :source_url
+    #   :images     - :absolute rewrites <img src> values against :source_url
+    #   :include    - selectors handed to include!
+    #   :exclude    - selectors handed to exclude! (legacy name: :ignored_selectors)
+    #   :sanitize   - options handed to sanitize!
+    # Keys that are not given fall back to DEFAULTS.
+    #
+    # Returns self, so to_html / to_text can be chained.
+    def massage!( options={} )
+      # Merge into a fresh hash first so the caller's hash is never mutated.
+      options = DEFAULTS.merge( options )
+      self.class.translate_old_options( options )
+      # :source_url is a legitimate option: consume it here, otherwise the
+      # "Unexpected options" check below rejects every call that passes it.
+      source_url = options.delete( :source_url )
+      absolutify_links!( source_url )  if options.delete( :links  ) == :absolute
+      absolutify_images!( source_url ) if options.delete( :images ) == :absolute
+      include!( options.delete( :include ) )
+      exclude!( options.delete( :exclude ) )
+      sanitize!( options.delete( :sanitize ) )
+      tidy_whitespace!
+      raise "Unexpected options #{options.inspect}" unless options.empty?
+      self
+    end
+    # Renames the legacy :ignored_selectors key to :exclude, editing
+    # +options+ in place.  Does nothing when the legacy key is absent or falsy.
+    def self.translate_old_options( options )
+      return unless options[ :ignored_selectors ]
+      options[ :exclude ] = options.delete( :ignored_selectors )
+    end
+    # Parses the current HTML, removes every node matched by any selector in
+    # +selectors_to_exclude+, then stores and returns the re-serialised document.
+    def exclude!( selectors_to_exclude )
+      parsed = Nokogiri::HTML( @html )
+      selectors_to_exclude.to_a.each { |selector| parsed.search( selector ).remove }
+      @html = parsed.to_s
+    end
+    # Narrows the HTML step by step: each selector in +selectors_to_include+
+    # is searched within the current scope, and the scope shrinks to the
+    # matches whenever there are any.  Stores and returns the inner HTML of
+    # the final scope.
+    def include!( selectors_to_include )
+      narrowed = selectors_to_include.to_a.inject( Nokogiri::HTML( @html ) ) do |scope, selector|
+        matches = scope.search( selector )
+        matches.empty? ? scope : matches
+      end
+      @html = narrowed.inner_html
+    end
+    # Cleans the HTML with Sanitize using +sanitize_options+, then stores and
+    # returns the result.  script, noscript and style elements are first cut
+    # out by regexp, content included, unless the options explicitly list
+    # them under :elements.
+    def sanitize!( sanitize_options={} )
+      allowed_elements = sanitize_options[ :elements ]
+      # Sanitize does not thoroughly remove these tags -- so we do a manual pass:
+      %w[ script noscript style ].each do |tag|
+        next if allowed_elements && allowed_elements.include?( tag )
+        @html.gsub!( %r{<#{tag}[^>]*>.*?</#{tag}>}mi, '' )
+      end
+      @html = Sanitize.clean( @html, sanitize_options )
+    end
+    def absolutify_links!(source_url)
+      absolutify_paths!('a', 'href', source_url)
+    end
+    def absolutify_images!(source_url)
+      absolutify_paths!('img', 'src', source_url)
+    end
+    def absolutify_paths!(tag_name, attr, source_url)
+      raise "When asking for absolute images or paths, please pass in source_url" unless source_url
+      match = source_url.match( %r{(^[a-z]+?://[^/]+)(/.+/)?}i )
+      return @html unless match
       base_url = match[ 1 ]
       resource_dir_url = match[ 0 ]   # whole regexp match
+      dom = Nokogiri::HTML.fragment( @html )
-      dom = Nokogiri::HTML.fragment( html )
-      links = dom / 'a'
-      links.each do |link|
-        href = link[ 'href' ]
-        if href
-          link[ 'href' ] =
-            case href
+      tags = dom / tag_name
+      tags.each do |tag|
+        value = tag[ attr ]
+        if value
+          tag[ attr ] =
+            case value
+              when %r{^//}  # eg src="//upload.wikimedia.org/wikipedia/Map.png"
+                value
               when %r{^/}
-                File.join( base_url, href )
+                File.join( base_url, value )
               when %r{^\.\.}
-                File.join( resource_dir_url, href )
+                File.join( resource_dir_url, value )
               else
-                href
+                value
             end
         end
       end
-      html = dom.to_s
-      html
+      @html = dom.to_s.strip
     end
-    def to_html
-      @clean_html
+    def tidy_whitespace!
+      @html = strip_lines(@html)
+      tidy_tables!
+    end
+    # Collapses each run of two or more newlines inside a <table>...</table>
+    # into a single newline.  Always returns the (possibly unchanged) HTML;
+    # a bare gsub! would return nil when the document contains no table.
+    def tidy_tables!
+      @html.gsub!(%r{(<table\b)(.+?)(</table>)}m) do
+        # capture the groups before the inner gsub overwrites the match data
+        open, body, close = $1, $2, $3
+        open + body.gsub(/\n{2,}/, "\n") + close
+      end
+      @html
     end
     def to_text
-      text = CGI.unescapeHTML( @clean_html )
+      text = CGI.unescapeHTML( @html )
       # normalize newlines
       text.gsub!(/\r\n/, "\n")
@@ -132,7 +284,7 @@ module HtmlMassager
       # nbsp => ' '
       text.gsub!(/&nbsp;/, ' ')
-      # TODO: figure out how to do these in ruby 1.9.2:
+      # TODO: figure out how to do these in ruby 1.9:
       # They now throw 'incompatible encoding -- ascii regexp for utf8 string'
       #    text.gsub!( /\302\240/, ' ' )  # UTF8 for nbsp
       #    text.gsub!( /\240/, ' ' )      # ascii for nbsp
@@ -163,14 +315,21 @@ module HtmlMassager
       "#{text}\n"
     end
-    def strip_lines( text )
-      lines = text.split( "\n" )
+    def strip_lines(content)
+      lines = content.split( $/ )    # $/ is the current ruby line ending, \n by default
       lines.map!{ |line| line.strip }
-      text = lines.join( "\n" )
-      text.strip
+      processed = lines.join( $/ )
+      processed.strip
+    end
+    def to_html
+      @html.strip!
+      @html
     end
   end
 end
-include HtmlMassager
+include HtmlMassager

data/spec/html_massage_spec.rb ADDED Viewed

@@ -0,0 +1,210 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'html_massage'))
+describe HtmlMassager::HtmlMassage do
+  include HtmlMassager
+  describe ".html" do
+    it 'Should massage and output HTML' do
+      html = "<html><body><div>This is some great content!</div></body></html>"
+      HtmlMassage.html(html).should == "<div>This is some great content!</div>"
+    end
+    it 'should remove HTML "doctype"' do
+      html = '
+        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+        <html xmlns="http://www.w3.org/1999/xhtml">
+        <body>
+          <p>foobar</p>
+        </body>
+        </html>
+        '
+      HtmlMassage.html(html).strip.should == "<p>foobar</p>"
+    end
+  end
+  describe ".text" do
+    it 'Should massage and output text' do
+      html = "<html><body><div>This is some great content!</div></body></html>"
+      HtmlMassage.text(html).strip.should == "This is some great content!"
+    end
+    it 'should convert an HTML sample as expected' do
+      html = "
+        <html><body>
+        <h1>Title</h1>
+        This is the body.
+        Testing <a href='http://www.google.com/'>link to Google</a>.
+        <p />
+        Testing image <img src='/noimage.png'>.
+        <br />
+        The End.
+        </body></html>
+        "
+      HtmlMassage.text(html).strip.should == "Title
+        This is the body. Testing link to Google.
+        Testing image .
+        The End.
+        ".strip.gsub(/^ +/, '')
+    end
+    it 'should play nice with UTF8 HTML source' do
+      html = '
+        <html>
+        <head>
+          <meta content="text/html; charset=utf-8" http-equiv="content-type" />
+        </head>
+        <body>
+          Niq is a performer → Angry, arrogant, &amp; so admired.
+        </body>
+        </html>
+        '
+      HtmlMassage.text(html).strip.should == "Niq is a performer → Angry, arrogant, & so admired."
+    end
+    it 'should play nice with &nbsp;' do
+      pending
+      html = '&nbsp;&nbsp;&nbsp;'
+      HtmlMassage.text(html).strip.should == "   "
+    end
+  end
+  describe ".markdown" do
+    it 'Should massage and output markdown' do
+      html = "<html><body><div>This is some <i>great</i> content!</div></body></html>"
+      massaged = HtmlMassage.markdown html
+      massaged.strip.should == "This is some _great_ content!"
+    end
+  end
+  describe "#massage!" do
+    context 'invalid html' do
+      [
+        "<html><body>foobar</body>",
+        "<html><body>foobar</html>",
+        "<body>foobar</body></html>",
+        "<html>foobar</body></html>",
+      ].each do |broken_html|
+        it "should return 'foobar' when given #{broken_html.inspect}" do
+          HtmlMassage.new(broken_html).massage!.to_text.strip.should == "foobar"
+        end
+      end
+    end
+    pending 'should convert an HTML sample as expected'
+    it 'should leave HTML entities intact' do
+      pending 'improve ::Node.massage_html -- handling of html entities, utf8 chars'
+      original = "This &ldquo;branching&rdquo; of creative works"
+      massage = HtmlMassager::HtmlMassage.new( original )
+      massage.massage!.should == original
+    end
+  end
+  describe ".sanitize_html" do
+    it 'should remove <style> tags and their contents' do
+      html = %~<!-- Remix button --><br />
+        <style type='text/css'>
+            a.remix_on_wikinodes_tab {
+            top: 25%; left: 0; width: 42px; height: 100px; color: #FFF; cursor:pointer; text-indent:-99999px; overflow:hidden; position: fixed; z-index: 99999; margin-left: -7px; background-image: url(http://www.openyourproject.org/images/remix_tab.png); _position: absolute; right: 0 !important; left: auto !important; margin-right: -7px !important; margin-left: auto !important; } a.remix_on_wikinodes_tab:hover { margin-left: -4px; margin-right: -4px !important; margin-left: auto !important;
+          }
+        </style>
+        <p> <script type="text/javascript" language="javascript"> document.write( '<a style="background-color: #2a2a2a;" class="remix_on_wikinodes_tab" href="http://www.openyourproject.org/nodes/new?parent=' + window.location + '" title="Remix this content on WikiNodes -- creative collaboration designed to set you free" >Remix This</a>' ); </script> <noscript>Note: you can turn on Javascript to see the &#8216;Remix This&#8217; link.</noscript></p>
+      ~
+      html_massager = HtmlMassage.new( html )
+      html_massager.sanitize!.should_not =~ /remix_on_wikinodes_tab/
+    end
+    it 'should remove <noscript> tags and their contents' do
+      html = %{ <noscript>Note: you can turn on Javascript to see the 'Remix This' link. </noscript> }
+      html_massager = HtmlMassage.new( html )
+      html_massager.sanitize!.strip.should == ''
+    end
+  end
+  describe '#absolutify_links' do
+    it 'should work for absolute path links' do
+      source_url = 'http://en.wikipedia.org/wiki/Singularity'
+      original_html = '<a href="/wiki/Ray_Kurzweil">Ray</a>'
+      html_massager = HtmlMassage.new( original_html )
+      html_massager.absolutify_links!(source_url).should ==
+          '<a href="http://en.wikipedia.org/wiki/Ray_Kurzweil">Ray</a>'
+    end
+    it 'should work for absolute path links (bugfix)' do
+      source_url = 'http://p2pfoundation.net/NextNet'
+      original_html = '<a href="/Ten_Principles_for_an_Autonomous_Internet" title="Ten Principles for an Autonomous Internet">Ten Principles for an Autonomous Internet</a>'
+      html_massager = HtmlMassage.new( original_html )
+      html_massager.absolutify_links!(source_url).should ==
+          '<a href="http://p2pfoundation.net/Ten_Principles_for_an_Autonomous_Internet" title="Ten Principles for an Autonomous Internet">Ten Principles for an Autonomous Internet</a>'
+    end
+    it 'should work for relative links' do
+      source_url = 'http://en.wikipedia.org/wiki/Singularity'
+      original_html = '<a href="../wiki/Ray_Kurzweil">Ray</a>'
+      html_massager = HtmlMassage.new( original_html )
+      html_massager.absolutify_links!(source_url).should ==
+          '<a href="http://en.wikipedia.org/wiki/../wiki/Ray_Kurzweil">Ray</a>'
+    end
+    it 'should leave full URLs alone' do
+      source_url = 'http://en.wikipedia.org/wiki/Singularity'
+      original_html = '<a href="http://www.wired.com/wiredscience">wired science</a>'
+      html_massager = HtmlMassage.new( original_html )
+      html_massager.absolutify_links!(source_url).should == original_html
+    end
+    it 'should leave // style URLs alone' do
+      source_url = 'http://en.wikipedia.org/wiki/Singularity'
+      original_html = '<a href="//wired.com/wiredscience">wired science</a>'
+      html_massager = HtmlMassage.new( original_html )
+      html_massager.absolutify_links!(source_url).should == original_html
+    end
+    it 'should leave "jump links" alone' do
+      source_url = 'http://en.wikipedia.org/wiki/Singularity'
+      original_html = '<a href="#cite_1">1</a>'
+      html_massager = HtmlMassage.new( original_html )
+      html_massager.absolutify_links!(source_url).should == original_html
+    end
+  end
+  describe '#absolutify_images!' do
+    it 'should work for absolute path links' do
+      source_url = 'http://enlightenedstructure.org/Home/'
+      original_html = '<img src="/IMG/we-are.png" alt="" class="icon">'
+      html_massager = HtmlMassage.new( original_html )
+      html_massager.absolutify_images!(source_url).should ==
+          '<img src="http://enlightenedstructure.org/IMG/we-are.png" alt="" class="icon">'
+    end
+    it 'should work for absolute path links (bugfix)' do
+      source_url = 'http://www.realitysandwich.com/blog/daniel_pinchbeck'
+      original_html = '<img src="/sites/realitysandwich.com/themes/zen/pinkreality/images/creative-commons-license.png" alt="Attribution-Noncommercial-Share Alike 3.0 Unported" title="" width="88" height="31">'
+      html_massager = HtmlMassage.new( original_html )
+      html_massager.absolutify_images!(source_url).should ==
+          '<img src="http://www.realitysandwich.com/sites/realitysandwich.com/themes/zen/pinkreality/images/creative-commons-license.png" alt="Attribution-Noncommercial-Share Alike 3.0 Unported" title="" width="88" height="31">'
+    end
+    it 'should leave // style URLs alone' do
+      source_url = 'http://en.wikipedia.org/wiki/List_of_communes_in_France_with_over_20,000_inhabitants_(2006_census)'
+      original_html = '<img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/f9/France-CIA_WFB_Map.png/220px-France-CIA_WFB_Map.png" width="220" height="235" class="thumbimage">'
+      html_massager = HtmlMassage.new( original_html )
+      html_massager.absolutify_images!(source_url).should == original_html
+    end
+  end
+  describe '#tidy_tables!' do
+    it 'should remove multiple newlines from tables' do
+      HtmlMassage.new("<table><tr>\n<th>Chư\n\n\nYang Sin National Park</th>\n\n\n</tr></table>").tidy_tables!.should ==
+        "<table><tr>\n<th>Chư\nYang Sin National Park</th>\n</tr></table>"
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,82 +1,140 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: html_massage
-version: !ruby/object:Gem::Version
+version: !ruby/object:Gem::Version
+  version: 0.2.0
   prerelease:
-  version: 0.0.2
 platform: ruby
-authors:
-- Harlan Knight Wood
+authors:
+- Harlan T Wood
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-06-18 00:00:00 Z
-dependencies:
-- !ruby/object:Gem::Dependency
+date: 2012-11-25 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
   name: nokogiri
-  prerelease: false
-  requirement: &id001 !ruby/object:Gem::Requirement
+  requirement: !ruby/object:Gem::Requirement
     none: false
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 1.4.4
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '1.4'
   type: :runtime
-  version_requirements: *id001
-- !ruby/object:Gem::Dependency
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '1.4'
+- !ruby/object:Gem::Dependency
   name: sanitize
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '2.0'
+  type: :runtime
   prerelease: false
-  requirement: &id002 !ruby/object:Gem::Requirement
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '2.0'
+- !ruby/object:Gem::Dependency
+  name: thor
+  requirement: !ruby/object:Gem::Requirement
     none: false
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 2.0.0
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
   type: :runtime
-  version_requirements: *id002
-description: "Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text."
-email:
-- code@hkw7.org
-executables: []
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rest-client
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '1.6'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '2.5'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '2.5'
+description: ! 'Massages HTML how you want to: sanitize tags, remove headers and footers,
+  convert to plain text.'
+email:
+- code@harlantwood.net
+executables:
+- html_massage
 extensions: []
 extra_rdoc_files: []
-files:
+files:
 - .gitignore
 - Gemfile
+- License-MIT
 - README.md
 - Rakefile
+- bin/html_massage
+- generate_readme.rb
 - html_massage.gemspec
 - lib/html_massage.rb
+- lib/html_massage/cli.rb
 - lib/html_massage/version.rb
-homepage: https://github.com/onesunone/html_massage
+- spec/html_massage_spec.rb
+homepage: https://github.com/harlantwood/html_massage
 licenses: []
 post_install_message:
 rdoc_options: []
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
+required_ruby_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      version: "0"
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
-rubyforge_project: html_massage
-rubygems_version: 1.8.5
+rubyforge_project:
+rubygems_version: 1.8.24
 signing_key:
 specification_version: 3
 summary: Massages HTML how you want to.
-test_files: []
+test_files:
+- spec/html_massage_spec.rb