RubyGems - html2md - Versions diffs - 0.1.2 → 0.1.3 - Mend

html2md 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

data/features/assets/test.html +0 -1
data/features/assets/test.md +1 -0
data/features/markdown.feature +31 -11
data/features/step_definitions/markdown_steps.rb +2 -1
data/lib/html2md.rb +2 -1
data/lib/html2md/document.rb +170 -26
metadata +2 -2

data/features/assets/test.html CHANGED

@@ -31,7 +31,6 @@
 <ol>
 <li>Ordered Item 1</li>
 <li>Ordered Item 2
 <ul>
 <li>Un-Ordered Item 1</li>
 </ul>

data/features/assets/test.md CHANGED

@@ -1,3 +1,4 @@
 ********
 Header 1

data/features/markdown.feature CHANGED

@@ -4,12 +4,12 @@ Feature: Markdown
   Scenario: Create a H Rule (HR) element
     * HTML <hr/>
     * I say parse
-    * The markdown should be (********\n)
+    * The markdown should be (\n********\n\n)
   Scenario: Create a hard break (BR) element
     * HTML <br/>
     * I say parse
-    * The markdown should be (  \n)
+    * The markdown should be (\n   \n)
   Scenario: Paragraph (P) elements should be a single hard return
     * HTML <p>
@@ -24,17 +24,17 @@ Feature: Markdown
   Scenario: Other ancors should be ignored
     * HTML <a name="link"> Link </a>
     * I say parse
-    * The markdown should be ( Link )
+    * The markdown should be ( Link)
   Scenario: Ancors should reset after being used once
     * HTML <a href="/some/link.html"> Link </a> <a name="link"> Link </a>
     * I say parse
-    * The markdown should be ([ Link ](/some/link.html)  Link )
+    * The markdown should be ([ Link ](/some/link.html) Link)
   Scenario: Other (a) elements should be ignored
-    * HTML <a> Text </a>
+    * HTML <a> Text Text </a>
     * I say parse
-    * The markdown should be ( Text )
+    * The markdown should be ( Text Text)
   Scenario: An order list
     * HTML <ol><li>First</li><li>Second</li><ol>
@@ -47,9 +47,9 @@ Feature: Markdown
     * The markdown should be (\n  - First\n  - Second\n\n)
   Scenario: Complex List
-    * HTML <ul><li>First</li><li> <ol><li>First<ul><li>First</li><li>Second</li></ul></li><li>Second</li> </ol>Second</li><ul>
+    * HTML <ul> <li>First</li> <li> <ol> <li> Some Text <ul> <li>First</li> <li>Second</li> </ul> </li> <li>Second</li> </ol> </li> <li>Second</li> <ul>
     * I say parse
-    * The markdown should be (\n  - First\n  - \n    1. First\n      - First\n      - Second\n    2. Second\nSecond\n\n)
+    * The markdown should be (\n  - First\n    1. Some Text\n      - First\n      - Second\n    2. Second\n  - Second\n\n)
   Scenario: Emphasis (em) element
     * HTML <em>Emphasis</em>
@@ -77,9 +77,19 @@ Feature: Markdown
     * The markdown should be (This is in a span)
   Scenario: Character data should not have new lines
-    * HTML <p>This is character data \n\n\n\n</p>
+    * HTML <p>This is character data    \n\n\n\n</p>
     * I say parse
-    * The markdown should be (This is character data \n\n)
+    * The markdown should be (This is character data\n\n)
+  Scenario: Character data should not have new lines
+    * HTML <em><p> This is emphasized </p><br/></em>
+    * I say parse
+    * The markdown should be (_This is emphasized_)
+  Scenario: HR Followed by em should not fold
+    * HTML <em><p> This is emphasized </p><br/></em><hr/>
+    * I say parse
+    * The markdown should be (_This is emphasized_\n********\n\n)
   Scenario: First level headers
     * HTML <h1>This is a H1 Element</h1>
@@ -91,6 +101,11 @@ Feature: Markdown
     * I say parse
     * The markdown should be (\nThis is a H2 Element\n--------------------\n\n)
+  Scenario: New lines should be treated as space
+    * HTML <body>Word 1\nWord 2</body>
+    * I say parse
+    * The markdown should be (Word 1 Word 2)
   Scenario: Third level headers
     * HTML <h3>This is a H3 Element</h3>
     * I say parse
@@ -99,4 +114,9 @@ Feature: Markdown
   Scenario: Full File Conversion
     * File (./features/assets/test.html)
     * I say parse
-    * The mardown should be equal to (./features/assets/test.md)
+    * The mardown should be equal to (./features/assets/test.md)
+  Scenario: Strike Through
+    * HTML <strike><p> This is striken </p><br/></strike>
+    * I say parse
+    * The markdown should be (~~This is striken~~)

data/features/step_definitions/markdown_steps.rb CHANGED

@@ -12,7 +12,8 @@ After do
 end
 Given /HTML (.*)/ do |n|
-  @html2md.source = n.gsub("\\n", "\n")
+  @html2md.source = n.gsub('\n', "\n")
+  puts n.gsub '\n',"\n"
 end
 Given /File \((.*)\)/ do |n|

data/lib/html2md.rb CHANGED

@@ -1,5 +1,6 @@
 require 'nokogiri'
 require 'html2md/document'
+require 'cgi'
 class Html2Md
   attr_accessor :options, :source
@@ -13,7 +14,7 @@ class Html2Md
     doc = Html2Md::Document.new()
     doc.relative_url = options[:relative_url]
     parser = Nokogiri::HTML::SAX::Parser.new(doc)
-    parser.parse(source)
+    parser.parse( CGI.unescapeHTML(source).gsub(/\r/," ") )
     parser.document.markdown
   end
 end

data/lib/html2md/document.rb CHANGED

@@ -7,12 +7,33 @@ class Html2Md
     attr_reader :markdown
     attr_accessor :relative_url
+    def is_newline?(line)
+      if line.is_a? String
+        if /^\s+$/ =~ line
+          true
+        elsif  /^\[\[::HARD_BREAK::\]\]$/ =~ line
+          true
+        #elsif line.empty?
+        #  true
+        else
+          false
+        end
+      else
+        false
+      end
+    end
+    def new_line
+      @markdown << "\n" unless is_newline?( @markdown[-1] ) and is_newline?( @markdown[-2] )
+    end
     def start_document
-      @markdown = ''
+      @markdown = []
       @last_href = nil
       @allowed_tags = ['tr','td','th','table']
       @list_tree = []
       @last_cdata_length = 0
+      @pre_block = false
     end
@@ -47,7 +68,6 @@ class Html2Md
     end
     def end_element name, attributes = []
-      #@markdown << name
       end_name = "end_#{name}".to_sym
       both_name = "start_and_end_#{name}".to_sym
       if self.respond_to?(both_name)
@@ -59,16 +79,83 @@ class Html2Md
       end
     end
+    def start_strike(attributes)
+      @markdown << "~~"
+    end
+    def end_strike(attributes)
+      #Collapse Breaks
+      while is_newline?( @markdown[-1] )
+        @markdown.delete_at(-1)
+      end
+      #Collapse Space Before the emphasis
+      @markdown.reverse!
+      @markdown.each_index do |index|
+        if @markdown[index].eql? '~~'
+          count = 1
+          while is_newline?(@markdown[index-count])
+            @markdown.delete_at(index-count)
+          end
+          @markdown[index-1].gsub!(/^\s+/,'')
+        end
+      end
+      @markdown.reverse!
+      @markdown[-1].gsub!(/\s+$/,'')
+      @markdown << '~~'
+    end
     def start_hr(attributes)
-      @markdown << "********\n"
+      new_line
+      @markdown << "********"
+      new_line
+      new_line
     end
     def end_hr(attributes)
     end
-    def start_and_end_em(attributes)
+    def start_em(attributes)
+      @markdown << "_"
+    end
+    def end_em(attributes)
+      #Collapse Breaks
+      while is_newline?( @markdown[-1] )
+        @markdown.delete_at(-1)
+      end
+      #Collapse Space Before the emphasis
+      @markdown.reverse!
+      @markdown.each_index do |index|
+        if @markdown[index].eql? '_' and not @markdown[index+1] =~ /\\$/
+          count = 1
+          while is_newline?(@markdown[index-count])
+            @markdown.delete_at(index-count)
+          end
+          @markdown[index-1].gsub!(/^\s+/,'')
+        end
+      end
+      @markdown.reverse!
+      @markdown[-1].gsub!(/\s+$/,'')
       @markdown << '_'
+      ###@markdown.gsub!(/((\[\[::HARD_BREAK::\]\])?(\s+)?)*_$/,'_')
     end
     def start_and_end_strong(attributes)
@@ -76,7 +163,8 @@ class Html2Md
     end
     def start_br(attributes)
-      @markdown << "  \n"
+      new_line
+      @markdown << "[[::HARD_BREAK::]]"
     end
     def end_br(attributes)
@@ -88,39 +176,44 @@ class Html2Md
     end
     def end_p(attributes)
-      @markdown << "\n\n" unless @list_tree[-1]
+      new_line unless @list_tree[-1]
+      new_line unless @list_tree[-1]
     end
     def start_h1(attributes)
-      @markdown << "\n"
+      new_line
     end
     def end_h1(attributes)
-      @markdown << "\n"
+      new_line
       @last_cdata_length.times do
         @markdown << "="
       end
-      @markdown << "\n\n"
+      new_line
+      new_line
     end
     def start_h2(attributes)
-      @markdown << "\n"
+      new_line
     end
     def end_h2(attributes)
-      @markdown << "\n"
+      new_line
       @last_cdata_length.times do
         @markdown << "-"
       end
-      @markdown << "\n\n"
+      new_line
+      new_line
     end
     def start_h3(attributes)
-      @markdown << "\n### "
+      new_line
+      @markdown << "### "
     end
     def end_h3(attributes)
-      @markdown << "\n\n"
+      new_line
+      new_line
     end
     def start_a(attributes)
@@ -133,15 +226,22 @@ class Html2Md
     end
     def start_pre(attributes)
-      @markdown << "\n```\n"
+      @pre_block = true;
+      new_line
+      @markdown << "```"
+      new_line
     end
     def end_pre(attributes)
-      @markdown << "\n```\n"
+      @pre_block = false;
+      new_line
+      @markdown << "```"
+      new_line
     end
     def end_a(attributes)
-        if @last_href and not (['http','https'].include? URI(@last_href).scheme)
+      begin
+        if @last_href and not (['http','https'].include? URI(URI.escape(@last_href)).scheme)
             begin
               rp = URI(relative_url)
               rp.path = @last_href
@@ -152,21 +252,24 @@ class Html2Md
         @markdown << "](#{@last_href})" if @last_href
         @last_href = nil if @last_href
+      rescue
+      end
     end
     def start_ul(attributes)
-      @markdown << "\n" #if @list_tree[-1]
+      new_line
       @list_tree.push( { :type => :ul, :current_element => 0 } )
     end
     def end_ul(attributes)
       @list_tree.pop
-      @markdown << "\n" unless @list_tree[-1]
+      new_line unless @list_tree[-1]
     end
     def start_ol(attributes)
-      @markdown << "\n"# if @list_tree[-1]
+      new_line
       @list_tree.push( { :type => :ol, :current_element => 0 } )
     end
@@ -177,12 +280,21 @@ class Html2Md
     def start_li(attributes)
+      if /^(-|\d+.)\s+$/ =~ @markdown[-2]
+        @markdown.delete_at(-2)
+        @markdown.delete_at(-3)
+      end
+      @markdown[-2].gsub! /^\s+(-|\d+.)\s+$/,''
+      #Add Whitespace before the list item
       @list_tree.length.times do
         @markdown << "  "
       end
+      #Increment the Current Element to start at one
       @list_tree[-1][:current_element] += 1
       case @list_tree[-1][:type]
       when :ol
         @markdown << "#{ @list_tree[-1][:current_element] }. "
@@ -193,19 +305,51 @@ class Html2Md
     end
     def end_li(attributes)
-      @markdown << "\n" if @markdown[-1] != "\n" and @markdown[-1] != 10
+      new_line if @markdown[-1] != "\n" and @markdown[-1] != 10
     end
     def characters c
-      @last_cdata_length = c.chomp.length
-      if @list_tree[-1]
-        @markdown << c.gsub(/\n(\s*)?/,"").lstrip
-      else
-        @markdown << c.gsub(/\n(\s*)?/,"")
+      #Escape character data with _
+      c.gsub!('_','\_') unless @pre_block
+      #Collapse all whitespace into spaces
+      c.gsub!(/(\s+|\n|\r\n|\t)/, " ")
+      if c.rstrip.lstrip.chomp != ""
+        if @list_tree[-1]
+          #Strip whitespace at the start of the character data
+          c.gsub!(/\A(\r|\n|\s|\t)/,'')
+          c.chomp!
+          @last_cdata_length = c.chomp.length
+          @markdown << c
+        else
+          @last_cdata_length = c.chomp.length
+          @markdown << c
+        end
       end
     end
     def end_document
+      @markdown = @markdown.join('')
+      #Replace All Ancor Links
+      @markdown.gsub!(/\[.*\]\(#.*\)/,'')
+      #Remove all extra space at the end of a line
+      @markdown.gsub!(/ +$/,'')
+      #Add Hard Breaks
+      @markdown.gsub!(/\[\[::HARD_BREAK::\]\]/,"   \n")
+      #Collapse Superfulious Hard Line Breaks
+      #@markdown.gsub!(/(   \n+){1,}/,"   \n")
+      #Collapse Superfulious Line Breaks
       @markdown.gsub!(/\n{2,}/,"\n\n")
     end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: html2md
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-03-23 00:00:00.000000000 Z
+date: 2012-03-28 00:00:00.000000000 Z
 dependencies: []
 description: ! '  Converts Basic HTML to markdown