RubyGems - word-to-markdown - Versions diffs - 0.0.4 → 0.1.0 - Mend

word-to-markdown 0.0.4 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 2f7734f2a7a7b2ca0f3ac11ea9350cb5b34afdb3
-  data.tar.gz: f8b438412256bad44a7ca1ef5c094ef045a202be
+  metadata.gz: 81c1f8bce02e417b5338908ad224b5eede8170f1
+  data.tar.gz: 48ddedc28faa4f8de21b0038e8f758585fa376e5
 SHA512:
-  metadata.gz: de20ce24839bc2e32f2405bc7b2acb37c1b0a78ca5a7bab06d16a3467639f6e00cccc8193a370b1fc695277857c9d69e8be87eb8ae82953cede2fc6b13732c91
-  data.tar.gz: 335420b487bd2bb17a11d8ec80321053e615502ab0dcb7217068ca5bd0a61a62a20a49ed437244bde7ae8494b099a6699b4c8cd823b8e6d14296985466ce7d78
+  metadata.gz: 28a0c7229327a22874ee65dd7bf748c853deb9aafe83ad05ca7d228d701847f9707e7655dd354ddda4aa5b5400ca151ec0a67590aaf7bcc1fe83ade50bc4befc
+  data.tar.gz: 6f1e0ae2c3cca7ee0c9bce22439b6f21c79ad572851fa6056593f1fb0c1d26bee289bb586cdfab11102314cb7261e3ae0c7e515c04185fcb6d52bbc5e036c3b6

data/lib/word-to-markdown.rb CHANGED

@@ -1,6 +1,8 @@
 require 'reverse_markdown'
 require 'descriptive_statistics'
 require 'premailer'
+require 'nokogiri'
+require 'nokogiri-styles'
 class WordToMarkdown
@@ -12,6 +14,7 @@ class WordToMarkdown
     MsoListParagraphCxSpFirst
     MsoListParagraphCxSpMiddle
     MsoListParagraphCxSpLast
+    MsoListParagraph
   ]
   attr_reader :path, :doc
@@ -44,6 +47,7 @@ class WordToMarkdown
     html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
     html = Premailer.new(html, :with_html_string => true, :input_encoding => "UTF-8").to_inline_css
     html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
+    html.gsub! /\<\/?w:[^>]+>/, "" # Strip everything in the word namespace
     html.gsub! /\n|\r/," "         # Remove linebreaks
     html.gsub! /“|”/, '"'          # Straighten curly double quotes
     html.gsub! /‘|’/, "'"          # Straighten curly single quotes
@@ -62,7 +66,7 @@ class WordToMarkdown
   # Returns the html representation of the document
   def html
-    doc.to_html
+    doc.to_html.gsub("</li>\n", "</li>")
   end
   # Determine the document encoding
@@ -88,11 +92,7 @@ class WordToMarkdown
     string.sub!(/\A[[:space:]]+/,'')                # leading whitespace
     string.sub!(/[[:space:]]+\z/,'')                # trailing whitespace
     string.gsub!(/\n\n \n\n/,"\n\n")                # Quadruple line breaks
-    string.gsub!(/^([0-9]+)\.[[:space:]]*/,"\\1. ") # Numbered lists
-    string.gsub!(/^-[[:space:]·]*/,"- ")            # Unnumbered lists
     string.gsub!(/\u00A0/, "")                      # Unicode non-breaking spaces, injected as tabs
-    string.gsub!(/^ /, "")                          # Leading spaces
-    string.gsub!(/^- (\d+)\./, "\\1.")              # OL's wrapped in UL's see http://bit.ly/1ivqxy8
     string
   end
@@ -141,10 +141,56 @@ class WordToMarkdown
     font_sizes.percentile ((HEADING_DEPTH-1)-n) * HEADING_STEP
   end
+  # CSS selector to select non-symantic lists
+  def li_selectors
+    ".#{LI_SELECTORS.join(",.")}"
+  end
   # Try to make semantic markup explicit where implied by the export
   def semanticize!
-    # Convert unnumbered list paragraphs to actual unnumbered lists
-    doc.css(".#{LI_SELECTORS.join(",.")}").each { |node| node.node_name = "li" }
+    # Semanticize lists
+    indent_level = 0
+    doc.css(li_selectors).each do |node|
+      # Determine if this is an implicit UL or an implicit OL list item
+      if node.classes.include?("MsoListParagraph") || node.content.match(/^[a-zA-Z0-9]+\./)
+        list_type = "ol"
+      else
+        list_type = "ul"
+      end
+      # Determine parent node for this li, creating it if necessary
+      if node.indent > indent_level
+        list = Nokogiri::XML::Node.new list_type, @doc
+        list.classes = ["list", "indent#{node.indent}"]
+        if node.indent == 1
+          list.parent = node.parent
+        else
+          list.parent = node.parent.css(".indent#{node.indent-1} li").last
+        end
+      else
+        list = node.parent.css(".indent#{node.indent}").last
+      end
+      # Note our current nesting depth
+      indent_level = node.indent
+      # Convert list paragraphs to actual numbered and unnumbered lists
+      node.node_name = "li"
+      node.parent = list
+      # Scrub unicode bullets
+      span = node.css("span:first")[1]
+      if span && span.styles["mso-list"] && span.styles["mso-list"] == "Ignore"
+        span.content = span.content[1..-1] unless span.content.match /^\d+\./
+      end
+      # Convert all pseudo-numbered list items into numbered list items, e.g., ii. => 2.
+      node.content = node.content.gsub /^[[:space:] ]+/, ""
+      node.content = node.content.gsub /^[a-zA-Z0-9]+\.[[:space:]]+/, ""
+    end
     # Try to guess heading where implicit bassed on font size
     implicit_headings.each do |element|
@@ -161,14 +207,34 @@ module Nokogiri
   module XML
     class Element
-      FONT_SIZE_REGEX = /\bfont-size:\s?([0-9\.]+)pt;?\b/
+      def indent
+        if styles['mso-list']
+          styles['mso-list'].split(" ")[1].sub("level","").to_i
+        else
+          (left_margin / 0.5).to_i
+        end
+      end
+      # The node's left-margin
+      # Used for parsing nested Lis
+      #
+      # Returns a float with the left margin
+      def left_margin
+        if styles['margin-left']
+          styles['margin-left'].to_f
+        elsif styles['margin']
+          styles['margin'].split(" ").last.to_f
+        else
+          0
+        end
+      end
-      # Extend nokogiri nodes to guess their font size where defined
+      # The node's font size
+      # Used for guessing heading sizes
+      #
+      # Returns a float with the font-size
       def font_size
-        @font_size ||= begin
-          match = FONT_SIZE_REGEX.match attr("style")
-          match[1].to_i unless match.nil?
-        end
+        styles['font-size'].to_f if styles['font-size']
       end
     end
   end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: word-to-markdown
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.1.0
 platform: ruby
 authors:
 - Ben Balter
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-03-26 00:00:00.000000000 Z
+date: 2014-03-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: reverse_markdown
@@ -52,6 +52,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri-styles
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement