RubyGems - word-to-markdown - Versions diffs - 0.0.3 → 0.0.4 - Mend

word-to-markdown 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 438abfb05468da472c652d87e12b80d40f714572
-  data.tar.gz: b43be781c4a35c7d5968bb97b8620af3cfd10bb8
+  metadata.gz: 2f7734f2a7a7b2ca0f3ac11ea9350cb5b34afdb3
+  data.tar.gz: f8b438412256bad44a7ca1ef5c094ef045a202be
 SHA512:
-  metadata.gz: 0fe70f87fdb8524e85316fccf6784f23e43e60f614e6037e7289583804b9a0f0fb4930eb79c60c23b67e11c3e2c57c87aa16a36a67d07e3c1aba375938b50614
-  data.tar.gz: e0ae583d9d9e343b9b722e236018a108e27ea454aec521607dd807a2dc63a4f96ca2a8392eec0024af1c1f723aa30c83621456c75726559df7180cacb5c3f310
+  metadata.gz: de20ce24839bc2e32f2405bc7b2acb37c1b0a78ca5a7bab06d16a3467639f6e00cccc8193a370b1fc695277857c9d69e8be87eb8ae82953cede2fc6b13732c91
+  data.tar.gz: 335420b487bd2bb17a11d8ec80321053e615502ab0dcb7217068ca5bd0a61a62a20a49ed437244bde7ae8494b099a6699b4c8cd823b8e6d14296985466ce7d78

data/lib/word-to-markdown.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require 'reverse_markdown'
 require 'descriptive_statistics'
+require 'premailer'
 class WordToMarkdown
@@ -34,9 +35,14 @@ class WordToMarkdown
   end
   # Perform pre-processing normalization
+  #
+  # html - the raw html input from the export
+  #
+  # Returns the normalized html
   def normalize(html)
     encoding = encoding(html)
     html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
+    html = Premailer.new(html, :with_html_string => true, :input_encoding => "UTF-8").to_inline_css
     html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
     html.gsub! /\n|\r/," "         # Remove linebreaks
     html.gsub! /“|”/, '"'          # Straighten curly double quotes
@@ -44,18 +50,26 @@ class WordToMarkdown
     html
   end
+  # Pretty print the class in console
   def inspect
     "<WordToMarkdown path=\"#{@path}\">"
   end
+  # Returns the markdown representation of the document
   def to_s
     @markdown ||= scrub_whitespace(ReverseMarkdown.parse(html))
   end
+  # Returns the html representation of the document
   def html
     doc.to_html
   end
+  # Determine the document encoding
+  #
+  # html - the raw html export
+  #
+  # Returns the encoding, defaulting to "UTF-8"
   def encoding(html)
     match = html.encode("UTF-8", :invalid => :replace, :replace => "").match(/charset=([^\"]+)/)
     if match
@@ -65,6 +79,11 @@ class WordToMarkdown
     end
   end
+  # Perform post-processing normalization of certain Word quirks
+  #
+  # string - the markdown representation of the document
+  #
+  # Returns the normalized markdown
   def scrub_whitespace(string)
     string.sub!(/\A[[:space:]]+/,'')                # leading whitespace
     string.sub!(/[[:space:]]+\z/,'')                # trailing whitespace
@@ -73,6 +92,7 @@ class WordToMarkdown
     string.gsub!(/^-[[:space:]·]*/,"- ")            # Unnumbered lists
     string.gsub!(/\u00A0/, "")                      # Unicode non-breaking spaces, injected as tabs
     string.gsub!(/^ /, "")                          # Leading spaces
+    string.gsub!(/^- (\d+)\./, "\\1.")              # OL's wrapped in UL's see http://bit.ly/1ivqxy8
     string
   end
@@ -91,12 +111,18 @@ class WordToMarkdown
   def font_sizes
     @font_sizes ||= begin
       sizes = []
-      implicit_headings.each { |element| sizes.push element.font_size }
-      sizes
+      doc.css("[style]").each do |element|
+        sizes.push element.font_size.round(-1) unless element.font_size.nil?
+      end
+      sizes.uniq.sort
     end
   end
   # Given a Nokogiri node, guess what heading it represents, if any
+  #
+  # node - the Nokogiri node
+  #
+  # Returns the heading tag (e.g., H1), or nil
   def guess_heading(node)
     return nil if node.font_size == nil
     [*1...HEADING_DEPTH].each do |heading|
@@ -107,6 +133,10 @@ class WordToMarkdown
   # Minimum font size required for a given heading
   # e.g., H(2) would represent the minimum font size of an implicit h2
+  #
+  # n - the heading number, e.g., 1, 2
+  #
+  # returns the minimum font size as an integer
   def h(n)
     font_sizes.percentile ((HEADING_DEPTH-1)-n) * HEADING_STEP
   end
@@ -133,6 +163,7 @@ module Nokogiri
       FONT_SIZE_REGEX = /\bfont-size:\s?([0-9\.]+)pt;?\b/
+      # Extend nokogiri nodes to guess their font size where defined
       def font_size
         @font_size ||= begin
           match = FONT_SIZE_REGEX.match attr("style")

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: word-to-markdown
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 platform: ruby
 authors:
 - Ben Balter
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-03-24 00:00:00.000000000 Z
+date: 2014-03-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: reverse_markdown
@@ -38,6 +38,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: 1.1.3
+- !ruby/object:Gem::Dependency
+  name: premailer
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement