RubyGems - word-to-markdown - Versions diffs - 0.0.1 → 0.0.2 - Mend

word-to-markdown 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 858e79fff023fe2b3150484359c1bccc480182bf
-  data.tar.gz: a6c2ac95b8be35efa54bd1938c31d08dc8bb0df0
+  metadata.gz: 64e2eeda272f608f1d1bfd5690abdeebdd1f31d8
+  data.tar.gz: ad2cd8d38fe5893463097ddb41fd973e001642df
 SHA512:
-  metadata.gz: daa684a5bd4bda7eb465bc4c58c75dea0080d095943ead5d6e0696cd4c941cd80f268b3b9f638fc1e02f7875c50de757e3c85331054ee89bd173228cc650922f
-  data.tar.gz: 73466db83836034919b906dee2b549ced6077f8b6a343e9f19bb2488942056e618027873a3b43fa6bbaddcdb0a54acb9f010d6115cc1ad413269c5645ac775c8
+  metadata.gz: 37da10c45c34e29bd671c2d6833e81f754eaea14e07fbca693eb8b5d813db9a5a508d2bb483a0045f0d5df8de3d704db0ec8bfe25f2345c6358dd8d83eaea003
+  data.tar.gz: cff63c72bbc8fba7e994454875c01b56d9d239fa822762e30d317519d28e3ae7821f9414be923733f4a1cb48c888dcd7fd15c0c346c07d510687f11782507f55

data/lib/word-to-markdown.rb CHANGED

@@ -5,27 +5,62 @@ class WordToMarkdown
   HEADING_DEPTH = 6 # Number of headings to guess, e.g., h6
   HEADING_STEP = 100/HEADING_DEPTH
+  MIN_HEADING_SIZE = 20
   LI_SELECTORS = %w[
     MsoListParagraphCxSpFirst
     MsoListParagraphCxSpMiddle
     MsoListParagraphCxSpLast
   ]
-  attr_reader :path, :doc, :html
-  def initialize(path)
-    @path = path
-    @html = File.open(@path).read.encode("UTF-8", :invalid => :replace, :replace => "")
-    @doc = Nokogiri::HTML @html
+  attr_reader :path, :doc
+  # Create a new WordToMarkdown object
+  #
+  # input - a HTML string or path to an HTML file
+  #
+  # Returns the WordToMarkdown object
+  def initialize(input)
+    path = File.expand_path input, Dir.pwd
+    if File.exist?(path)
+      html = File.open(path).read
+      @path = path
+    else
+      @path = String
+      html = input.to_s
+    end
+    @doc = Nokogiri::HTML normalize(html)
     semanticize!
   end
+  # Perform pre-processing normalization
+  def normalize(html)
+    encoding = encoding(html)
+    html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
+    html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
+    html.gsub! /\n|\r/," "         # remove linebreaks
+    html
+  end
   def inspect
     "<WordToMarkdown path=\"#{@path}\">"
   end
   def to_s
-    @markdown ||= scrub_whitespace(ReverseMarkdown.parse(@doc.to_html))
+    @markdown ||= scrub_whitespace(ReverseMarkdown.parse(html))
+  end
+  def html
+    @doc.to_html
+  end
+  def encoding(html)
+    match = html.encode("UTF-8", :invalid => :replace, :replace => "").match(/charset=([^\"]+)/)
+    if match
+      match[1].sub("macintosh", "MacRoman")
+    else
+      "UTF-8"
+    end
   end
   def scrub_whitespace(string)
@@ -33,7 +68,9 @@ class WordToMarkdown
     string.sub!(/[[:space:]]+\z/,'')                # trailing whitespace
     string.gsub!(/\n\n \n\n/,"\n\n")                # Quadruple line breaks
     string.gsub!(/^([0-9]+)\.[[:space:]]*/,"\\1. ") # Numbered lists
-    string.gsub!(/^-[[:space:]]*/,"- ")             # Unnumbered lists
+    string.gsub!(/^-[[:space:]·]*/,"- ")            # Unnumbered lists
+    string.gsub!(/\u00A0/, "")                      # Unicode non-breaking spaces, injected as tabs
+    string.gsub!(/^ /, "")                          # Leading spaces
     string
   end
@@ -42,7 +79,7 @@ class WordToMarkdown
     @implicit_headings ||= begin
       headings = []
       @doc.css("[style]").each do |element|
-        headings.push element unless element.font_size.nil?
+        headings.push element unless element.font_size.nil? || element.font_size < MIN_HEADING_SIZE
       end
       headings
     end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: word-to-markdown
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - Ben Balter
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-03-22 00:00:00.000000000 Z
+date: 2014-03-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: reverse_markdown