word-to-markdown 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/word-to-markdown.rb +33 -2
  3. metadata +16 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 438abfb05468da472c652d87e12b80d40f714572
4
- data.tar.gz: b43be781c4a35c7d5968bb97b8620af3cfd10bb8
3
+ metadata.gz: 2f7734f2a7a7b2ca0f3ac11ea9350cb5b34afdb3
4
+ data.tar.gz: f8b438412256bad44a7ca1ef5c094ef045a202be
5
5
  SHA512:
6
- metadata.gz: 0fe70f87fdb8524e85316fccf6784f23e43e60f614e6037e7289583804b9a0f0fb4930eb79c60c23b67e11c3e2c57c87aa16a36a67d07e3c1aba375938b50614
7
- data.tar.gz: e0ae583d9d9e343b9b722e236018a108e27ea454aec521607dd807a2dc63a4f96ca2a8392eec0024af1c1f723aa30c83621456c75726559df7180cacb5c3f310
6
+ metadata.gz: de20ce24839bc2e32f2405bc7b2acb37c1b0a78ca5a7bab06d16a3467639f6e00cccc8193a370b1fc695277857c9d69e8be87eb8ae82953cede2fc6b13732c91
7
+ data.tar.gz: 335420b487bd2bb17a11d8ec80321053e615502ab0dcb7217068ca5bd0a61a62a20a49ed437244bde7ae8494b099a6699b4c8cd823b8e6d14296985466ce7d78
@@ -1,5 +1,6 @@
1
1
  require 'reverse_markdown'
2
2
  require 'descriptive_statistics'
3
+ require 'premailer'
3
4
 
4
5
  class WordToMarkdown
5
6
 
@@ -34,9 +35,14 @@ class WordToMarkdown
34
35
  end
35
36
 
36
37
  # Perform pre-processing normalization
38
+ #
39
+ # html - the raw html input from the export
40
+ #
41
+ # Returns the normalized html
37
42
  def normalize(html)
38
43
  encoding = encoding(html)
39
44
  html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
45
+ html = Premailer.new(html, :with_html_string => true, :input_encoding => "UTF-8").to_inline_css
40
46
  html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
41
47
  html.gsub! /\n|\r/," " # Remove linebreaks
42
48
  html.gsub! /“|”/, '"' # Straighten curly double quotes
@@ -44,18 +50,26 @@ class WordToMarkdown
44
50
  html
45
51
  end
46
52
 
53
+ # Pretty print the class in console
47
54
  def inspect
48
55
  "<WordToMarkdown path=\"#{@path}\">"
49
56
  end
50
57
 
58
+ # Returns the markdown representation of the document
51
59
  def to_s
52
60
  @markdown ||= scrub_whitespace(ReverseMarkdown.parse(html))
53
61
  end
54
62
 
63
+ # Returns the html representation of the document
55
64
  def html
56
65
  doc.to_html
57
66
  end
58
67
 
68
+ # Determine the document encoding
69
+ #
70
+ # html - the raw html export
71
+ #
72
+ # Returns the encoding, defaulting to "UTF-8"
59
73
  def encoding(html)
60
74
  match = html.encode("UTF-8", :invalid => :replace, :replace => "").match(/charset=([^\"]+)/)
61
75
  if match
@@ -65,6 +79,11 @@ class WordToMarkdown
65
79
  end
66
80
  end
67
81
 
82
+ # Perform post-processing normalization of certain Word quirks
83
+ #
84
+ # string - the markdown representation of the document
85
+ #
86
+ # Returns the normalized markdown
68
87
  def scrub_whitespace(string)
69
88
  string.sub!(/\A[[:space:]]+/,'') # leading whitespace
70
89
  string.sub!(/[[:space:]]+\z/,'') # trailing whitespace
@@ -73,6 +92,7 @@ class WordToMarkdown
73
92
  string.gsub!(/^-[[:space:]·]*/,"- ") # Unnumbered lists
74
93
  string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
75
94
  string.gsub!(/^ /, "") # Leading spaces
95
+ string.gsub!(/^- (\d+)\./, "\\1.") # OL's wrapped in UL's see http://bit.ly/1ivqxy8
76
96
  string
77
97
  end
78
98
 
@@ -91,12 +111,18 @@ class WordToMarkdown
91
111
  def font_sizes
92
112
  @font_sizes ||= begin
93
113
  sizes = []
94
- implicit_headings.each { |element| sizes.push element.font_size }
95
- sizes
114
+ doc.css("[style]").each do |element|
115
+ sizes.push element.font_size.round(-1) unless element.font_size.nil?
116
+ end
117
+ sizes.uniq.sort
96
118
  end
97
119
  end
98
120
 
99
121
  # Given a Nokogiri node, guess what heading it represents, if any
122
+ #
123
+ # node - the Nokogiri node
124
+ #
125
+ # returns the heading tag (e.g., H1), or nil
100
126
  def guess_heading(node)
101
127
  return nil if node.font_size == nil
102
128
  [*1...HEADING_DEPTH].each do |heading|
@@ -107,6 +133,10 @@ class WordToMarkdown
107
133
 
108
134
  # Minimum font size required for a given heading
109
135
  # e.g., H(2) would represent the minimum font size of an implicit h2
136
+ #
137
+ # n - the heading number, e.g., 1, 2
138
+ #
139
+ # returns the minimum font size as an integer
110
140
  def h(n)
111
141
  font_sizes.percentile ((HEADING_DEPTH-1)-n) * HEADING_STEP
112
142
  end
@@ -133,6 +163,7 @@ module Nokogiri
133
163
 
134
164
  FONT_SIZE_REGEX = /\bfont-size:\s?([0-9\.]+)pt;?\b/
135
165
 
166
+ # Extend nokogiri nodes to guess their font size where defined
136
167
  def font_size
137
168
  @font_size ||= begin
138
169
  match = FONT_SIZE_REGEX.match attr("style")
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: word-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-24 00:00:00.000000000 Z
11
+ date: 2014-03-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: reverse_markdown
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: 1.1.3
41
+ - !ruby/object:Gem::Dependency
42
+ name: premailer
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rake
43
57
  requirement: !ruby/object:Gem::Requirement