word-to-markdown 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/word-to-markdown.rb +33 -2
  3. metadata +16 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 438abfb05468da472c652d87e12b80d40f714572
4
- data.tar.gz: b43be781c4a35c7d5968bb97b8620af3cfd10bb8
3
+ metadata.gz: 2f7734f2a7a7b2ca0f3ac11ea9350cb5b34afdb3
4
+ data.tar.gz: f8b438412256bad44a7ca1ef5c094ef045a202be
5
5
  SHA512:
6
- metadata.gz: 0fe70f87fdb8524e85316fccf6784f23e43e60f614e6037e7289583804b9a0f0fb4930eb79c60c23b67e11c3e2c57c87aa16a36a67d07e3c1aba375938b50614
7
- data.tar.gz: e0ae583d9d9e343b9b722e236018a108e27ea454aec521607dd807a2dc63a4f96ca2a8392eec0024af1c1f723aa30c83621456c75726559df7180cacb5c3f310
6
+ metadata.gz: de20ce24839bc2e32f2405bc7b2acb37c1b0a78ca5a7bab06d16a3467639f6e00cccc8193a370b1fc695277857c9d69e8be87eb8ae82953cede2fc6b13732c91
7
+ data.tar.gz: 335420b487bd2bb17a11d8ec80321053e615502ab0dcb7217068ca5bd0a61a62a20a49ed437244bde7ae8494b099a6699b4c8cd823b8e6d14296985466ce7d78
@@ -1,5 +1,6 @@
1
1
  require 'reverse_markdown'
2
2
  require 'descriptive_statistics'
3
+ require 'premailer'
3
4
 
4
5
  class WordToMarkdown
5
6
 
@@ -34,9 +35,14 @@ class WordToMarkdown
34
35
  end
35
36
 
36
37
  # Perform pre-processing normalization
38
+ #
39
+ # html - the raw html input from the export
40
+ #
41
+ # Returns the normalized html
37
42
  def normalize(html)
38
43
  encoding = encoding(html)
39
44
  html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
45
+ html = Premailer.new(html, :with_html_string => true, :input_encoding => "UTF-8").to_inline_css
40
46
  html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
41
47
  html.gsub! /\n|\r/," " # Remove linebreaks
42
48
  html.gsub! /“|”/, '"' # Straighten curly double quotes
@@ -44,18 +50,26 @@ class WordToMarkdown
44
50
  html
45
51
  end
46
52
 
53
+ # Pretty print the class in console
47
54
  def inspect
48
55
  "<WordToMarkdown path=\"#{@path}\">"
49
56
  end
50
57
 
58
+ # Returns the markdown representation of the document
51
59
  def to_s
52
60
  @markdown ||= scrub_whitespace(ReverseMarkdown.parse(html))
53
61
  end
54
62
 
63
+ # Returns the html representation of the document
55
64
  def html
56
65
  doc.to_html
57
66
  end
58
67
 
68
+ # Determine the document encoding
69
+ #
70
+ # html - the raw html export
71
+ #
72
+ # Returns the encoding, defaulting to "UTF-8"
59
73
  def encoding(html)
60
74
  match = html.encode("UTF-8", :invalid => :replace, :replace => "").match(/charset=([^\"]+)/)
61
75
  if match
@@ -65,6 +79,11 @@ class WordToMarkdown
65
79
  end
66
80
  end
67
81
 
82
+ # Perform post-processing normalization of certain Word quirks
83
+ #
84
+ # string - the markdown representation of the document
85
+ #
86
+ # Returns the normalized markdown
68
87
  def scrub_whitespace(string)
69
88
  string.sub!(/\A[[:space:]]+/,'') # leading whitespace
70
89
  string.sub!(/[[:space:]]+\z/,'') # trailing whitespace
@@ -73,6 +92,7 @@ class WordToMarkdown
73
92
  string.gsub!(/^-[[:space:]·]*/,"- ") # Unnumbered lists
74
93
  string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
75
94
  string.gsub!(/^ /, "") # Leading spaces
95
+ string.gsub!(/^- (\d+)\./, "\\1.") # OL's wrapped in UL's see http://bit.ly/1ivqxy8
76
96
  string
77
97
  end
78
98
 
@@ -91,12 +111,18 @@ class WordToMarkdown
91
111
  def font_sizes
92
112
  @font_sizes ||= begin
93
113
  sizes = []
94
- implicit_headings.each { |element| sizes.push element.font_size }
95
- sizes
114
+ doc.css("[style]").each do |element|
115
+ sizes.push element.font_size.round(-1) unless element.font_size.nil?
116
+ end
117
+ sizes.uniq.sort
96
118
  end
97
119
  end
98
120
 
99
121
  # Given a Nokogiri node, guess what heading it represents, if any
122
+ #
123
+ # node - the Nokogiri node
124
+ #
125
+ # Returns the heading tag (e.g., H1), or nil
100
126
  def guess_heading(node)
101
127
  return nil if node.font_size == nil
102
128
  [*1...HEADING_DEPTH].each do |heading|
@@ -107,6 +133,10 @@ class WordToMarkdown
107
133
 
108
134
  # Minimum font size required for a given heading
109
135
  # e.g., H(2) would represent the minimum font size of an implicit h2
136
+ #
137
+ # n - the heading number, e.g., 1, 2
138
+ #
139
+ # Returns the minimum font size as an integer
110
140
  def h(n)
111
141
  font_sizes.percentile ((HEADING_DEPTH-1)-n) * HEADING_STEP
112
142
  end
@@ -133,6 +163,7 @@ module Nokogiri
133
163
 
134
164
  FONT_SIZE_REGEX = /\bfont-size:\s?([0-9\.]+)pt;?\b/
135
165
 
166
+ # Extend nokogiri nodes to guess their font size where defined
136
167
  def font_size
137
168
  @font_size ||= begin
138
169
  match = FONT_SIZE_REGEX.match attr("style")
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: word-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-24 00:00:00.000000000 Z
11
+ date: 2014-03-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: reverse_markdown
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: 1.1.3
41
+ - !ruby/object:Gem::Dependency
42
+ name: premailer
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rake
43
57
  requirement: !ruby/object:Gem::Requirement