word-to-markdown 0.0.4 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/word-to-markdown.rb +79 -13
  3. metadata +16 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2f7734f2a7a7b2ca0f3ac11ea9350cb5b34afdb3
4
- data.tar.gz: f8b438412256bad44a7ca1ef5c094ef045a202be
3
+ metadata.gz: 81c1f8bce02e417b5338908ad224b5eede8170f1
4
+ data.tar.gz: 48ddedc28faa4f8de21b0038e8f758585fa376e5
5
5
  SHA512:
6
- metadata.gz: de20ce24839bc2e32f2405bc7b2acb37c1b0a78ca5a7bab06d16a3467639f6e00cccc8193a370b1fc695277857c9d69e8be87eb8ae82953cede2fc6b13732c91
7
- data.tar.gz: 335420b487bd2bb17a11d8ec80321053e615502ab0dcb7217068ca5bd0a61a62a20a49ed437244bde7ae8494b099a6699b4c8cd823b8e6d14296985466ce7d78
6
+ metadata.gz: 28a0c7229327a22874ee65dd7bf748c853deb9aafe83ad05ca7d228d701847f9707e7655dd354ddda4aa5b5400ca151ec0a67590aaf7bcc1fe83ade50bc4befc
7
+ data.tar.gz: 6f1e0ae2c3cca7ee0c9bce22439b6f21c79ad572851fa6056593f1fb0c1d26bee289bb586cdfab11102314cb7261e3ae0c7e515c04185fcb6d52bbc5e036c3b6
@@ -1,6 +1,8 @@
1
1
  require 'reverse_markdown'
2
2
  require 'descriptive_statistics'
3
3
  require 'premailer'
4
+ require 'nokogiri'
5
+ require 'nokogiri-styles'
4
6
 
5
7
  class WordToMarkdown
6
8
 
@@ -12,6 +14,7 @@ class WordToMarkdown
12
14
  MsoListParagraphCxSpFirst
13
15
  MsoListParagraphCxSpMiddle
14
16
  MsoListParagraphCxSpLast
17
+ MsoListParagraph
15
18
  ]
16
19
 
17
20
  attr_reader :path, :doc
@@ -44,6 +47,7 @@ class WordToMarkdown
44
47
  html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
45
48
  html = Premailer.new(html, :with_html_string => true, :input_encoding => "UTF-8").to_inline_css
46
49
  html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
50
+ html.gsub! /\<\/?w:[^>]+>/, "" # Strip everything in the word namespace
47
51
  html.gsub! /\n|\r/," " # Remove linebreaks
48
52
  html.gsub! /“|”/, '"' # Straighten curly double quotes
49
53
  html.gsub! /‘|’/, "'" # Straighten curly single quotes
@@ -62,7 +66,7 @@ class WordToMarkdown
62
66
 
63
67
  # Returns the html representation of the document
64
68
  def html
65
- doc.to_html
69
+ doc.to_html.gsub("</li>\n", "</li>")
66
70
  end
67
71
 
68
72
  # Determine the document encoding
@@ -88,11 +92,7 @@ class WordToMarkdown
88
92
  string.sub!(/\A[[:space:]]+/,'') # leading whitespace
89
93
  string.sub!(/[[:space:]]+\z/,'') # trailing whitespace
90
94
  string.gsub!(/\n\n \n\n/,"\n\n") # Quadruple line breaks
91
- string.gsub!(/^([0-9]+)\.[[:space:]]*/,"\\1. ") # Numbered lists
92
- string.gsub!(/^-[[:space:]·]*/,"- ") # Unnumbered lists
93
95
  string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
94
- string.gsub!(/^ /, "") # Leading spaces
95
- string.gsub!(/^- (\d+)\./, "\\1.") # OL's wrapped in UL's see http://bit.ly/1ivqxy8
96
96
  string
97
97
  end
98
98
 
@@ -141,10 +141,56 @@ class WordToMarkdown
141
141
  font_sizes.percentile ((HEADING_DEPTH-1)-n) * HEADING_STEP
142
142
  end
143
143
 
144
+ # CSS selector to select non-semantic lists
145
+ def li_selectors
146
+ ".#{LI_SELECTORS.join(",.")}"
147
+ end
148
+
144
149
  # Try to make semantic markup explicit where implied by the export
145
150
  def semanticize!
146
- # Convert unnumbered list paragraphs to actual unnumbered lists
147
- doc.css(".#{LI_SELECTORS.join(",.")}").each { |node| node.node_name = "li" }
151
+
152
+ # Semanticize lists
153
+ indent_level = 0
154
+ doc.css(li_selectors).each do |node|
155
+
156
+ # Determine if this is an implicit UL or an implicit OL list item
157
+ if node.classes.include?("MsoListParagraph") || node.content.match(/^[a-zA-Z0-9]+\./)
158
+ list_type = "ol"
159
+ else
160
+ list_type = "ul"
161
+ end
162
+
163
+ # Determine parent node for this li, creating it if necessary
164
+ if node.indent > indent_level
165
+ list = Nokogiri::XML::Node.new list_type, @doc
166
+ list.classes = ["list", "indent#{node.indent}"]
167
+ if node.indent == 1
168
+ list.parent = node.parent
169
+ else
170
+ list.parent = node.parent.css(".indent#{node.indent-1} li").last
171
+ end
172
+ else
173
+ list = node.parent.css(".indent#{node.indent}").last
174
+ end
175
+
176
+ # Note our current nesting depth
177
+ indent_level = node.indent
178
+
179
+ # Convert list paragraphs to actual numbered and unnumbered lists
180
+ node.node_name = "li"
181
+ node.parent = list
182
+
183
+ # Scrub unicode bullets
184
+ span = node.css("span:first")[1]
185
+ if span && span.styles["mso-list"] && span.styles["mso-list"] == "Ignore"
186
+ span.content = span.content[1..-1] unless span.content.match /^\d+\./
187
+ end
188
+
189
+ # Convert all pseudo-numbered list items into numbered list items, e.g., ii. => 2.
190
+ node.content = node.content.gsub /^[[:space:] ]+/, ""
191
+ node.content = node.content.gsub /^[a-zA-Z0-9]+\.[[:space:]]+/, ""
192
+
193
+ end
148
194
 
149
195
  # Try to guess heading where implicit based on font size
150
196
  implicit_headings.each do |element|
@@ -161,14 +207,34 @@ module Nokogiri
161
207
  module XML
162
208
  class Element
163
209
 
164
- FONT_SIZE_REGEX = /\bfont-size:\s?([0-9\.]+)pt;?\b/
210
+ def indent
211
+ if styles['mso-list']
212
+ styles['mso-list'].split(" ")[1].sub("level","").to_i
213
+ else
214
+ (left_margin / 0.5).to_i
215
+ end
216
+ end
217
+
218
+ # The node's left-margin
219
+ # Used for parsing nested LIs
220
+ #
221
+ # Returns a float with the left margin
222
+ def left_margin
223
+ if styles['margin-left']
224
+ styles['margin-left'].to_f
225
+ elsif styles['margin']
226
+ styles['margin'].split(" ").last.to_f
227
+ else
228
+ 0
229
+ end
230
+ end
165
231
 
166
- # Extend nokogiri nodes to guess their font size where defined
232
+ # The node's font size
233
+ # Used for guessing heading sizes
234
+ #
235
+ # Returns a float with the font-size
167
236
  def font_size
168
- @font_size ||= begin
169
- match = FONT_SIZE_REGEX.match attr("style")
170
- match[1].to_i unless match.nil?
171
- end
237
+ styles['font-size'].to_f if styles['font-size']
172
238
  end
173
239
  end
174
240
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: word-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-26 00:00:00.000000000 Z
11
+ date: 2014-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: reverse_markdown
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: nokogiri-styles
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: rake
57
71
  requirement: !ruby/object:Gem::Requirement