word-to-markdown 0.0.4 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/word-to-markdown.rb +79 -13
  3. metadata +16 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2f7734f2a7a7b2ca0f3ac11ea9350cb5b34afdb3
4
- data.tar.gz: f8b438412256bad44a7ca1ef5c094ef045a202be
3
+ metadata.gz: 81c1f8bce02e417b5338908ad224b5eede8170f1
4
+ data.tar.gz: 48ddedc28faa4f8de21b0038e8f758585fa376e5
5
5
  SHA512:
6
- metadata.gz: de20ce24839bc2e32f2405bc7b2acb37c1b0a78ca5a7bab06d16a3467639f6e00cccc8193a370b1fc695277857c9d69e8be87eb8ae82953cede2fc6b13732c91
7
- data.tar.gz: 335420b487bd2bb17a11d8ec80321053e615502ab0dcb7217068ca5bd0a61a62a20a49ed437244bde7ae8494b099a6699b4c8cd823b8e6d14296985466ce7d78
6
+ metadata.gz: 28a0c7229327a22874ee65dd7bf748c853deb9aafe83ad05ca7d228d701847f9707e7655dd354ddda4aa5b5400ca151ec0a67590aaf7bcc1fe83ade50bc4befc
7
+ data.tar.gz: 6f1e0ae2c3cca7ee0c9bce22439b6f21c79ad572851fa6056593f1fb0c1d26bee289bb586cdfab11102314cb7261e3ae0c7e515c04185fcb6d52bbc5e036c3b6
@@ -1,6 +1,8 @@
1
1
  require 'reverse_markdown'
2
2
  require 'descriptive_statistics'
3
3
  require 'premailer'
4
+ require 'nokogiri'
5
+ require 'nokogiri-styles'
4
6
 
5
7
  class WordToMarkdown
6
8
 
@@ -12,6 +14,7 @@ class WordToMarkdown
12
14
  MsoListParagraphCxSpFirst
13
15
  MsoListParagraphCxSpMiddle
14
16
  MsoListParagraphCxSpLast
17
+ MsoListParagraph
15
18
  ]
16
19
 
17
20
  attr_reader :path, :doc
@@ -44,6 +47,7 @@ class WordToMarkdown
44
47
  html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
45
48
  html = Premailer.new(html, :with_html_string => true, :input_encoding => "UTF-8").to_inline_css
46
49
  html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
50
+ html.gsub! /\<\/?w:[^>]+>/, "" # Strip everything in the word namespace
47
51
  html.gsub! /\n|\r/," " # Remove linebreaks
48
52
  html.gsub! /“|”/, '"' # Straighten curly double quotes
49
53
  html.gsub! /‘|’/, "'" # Straighten curly single quotes
@@ -62,7 +66,7 @@ class WordToMarkdown
62
66
 
63
67
  # Returns the html representation of the document
64
68
  def html
65
- doc.to_html
69
+ doc.to_html.gsub("</li>\n", "</li>")
66
70
  end
67
71
 
68
72
  # Determine the document encoding
@@ -88,11 +92,7 @@ class WordToMarkdown
88
92
  string.sub!(/\A[[:space:]]+/,'') # leading whitespace
89
93
  string.sub!(/[[:space:]]+\z/,'') # trailing whitespace
90
94
  string.gsub!(/\n\n \n\n/,"\n\n") # Quadruple line breaks
91
- string.gsub!(/^([0-9]+)\.[[:space:]]*/,"\\1. ") # Numbered lists
92
- string.gsub!(/^-[[:space:]·]*/,"- ") # Unnumbered lists
93
95
  string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
94
- string.gsub!(/^ /, "") # Leading spaces
95
- string.gsub!(/^- (\d+)\./, "\\1.") # OL's wrapped in UL's see http://bit.ly/1ivqxy8
96
96
  string
97
97
  end
98
98
 
@@ -141,10 +141,56 @@ class WordToMarkdown
141
141
  font_sizes.percentile ((HEADING_DEPTH-1)-n) * HEADING_STEP
142
142
  end
143
143
 
144
+ # CSS selector to select non-semantic lists
145
+ def li_selectors
146
+ ".#{LI_SELECTORS.join(",.")}"
147
+ end
148
+
144
149
  # Try to make semantic markup explicit where implied by the export
145
150
  def semanticize!
146
- # Convert unnumbered list paragraphs to actual unnumbered lists
147
- doc.css(".#{LI_SELECTORS.join(",.")}").each { |node| node.node_name = "li" }
151
+
152
+ # Semanticize lists
153
+ indent_level = 0
154
+ doc.css(li_selectors).each do |node|
155
+
156
+ # Determine if this is an implicit UL or an implicit OL list item
157
+ if node.classes.include?("MsoListParagraph") || node.content.match(/^[a-zA-Z0-9]+\./)
158
+ list_type = "ol"
159
+ else
160
+ list_type = "ul"
161
+ end
162
+
163
+ # Determine parent node for this li, creating it if necessary
164
+ if node.indent > indent_level
165
+ list = Nokogiri::XML::Node.new list_type, @doc
166
+ list.classes = ["list", "indent#{node.indent}"]
167
+ if node.indent == 1
168
+ list.parent = node.parent
169
+ else
170
+ list.parent = node.parent.css(".indent#{node.indent-1} li").last
171
+ end
172
+ else
173
+ list = node.parent.css(".indent#{node.indent}").last
174
+ end
175
+
176
+ # Note our current nesting depth
177
+ indent_level = node.indent
178
+
179
+ # Convert list paragraphs to actual numbered and unnumbered lists
180
+ node.node_name = "li"
181
+ node.parent = list
182
+
183
+ # Scrub unicode bullets
184
+ span = node.css("span:first")[1]
185
+ if span && span.styles["mso-list"] && span.styles["mso-list"] == "Ignore"
186
+ span.content = span.content[1..-1] unless span.content.match /^\d+\./
187
+ end
188
+
189
+ # Convert all pseudo-numbered list items into numbered list items, e.g., ii. => 2.
190
+ node.content = node.content.gsub /^[[:space:] ]+/, ""
191
+ node.content = node.content.gsub /^[a-zA-Z0-9]+\.[[:space:]]+/, ""
192
+
193
+ end
148
194
 
149
195
  # Try to guess heading where implicit based on font size
150
196
  implicit_headings.each do |element|
@@ -161,14 +207,34 @@ module Nokogiri
161
207
  module XML
162
208
  class Element
163
209
 
164
- FONT_SIZE_REGEX = /\bfont-size:\s?([0-9\.]+)pt;?\b/
210
+ def indent
211
+ if styles['mso-list']
212
+ styles['mso-list'].split(" ")[1].sub("level","").to_i
213
+ else
214
+ (left_margin / 0.5).to_i
215
+ end
216
+ end
217
+
218
+ # The node's left-margin
219
+ # Used for parsing nested Lis
220
+ #
221
+ # Returns a float with the left margin
222
+ def left_margin
223
+ if styles['margin-left']
224
+ styles['margin-left'].to_f
225
+ elsif styles['margin']
226
+ styles['margin'].split(" ").last.to_f
227
+ else
228
+ 0
229
+ end
230
+ end
165
231
 
166
- # Extend nokogiri nodes to guess their font size where defined
232
+ # The node's font size
233
+ # Used for guessing heading sizes
234
+ #
235
+ # Returns a float with the font-size
167
236
  def font_size
168
- @font_size ||= begin
169
- match = FONT_SIZE_REGEX.match attr("style")
170
- match[1].to_i unless match.nil?
171
- end
237
+ styles['font-size'].to_f if styles['font-size']
172
238
  end
173
239
  end
174
240
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: word-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-26 00:00:00.000000000 Z
11
+ date: 2014-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: reverse_markdown
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: nokogiri-styles
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: rake
57
71
  requirement: !ruby/object:Gem::Requirement