word-to-markdown 0.0.4 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/word-to-markdown.rb +79 -13
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 81c1f8bce02e417b5338908ad224b5eede8170f1
|
4
|
+
data.tar.gz: 48ddedc28faa4f8de21b0038e8f758585fa376e5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 28a0c7229327a22874ee65dd7bf748c853deb9aafe83ad05ca7d228d701847f9707e7655dd354ddda4aa5b5400ca151ec0a67590aaf7bcc1fe83ade50bc4befc
|
7
|
+
data.tar.gz: 6f1e0ae2c3cca7ee0c9bce22439b6f21c79ad572851fa6056593f1fb0c1d26bee289bb586cdfab11102314cb7261e3ae0c7e515c04185fcb6d52bbc5e036c3b6
|
data/lib/word-to-markdown.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'reverse_markdown'
|
2
2
|
require 'descriptive_statistics'
|
3
3
|
require 'premailer'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'nokogiri-styles'
|
4
6
|
|
5
7
|
class WordToMarkdown
|
6
8
|
|
@@ -12,6 +14,7 @@ class WordToMarkdown
|
|
12
14
|
MsoListParagraphCxSpFirst
|
13
15
|
MsoListParagraphCxSpMiddle
|
14
16
|
MsoListParagraphCxSpLast
|
17
|
+
MsoListParagraph
|
15
18
|
]
|
16
19
|
|
17
20
|
attr_reader :path, :doc
|
@@ -44,6 +47,7 @@ class WordToMarkdown
|
|
44
47
|
html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
|
45
48
|
html = Premailer.new(html, :with_html_string => true, :input_encoding => "UTF-8").to_inline_css
|
46
49
|
html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
|
50
|
+
html.gsub! /\<\/?w:[^>]+>/, "" # Strip everything in the word namespace
|
47
51
|
html.gsub! /\n|\r/," " # Remove linebreaks
|
48
52
|
html.gsub! /“|”/, '"' # Straighten curly double quotes
|
49
53
|
html.gsub! /‘|’/, "'" # Straighten curly single quotes
|
@@ -62,7 +66,7 @@ class WordToMarkdown
|
|
62
66
|
|
63
67
|
# Returns the html representation of the document
|
64
68
|
def html
|
65
|
-
doc.to_html
|
69
|
+
doc.to_html.gsub("</li>\n", "</li>")
|
66
70
|
end
|
67
71
|
|
68
72
|
# Determine the document encoding
|
@@ -88,11 +92,7 @@ class WordToMarkdown
|
|
88
92
|
string.sub!(/\A[[:space:]]+/,'') # leading whitespace
|
89
93
|
string.sub!(/[[:space:]]+\z/,'') # trailing whitespace
|
90
94
|
string.gsub!(/\n\n \n\n/,"\n\n") # Quadruple line breaks
|
91
|
-
string.gsub!(/^([0-9]+)\.[[:space:]]*/,"\\1. ") # Numbered lists
|
92
|
-
string.gsub!(/^-[[:space:]·]*/,"- ") # Unnumbered lists
|
93
95
|
string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
|
94
|
-
string.gsub!(/^ /, "") # Leading spaces
|
95
|
-
string.gsub!(/^- (\d+)\./, "\\1.") # OL's wrapped in UL's see http://bit.ly/1ivqxy8
|
96
96
|
string
|
97
97
|
end
|
98
98
|
|
@@ -141,10 +141,56 @@ class WordToMarkdown
|
|
141
141
|
font_sizes.percentile ((HEADING_DEPTH-1)-n) * HEADING_STEP
|
142
142
|
end
|
143
143
|
|
144
|
+
# CSS selector to select non-semantic lists
|
145
|
+
def li_selectors
|
146
|
+
".#{LI_SELECTORS.join(",.")}"
|
147
|
+
end
|
148
|
+
|
144
149
|
# Try to make semantic markup explicit where implied by the export
|
145
150
|
def semanticize!
|
146
|
-
|
147
|
-
|
151
|
+
|
152
|
+
# Semanticize lists
|
153
|
+
indent_level = 0
|
154
|
+
doc.css(li_selectors).each do |node|
|
155
|
+
|
156
|
+
# Determine if this is an implicit UL or an implicit OL list item
|
157
|
+
if node.classes.include?("MsoListParagraph") || node.content.match(/^[a-zA-Z0-9]+\./)
|
158
|
+
list_type = "ol"
|
159
|
+
else
|
160
|
+
list_type = "ul"
|
161
|
+
end
|
162
|
+
|
163
|
+
# Determine parent node for this li, creating it if necessary
|
164
|
+
if node.indent > indent_level
|
165
|
+
list = Nokogiri::XML::Node.new list_type, @doc
|
166
|
+
list.classes = ["list", "indent#{node.indent}"]
|
167
|
+
if node.indent == 1
|
168
|
+
list.parent = node.parent
|
169
|
+
else
|
170
|
+
list.parent = node.parent.css(".indent#{node.indent-1} li").last
|
171
|
+
end
|
172
|
+
else
|
173
|
+
list = node.parent.css(".indent#{node.indent}").last
|
174
|
+
end
|
175
|
+
|
176
|
+
# Note our current nesting depth
|
177
|
+
indent_level = node.indent
|
178
|
+
|
179
|
+
# Convert list paragraphs to actual numbered and unnumbered lists
|
180
|
+
node.node_name = "li"
|
181
|
+
node.parent = list
|
182
|
+
|
183
|
+
# Scrub unicode bullets
|
184
|
+
span = node.css("span:first")[1]
|
185
|
+
if span && span.styles["mso-list"] && span.styles["mso-list"] == "Ignore"
|
186
|
+
span.content = span.content[1..-1] unless span.content.match /^\d+\./
|
187
|
+
end
|
188
|
+
|
189
|
+
# Convert all pseudo-numbered list items into numbered list items, e.g., ii. => 2.
|
190
|
+
node.content = node.content.gsub /^[[:space:] ]+/, ""
|
191
|
+
node.content = node.content.gsub /^[a-zA-Z0-9]+\.[[:space:]]+/, ""
|
192
|
+
|
193
|
+
end
|
148
194
|
|
149
195
|
# Try to guess headings where implicit, based on font size
|
150
196
|
implicit_headings.each do |element|
|
@@ -161,14 +207,34 @@ module Nokogiri
|
|
161
207
|
module XML
|
162
208
|
class Element
|
163
209
|
|
164
|
-
|
210
|
+
def indent
|
211
|
+
if styles['mso-list']
|
212
|
+
styles['mso-list'].split(" ")[1].sub("level","").to_i
|
213
|
+
else
|
214
|
+
(left_margin / 0.5).to_i
|
215
|
+
end
|
216
|
+
end
|
217
|
+
|
218
|
+
# The node's left-margin
|
219
|
+
# Used for parsing nested Lis
|
220
|
+
#
|
221
|
+
# Returns a float with the left margin
|
222
|
+
def left_margin
|
223
|
+
if styles['margin-left']
|
224
|
+
styles['margin-left'].to_f
|
225
|
+
elsif styles['margin']
|
226
|
+
styles['margin'].split(" ").last.to_f
|
227
|
+
else
|
228
|
+
0
|
229
|
+
end
|
230
|
+
end
|
165
231
|
|
166
|
-
#
|
232
|
+
# The node's font size
|
233
|
+
# Used for guessing heading sizes
|
234
|
+
#
|
235
|
+
# Returns a float with the font-size
|
167
236
|
def font_size
|
168
|
-
|
169
|
-
match = FONT_SIZE_REGEX.match attr("style")
|
170
|
-
match[1].to_i unless match.nil?
|
171
|
-
end
|
237
|
+
styles['font-size'].to_f if styles['font-size']
|
172
238
|
end
|
173
239
|
end
|
174
240
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: word-to-markdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-03-
|
11
|
+
date: 2014-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: reverse_markdown
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: nokogiri-styles
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: rake
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|