word-to-markdown 0.0.4 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/word-to-markdown.rb +79 -13
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 81c1f8bce02e417b5338908ad224b5eede8170f1
|
4
|
+
data.tar.gz: 48ddedc28faa4f8de21b0038e8f758585fa376e5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 28a0c7229327a22874ee65dd7bf748c853deb9aafe83ad05ca7d228d701847f9707e7655dd354ddda4aa5b5400ca151ec0a67590aaf7bcc1fe83ade50bc4befc
|
7
|
+
data.tar.gz: 6f1e0ae2c3cca7ee0c9bce22439b6f21c79ad572851fa6056593f1fb0c1d26bee289bb586cdfab11102314cb7261e3ae0c7e515c04185fcb6d52bbc5e036c3b6
|
data/lib/word-to-markdown.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'reverse_markdown'
|
2
2
|
require 'descriptive_statistics'
|
3
3
|
require 'premailer'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'nokogiri-styles'
|
4
6
|
|
5
7
|
class WordToMarkdown
|
6
8
|
|
@@ -12,6 +14,7 @@ class WordToMarkdown
|
|
12
14
|
MsoListParagraphCxSpFirst
|
13
15
|
MsoListParagraphCxSpMiddle
|
14
16
|
MsoListParagraphCxSpLast
|
17
|
+
MsoListParagraph
|
15
18
|
]
|
16
19
|
|
17
20
|
attr_reader :path, :doc
|
@@ -44,6 +47,7 @@ class WordToMarkdown
|
|
44
47
|
html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
|
45
48
|
html = Premailer.new(html, :with_html_string => true, :input_encoding => "UTF-8").to_inline_css
|
46
49
|
html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
|
50
|
+
html.gsub! /\<\/?w:[^>]+>/, "" # Strip everything in the word namespace
|
47
51
|
html.gsub! /\n|\r/," " # Remove linebreaks
|
48
52
|
html.gsub! /“|”/, '"' # Straighten curly double quotes
|
49
53
|
html.gsub! /‘|’/, "'" # Straighten curly single quotes
|
@@ -62,7 +66,7 @@ class WordToMarkdown
|
|
62
66
|
|
63
67
|
# Returns the html representation of the document
|
64
68
|
def html
|
65
|
-
doc.to_html
|
69
|
+
doc.to_html.gsub("</li>\n", "</li>")
|
66
70
|
end
|
67
71
|
|
68
72
|
# Determine the document encoding
|
@@ -88,11 +92,7 @@ class WordToMarkdown
|
|
88
92
|
string.sub!(/\A[[:space:]]+/,'') # leading whitespace
|
89
93
|
string.sub!(/[[:space:]]+\z/,'') # trailing whitespace
|
90
94
|
string.gsub!(/\n\n \n\n/,"\n\n") # Quadruple line breaks
|
91
|
-
string.gsub!(/^([0-9]+)\.[[:space:]]*/,"\\1. ") # Numbered lists
|
92
|
-
string.gsub!(/^-[[:space:]·]*/,"- ") # Unnumbered lists
|
93
95
|
string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
|
94
|
-
string.gsub!(/^ /, "") # Leading spaces
|
95
|
-
string.gsub!(/^- (\d+)\./, "\\1.") # OL's wrapped in UL's see http://bit.ly/1ivqxy8
|
96
96
|
string
|
97
97
|
end
|
98
98
|
|
@@ -141,10 +141,56 @@ class WordToMarkdown
|
|
141
141
|
font_sizes.percentile ((HEADING_DEPTH-1)-n) * HEADING_STEP
|
142
142
|
end
|
143
143
|
|
144
|
+
# CSS selector to select non-semantic lists
|
145
|
+
def li_selectors
|
146
|
+
".#{LI_SELECTORS.join(",.")}"
|
147
|
+
end
|
148
|
+
|
144
149
|
# Try to make semantic markup explicit where implied by the export
|
145
150
|
def semanticize!
|
146
|
-
|
147
|
-
|
151
|
+
|
152
|
+
# Semanticize lists
|
153
|
+
indent_level = 0
|
154
|
+
doc.css(li_selectors).each do |node|
|
155
|
+
|
156
|
+
# Determine if this is an implicit UL or an implicit OL list item
|
157
|
+
if node.classes.include?("MsoListParagraph") || node.content.match(/^[a-zA-Z0-9]+\./)
|
158
|
+
list_type = "ol"
|
159
|
+
else
|
160
|
+
list_type = "ul"
|
161
|
+
end
|
162
|
+
|
163
|
+
# Determine parent node for this li, creating it if necessary
|
164
|
+
if node.indent > indent_level
|
165
|
+
list = Nokogiri::XML::Node.new list_type, @doc
|
166
|
+
list.classes = ["list", "indent#{node.indent}"]
|
167
|
+
if node.indent == 1
|
168
|
+
list.parent = node.parent
|
169
|
+
else
|
170
|
+
list.parent = node.parent.css(".indent#{node.indent-1} li").last
|
171
|
+
end
|
172
|
+
else
|
173
|
+
list = node.parent.css(".indent#{node.indent}").last
|
174
|
+
end
|
175
|
+
|
176
|
+
# Note our current nesting depth
|
177
|
+
indent_level = node.indent
|
178
|
+
|
179
|
+
# Convert list paragraphs to actual numbered and unnumbered lists
|
180
|
+
node.node_name = "li"
|
181
|
+
node.parent = list
|
182
|
+
|
183
|
+
# Scrub unicode bullets
|
184
|
+
span = node.css("span:first")[1]
|
185
|
+
if span && span.styles["mso-list"] && span.styles["mso-list"] == "Ignore"
|
186
|
+
span.content = span.content[1..-1] unless span.content.match /^\d+\./
|
187
|
+
end
|
188
|
+
|
189
|
+
# Convert all pseudo-numbered list items into numbered list items, e.g., ii. => 2.
|
190
|
+
node.content = node.content.gsub /^[[:space:] ]+/, ""
|
191
|
+
node.content = node.content.gsub /^[a-zA-Z0-9]+\.[[:space:]]+/, ""
|
192
|
+
|
193
|
+
end
|
148
194
|
|
149
195
|
# Try to guess heading where implicit based on font size
|
150
196
|
implicit_headings.each do |element|
|
@@ -161,14 +207,34 @@ module Nokogiri
|
|
161
207
|
module XML
|
162
208
|
class Element
|
163
209
|
|
164
|
-
|
210
|
+
def indent
|
211
|
+
if styles['mso-list']
|
212
|
+
styles['mso-list'].split(" ")[1].sub("level","").to_i
|
213
|
+
else
|
214
|
+
(left_margin / 0.5).to_i
|
215
|
+
end
|
216
|
+
end
|
217
|
+
|
218
|
+
# The node's left-margin
|
219
|
+
# Used for parsing nested Lis
|
220
|
+
#
|
221
|
+
# Returns a float with the left margin
|
222
|
+
def left_margin
|
223
|
+
if styles['margin-left']
|
224
|
+
styles['margin-left'].to_f
|
225
|
+
elsif styles['margin']
|
226
|
+
styles['margin'].split(" ").last.to_f
|
227
|
+
else
|
228
|
+
0
|
229
|
+
end
|
230
|
+
end
|
165
231
|
|
166
|
-
#
|
232
|
+
# The node's font size
|
233
|
+
# Used for guessing heading sizes
|
234
|
+
#
|
235
|
+
# Returns a float with the font-size
|
167
236
|
def font_size
|
168
|
-
|
169
|
-
match = FONT_SIZE_REGEX.match attr("style")
|
170
|
-
match[1].to_i unless match.nil?
|
171
|
-
end
|
237
|
+
styles['font-size'].to_f if styles['font-size']
|
172
238
|
end
|
173
239
|
end
|
174
240
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: word-to-markdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-03-
|
11
|
+
date: 2014-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: reverse_markdown
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: nokogiri-styles
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: rake
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|