word-to-markdown 0.2.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/w2m +11 -0
- data/lib/nokogiri/xml/element.rb +22 -0
- data/lib/word-to-markdown.rb +41 -224
- data/lib/word-to-markdown/converter.rb +124 -0
- data/lib/word-to-markdown/document.rb +97 -0
- data/lib/word-to-markdown/version.rb +3 -0
- metadata +60 -40
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 65c40ca66cec250fa58be0b50f9691c49e80ba6c
|
4
|
+
data.tar.gz: 0bb6276f14bfbcbb6ff11b47f2b0fd0fbff94c10
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f20b0252fab914e0a412fdab1b76005d5ce2daa10392bcce0daafd6fae9a7744f66cd26259fb65f1272a7ba3845b460ca0c772bb924a72a09303240bf4586dce
|
7
|
+
data.tar.gz: 84c8ab75b71bc19933cbe6860e8511c31d5dbb370898229b9d67e933aa8f754cfe8dd85a1f3220a4454199011691d2185422076a65bda7d86257a9abd6a7cf8e
|
data/lib/nokogiri/xml/element.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module Nokogiri
|
2
|
+
module XML
|
3
|
+
class Element
|
4
|
+
|
5
|
+
# The node's font size
|
6
|
+
# Used for guessing heading sizes
|
7
|
+
#
|
8
|
+
# Returns a float with the font-size
|
9
|
+
def font_size
|
10
|
+
styles['font-size'].to_f if styles['font-size']
|
11
|
+
end
|
12
|
+
|
13
|
+
def bold?
|
14
|
+
styles['font-weight'] && styles['font-weight'] == "bold"
|
15
|
+
end
|
16
|
+
|
17
|
+
def italic?
|
18
|
+
styles['font-style'] && styles['font-style'] == "italic"
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/word-to-markdown.rb
CHANGED
@@ -3,253 +3,70 @@ require 'descriptive_statistics'
|
|
3
3
|
require 'premailer'
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'nokogiri-styles'
|
6
|
+
require 'tmpdir'
|
7
|
+
require_relative 'word-to-markdown/version'
|
8
|
+
require_relative 'word-to-markdown/document'
|
9
|
+
require_relative 'word-to-markdown/converter'
|
10
|
+
require_relative 'nokogiri/xml/element'
|
6
11
|
|
7
12
|
class WordToMarkdown
|
8
13
|
|
9
|
-
|
10
|
-
HEADING_STEP = 100/HEADING_DEPTH
|
11
|
-
MIN_HEADING_SIZE = 20
|
14
|
+
attr_reader :document, :converter
|
12
15
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
.MsoListParagraph
|
18
|
-
li
|
19
|
-
]
|
20
|
-
|
21
|
-
attr_reader :path, :doc
|
16
|
+
REVERSE_MARKDOWN_OPTIONS = {
|
17
|
+
unknown_tags: :bypass,
|
18
|
+
github_flavored: true
|
19
|
+
}
|
22
20
|
|
23
21
|
# Create a new WordToMarkdown object
|
24
22
|
#
|
25
23
|
# input - a HTML string or path to an HTML file
|
26
24
|
#
|
27
25
|
# Returns the WordToMarkdown object
|
28
|
-
def initialize(
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
26
|
+
def initialize(path)
|
27
|
+
@document = WordToMarkdown::Document.new path
|
28
|
+
@converter = WordToMarkdown::Converter.new @document
|
29
|
+
converter.convert!
|
30
|
+
end
|
31
|
+
|
32
|
+
# source: https://github.com/ricn/libreconv/blob/master/lib/libreconv.rb#L48
|
33
|
+
def self.which(cmd)
|
34
|
+
exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
|
35
|
+
ENV['PATH'].split(File::PATH_SEPARATOR).each do |path|
|
36
|
+
exts.each do |ext|
|
37
|
+
exe = File.join(path, "#{cmd}#{ext}")
|
38
|
+
return exe if File.executable? exe
|
39
|
+
end
|
36
40
|
end
|
37
|
-
@doc = Nokogiri::HTML normalize(html)
|
38
|
-
semanticize!
|
39
|
-
end
|
40
|
-
|
41
|
-
# Perform pre-processing normalization
|
42
|
-
#
|
43
|
-
# html - the raw html input from the export
|
44
|
-
#
|
45
|
-
# Returns the normalized html
|
46
|
-
def normalize(html)
|
47
|
-
encoding = encoding(html)
|
48
|
-
html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
|
49
|
-
html = Premailer.new(html, :with_html_string => true, :input_encoding => "UTF-8").to_inline_css
|
50
|
-
html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
|
51
|
-
html.gsub! /\<\/?w:[^>]+>/, "" # Strip everything in the word namespace
|
52
|
-
html.gsub! /\n|\r/," " # Remove linebreaks
|
53
|
-
html.gsub! /“|”/, '"' # Straighten curly double quotes
|
54
|
-
html.gsub! /‘|’/, "'" # Straighten curly single quotes
|
55
|
-
html
|
56
|
-
end
|
57
|
-
|
58
|
-
# Pretty print the class in console
|
59
|
-
def inspect
|
60
|
-
"<WordToMarkdown path=\"#{@path}\">"
|
61
|
-
end
|
62
41
|
|
63
|
-
|
64
|
-
def to_s
|
65
|
-
@markdown ||= scrub_whitespace(ReverseMarkdown.parse(html))
|
42
|
+
return nil
|
66
43
|
end
|
67
44
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
end
|
72
|
-
|
73
|
-
# Determine the document encoding
|
74
|
-
#
|
75
|
-
# html - the raw html export
|
76
|
-
#
|
77
|
-
# Returns the encoding, defaulting to "UTF-8"
|
78
|
-
def encoding(html)
|
79
|
-
match = html.encode("UTF-8", :invalid => :replace, :replace => "").match(/charset=([^\"]+)/)
|
80
|
-
if match
|
81
|
-
match[1].sub("macintosh", "MacRoman")
|
45
|
+
def self.soffice_path
|
46
|
+
if RUBY_PLATFORM.include?("darwin")
|
47
|
+
"/Applications/LibreOffice.app/Contents/MacOS/soffice"
|
82
48
|
else
|
83
|
-
"
|
49
|
+
soffice_path ||= which("soffice")
|
50
|
+
soffice_path ||= which("soffice.bin")
|
51
|
+
soffice_path ||= "soffice"
|
84
52
|
end
|
85
53
|
end
|
86
54
|
|
87
|
-
#
|
88
|
-
|
89
|
-
|
90
|
-
#
|
91
|
-
# Returns the normalized markdown
|
92
|
-
def scrub_whitespace(string)
|
93
|
-
string.sub!(/\A[[:space:]]+/,'') # leading whitespace
|
94
|
-
string.sub!(/[[:space:]]+\z/,'') # trailing whitespace
|
95
|
-
string.gsub!(/\n\n \n\n/,"\n\n") # Quadruple line breaks
|
96
|
-
string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
|
97
|
-
string
|
55
|
+
# Ideally this would be done via open3, but Travis CI can't seem to find soffice when we do
|
56
|
+
def self.run_command(*args)
|
57
|
+
`#{soffice_path} #{args.join(' ')}`
|
98
58
|
end
|
99
59
|
|
100
|
-
|
101
|
-
|
102
|
-
@implicit_headings ||= begin
|
103
|
-
headings = []
|
104
|
-
doc.css("[style]").each do |element|
|
105
|
-
headings.push element unless element.font_size.nil? || element.font_size < MIN_HEADING_SIZE
|
106
|
-
end
|
107
|
-
headings
|
108
|
-
end
|
60
|
+
def self.soffice_version
|
61
|
+
run_command('--version').strip.sub "LibreOffice ", ""
|
109
62
|
end
|
110
63
|
|
111
|
-
#
|
112
|
-
def
|
113
|
-
@
|
114
|
-
sizes = []
|
115
|
-
doc.css("[style]").each do |element|
|
116
|
-
sizes.push element.font_size.round(-1) unless element.font_size.nil?
|
117
|
-
end
|
118
|
-
sizes.uniq.sort
|
119
|
-
end
|
120
|
-
end
|
121
|
-
|
122
|
-
# Given a Nokogiri node, guess what heading it represents, if any
|
123
|
-
#
|
124
|
-
# node - the nokigiri node
|
125
|
-
#
|
126
|
-
# retuns the heading tag (e.g., H1), or nil
|
127
|
-
def guess_heading(node)
|
128
|
-
return nil if node.font_size == nil
|
129
|
-
[*1...HEADING_DEPTH].each do |heading|
|
130
|
-
return "h#{heading}" if node.font_size >= h(heading)
|
131
|
-
end
|
132
|
-
nil
|
133
|
-
end
|
134
|
-
|
135
|
-
# Minimum font size required for a given heading
|
136
|
-
# e.g., H(2) would represent the minimum font size of an implicit h2
|
137
|
-
#
|
138
|
-
# n - the heading number, e.g., 1, 2
|
139
|
-
#
|
140
|
-
# returns the minimum font size as an integer
|
141
|
-
def h(n)
|
142
|
-
font_sizes.percentile ((HEADING_DEPTH-1)-n) * HEADING_STEP
|
143
|
-
end
|
144
|
-
|
145
|
-
# CSS selector to select non-symantic lists
|
146
|
-
def li_selectors
|
147
|
-
LI_SELECTORS.join(",")
|
148
|
-
end
|
149
|
-
|
150
|
-
# Returns an array of all indented values
|
151
|
-
def indents
|
152
|
-
@indents ||= doc.css(li_selectors).map{ |el| el.indent }.uniq.sort
|
153
|
-
end
|
154
|
-
|
155
|
-
# Determine the indent level given an indent value
|
156
|
-
#
|
157
|
-
# level - the true indent, e.g., 2.5 (from 2.5em)
|
158
|
-
#
|
159
|
-
# Returns an integer representing the indent level
|
160
|
-
def indent(level)
|
161
|
-
indents.find_index level
|
162
|
-
end
|
163
|
-
|
164
|
-
# Try to make semantic markup explicit where implied by the export
|
165
|
-
def semanticize!
|
166
|
-
|
167
|
-
# Semanticize lists
|
168
|
-
indent_level = 0
|
169
|
-
doc.css(li_selectors).each do |node|
|
170
|
-
|
171
|
-
# Determine if this is an implicit UL or an implicit OL list item
|
172
|
-
if node.classes.include?("MsoListParagraph") || node.content.match(/^[a-zA-Z0-9]+\./)
|
173
|
-
list_type = "ol"
|
174
|
-
else
|
175
|
-
list_type = "ul"
|
176
|
-
end
|
177
|
-
|
178
|
-
# calculate indent level
|
179
|
-
current_indent = indent(node.indent)
|
180
|
-
|
181
|
-
# Determine parent node for this li, creating it if necessary
|
182
|
-
if current_indent > indent_level || indent_level == 0 && node.parent.css(".indent#{current_indent}").empty?
|
183
|
-
list = Nokogiri::XML::Node.new list_type, @doc
|
184
|
-
list.classes = ["list", "indent#{current_indent}"]
|
185
|
-
list.parent = node.parent.css(".indent#{current_indent-1} li").last || node.parent
|
186
|
-
else
|
187
|
-
list = node.parent.css(".indent#{current_indent}").last
|
188
|
-
end
|
189
|
-
|
190
|
-
# Note our current nesting depth
|
191
|
-
indent_level = current_indent
|
192
|
-
|
193
|
-
# Convert list paragraphs to actual numbered and unnumbered lists
|
194
|
-
node.node_name = "li"
|
195
|
-
node.parent = list if list
|
196
|
-
|
197
|
-
# Scrub unicode bullets
|
198
|
-
span = node.css("span:first")[1]
|
199
|
-
if span && span.styles["mso-list"] && span.styles["mso-list"] == "Ignore"
|
200
|
-
span.content = span.content[1..-1] unless span.content.match /^\d+\./
|
201
|
-
end
|
202
|
-
|
203
|
-
# Convert all pseudo-numbered list items into numbered list items, e.g., ii. => 2.
|
204
|
-
node.content = node.content.gsub /^[[:space:] ]+/, ""
|
205
|
-
node.content = node.content.gsub /^[a-zA-Z0-9]+\.[[:space:]]+/, ""
|
206
|
-
|
207
|
-
end
|
208
|
-
|
209
|
-
# Try to guess heading where implicit bassed on font size
|
210
|
-
implicit_headings.each do |element|
|
211
|
-
heading = guess_heading element
|
212
|
-
element.node_name = heading unless heading.nil?
|
213
|
-
end
|
214
|
-
|
215
|
-
# Removes paragraphs from tables
|
216
|
-
doc.search("td p").each { |node| node.node_name = "span" }
|
64
|
+
# Pretty print the class in console
|
65
|
+
def inspect
|
66
|
+
"<WordToMarkdown path=\"#{@document.path}\">"
|
217
67
|
end
|
218
|
-
end
|
219
68
|
|
220
|
-
|
221
|
-
|
222
|
-
class Element
|
223
|
-
|
224
|
-
def indent
|
225
|
-
if styles['mso-list']
|
226
|
-
styles['mso-list'].split(" ")[1].sub("level","").to_i
|
227
|
-
else
|
228
|
-
(left_margin / 0.5).to_i
|
229
|
-
end
|
230
|
-
end
|
231
|
-
|
232
|
-
# The node's left-margin
|
233
|
-
# Used for parsing nested Lis
|
234
|
-
#
|
235
|
-
# Returns a float with the left margin
|
236
|
-
def left_margin
|
237
|
-
if styles['margin-left']
|
238
|
-
styles['margin-left'].to_f
|
239
|
-
elsif styles['margin']
|
240
|
-
styles['margin'].split(" ").last.to_f
|
241
|
-
else
|
242
|
-
0
|
243
|
-
end
|
244
|
-
end
|
245
|
-
|
246
|
-
# The node's font size
|
247
|
-
# Used for guessing heading sizes
|
248
|
-
#
|
249
|
-
# Returns a float with the font-size
|
250
|
-
def font_size
|
251
|
-
styles['font-size'].to_f if styles['font-size']
|
252
|
-
end
|
253
|
-
end
|
69
|
+
def to_s
|
70
|
+
document.to_s
|
254
71
|
end
|
255
72
|
end
|
data/lib/word-to-markdown/converter.rb
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
class WordToMarkdown
|
2
|
+
class Converter
|
3
|
+
|
4
|
+
attr_reader :document
|
5
|
+
|
6
|
+
HEADING_DEPTH = 6 # Number of headings to guess, e.g., h6
|
7
|
+
HEADING_STEP = 100/HEADING_DEPTH
|
8
|
+
MIN_HEADING_SIZE = 20
|
9
|
+
UNICODE_BULLETS = ["○", "o", "●", "\uF0B7", "\u2022", "\uF0A7"]
|
10
|
+
|
11
|
+
def initialize(document)
|
12
|
+
@document = document
|
13
|
+
end
|
14
|
+
|
15
|
+
def convert!
|
16
|
+
|
17
|
+
# Fonts and headings
|
18
|
+
semanticize_font_styles!
|
19
|
+
semanticize_headings!
|
20
|
+
|
21
|
+
# Tables
|
22
|
+
remove_paragraphs_from_tables!
|
23
|
+
semanticize_table_headers!
|
24
|
+
|
25
|
+
# list items
|
26
|
+
remove_paragraphs_from_list_items!
|
27
|
+
remove_unicode_bullets_from_list_items!
|
28
|
+
remove_whitespace_from_list_items!
|
29
|
+
remove_numbering_from_list_items!
|
30
|
+
end
|
31
|
+
|
32
|
+
# Returns an array of Nokogiri nodes that are implicit headings
|
33
|
+
def implicit_headings
|
34
|
+
@implicit_headings ||= begin
|
35
|
+
headings = []
|
36
|
+
@document.tree.css("[style]").each do |element|
|
37
|
+
headings.push element unless element.font_size.nil? || element.font_size < MIN_HEADING_SIZE
|
38
|
+
end
|
39
|
+
headings
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
# Returns a sorted array of the unique font sizes (rounded to the nearest 10) of all styled elements in the document
|
44
|
+
def font_sizes
|
45
|
+
@font_sizes ||= begin
|
46
|
+
sizes = []
|
47
|
+
@document.tree.css("[style]").each do |element|
|
48
|
+
sizes.push element.font_size.round(-1) unless element.font_size.nil?
|
49
|
+
end
|
50
|
+
sizes.uniq.sort
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
# Given a Nokogiri node, guess what heading it represents, if any
|
55
|
+
#
|
56
|
+
# node - the Nokogiri node
|
57
|
+
#
|
58
|
+
# Returns the heading tag (e.g., h1), or nil
|
59
|
+
def guess_heading(node)
|
60
|
+
return nil if node.font_size == nil
|
61
|
+
[*1...HEADING_DEPTH].each do |heading|
|
62
|
+
return "h#{heading}" if node.font_size >= h(heading)
|
63
|
+
end
|
64
|
+
nil
|
65
|
+
end
|
66
|
+
|
67
|
+
# Minimum font size required for a given heading
|
68
|
+
# e.g., H(2) would represent the minimum font size of an implicit h2
|
69
|
+
#
|
70
|
+
# n - the heading number, e.g., 1, 2
|
71
|
+
#
|
72
|
+
# returns the minimum font size as an integer
|
73
|
+
def h(n)
|
74
|
+
font_sizes.percentile ((HEADING_DEPTH-1)-n) * HEADING_STEP
|
75
|
+
end
|
76
|
+
|
77
|
+
def semanticize_font_styles!
|
78
|
+
@document.tree.css("span").each do |node|
|
79
|
+
if node.bold?
|
80
|
+
node.node_name = "strong"
|
81
|
+
elsif node.italic?
|
82
|
+
node.node_name = "em"
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
def remove_paragraphs_from_tables!
|
88
|
+
@document.tree.search("td p").each { |node| node.node_name = "span" }
|
89
|
+
end
|
90
|
+
|
91
|
+
def remove_paragraphs_from_list_items!
|
92
|
+
@document.tree.search("li p").each { |node| node.node_name = "span" }
|
93
|
+
end
|
94
|
+
|
95
|
+
def remove_unicode_bullets_from_list_items!
|
96
|
+
@document.tree.search("li span").each do |span|
|
97
|
+
span.content = span.content.gsub /^([#{UNICODE_BULLETS.join("")}]+)/, ""
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
def remove_numbering_from_list_items!
|
102
|
+
@document.tree.search("li span").each do |span|
|
103
|
+
span.content = span.content.gsub /^[a-zA-Z0-9]+\./m, ""
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
def remove_whitespace_from_list_items!
|
108
|
+
@document.tree.search("li span").each { |span| span.content.strip! }
|
109
|
+
end
|
110
|
+
|
111
|
+
def semanticize_table_headers!
|
112
|
+
@document.tree.search("table tr:first td").each { |node| node.node_name = "th" }
|
113
|
+
end
|
114
|
+
|
115
|
+
# Try to guess heading where implicit based on font size
|
116
|
+
def semanticize_headings!
|
117
|
+
implicit_headings.each do |element|
|
118
|
+
heading = guess_heading element
|
119
|
+
element.node_name = heading unless heading.nil?
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
end
|
124
|
+
end
|
data/lib/word-to-markdown/document.rb
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
class WordToMarkdown
|
2
|
+
class Document
|
3
|
+
class NotFoundError < StandardError; end
|
4
|
+
|
5
|
+
attr_reader :path, :raw_html
|
6
|
+
|
7
|
+
def initialize(path)
|
8
|
+
@path = File.expand_path path, Dir.pwd
|
9
|
+
raise NotFoundError, "File #{@path} does not exist" unless File.exist?(@path)
|
10
|
+
end
|
11
|
+
|
12
|
+
def extension
|
13
|
+
File.extname path
|
14
|
+
end
|
15
|
+
|
16
|
+
def tree
|
17
|
+
@tree ||= begin
|
18
|
+
tree = Nokogiri::HTML(normalize(raw_html))
|
19
|
+
tree.css("title").remove
|
20
|
+
tree
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# Returns the html representation of the document
|
25
|
+
def html
|
26
|
+
tree.to_html.gsub("</li>\n", "</li>")
|
27
|
+
end
|
28
|
+
|
29
|
+
# Returns the markdown representation of the document
|
30
|
+
def to_s
|
31
|
+
@markdown ||= scrub_whitespace(ReverseMarkdown.convert(html, WordToMarkdown::REVERSE_MARKDOWN_OPTIONS))
|
32
|
+
end
|
33
|
+
|
34
|
+
# Determine the document encoding
|
35
|
+
#
|
36
|
+
# html - the raw html export
|
37
|
+
#
|
38
|
+
# Returns the encoding, defaulting to "UTF-8"
|
39
|
+
def encoding(html)
|
40
|
+
match = html.encode("UTF-8", :invalid => :replace, :replace => "").match(/charset=([^\"]+)/)
|
41
|
+
if match
|
42
|
+
match[1].sub("macintosh", "MacRoman")
|
43
|
+
else
|
44
|
+
"UTF-8"
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
private
|
49
|
+
|
50
|
+
# Perform pre-processing normalization
|
51
|
+
#
|
52
|
+
# html - the raw html input from the export
|
53
|
+
#
|
54
|
+
# Returns the normalized html
|
55
|
+
def normalize(html)
|
56
|
+
encoding = encoding(html)
|
57
|
+
html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
|
58
|
+
html = Premailer.new(html, :with_html_string => true, :input_encoding => "UTF-8").to_inline_css
|
59
|
+
html.gsub! /\n|\r/," " # Remove linebreaks
|
60
|
+
html.gsub! /“|”/, '"' # Straighten curly double quotes
|
61
|
+
html.gsub! /‘|’/, "'" # Straighten curly single quotes
|
62
|
+
html.gsub! />\s+</, "><" # Remove extra whitespace between tags
|
63
|
+
html
|
64
|
+
end
|
65
|
+
|
66
|
+
# Perform post-processing normalization of certain Word quirks
|
67
|
+
#
|
68
|
+
# string - the markdown representation of the document
|
69
|
+
#
|
70
|
+
# Returns the normalized markdown
|
71
|
+
def scrub_whitespace(string)
|
72
|
+
string.sub!(/\A[[:space:]]+/,'') # leading whitespace
|
73
|
+
string.sub!(/[[:space:]]+\z/,'') # trailing whitespace
|
74
|
+
string.gsub!(/\n\n \n\n/,"\n\n") # Quadruple line breaks
|
75
|
+
string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
|
76
|
+
string
|
77
|
+
end
|
78
|
+
|
79
|
+
def tmpdir
|
80
|
+
@tmpdir ||= Dir.mktmpdir
|
81
|
+
end
|
82
|
+
|
83
|
+
def dest_path
|
84
|
+
dest_filename = File.basename(path).gsub(/#{Regexp.escape(extension)}$/, ".html")
|
85
|
+
File.expand_path(dest_filename, tmpdir)
|
86
|
+
end
|
87
|
+
|
88
|
+
def raw_html
|
89
|
+
@raw_html ||= begin
|
90
|
+
WordToMarkdown::run_command '--headless', '--convert-to', 'html', path, '--outdir', tmpdir
|
91
|
+
html = File.read dest_path
|
92
|
+
File.delete dest_path
|
93
|
+
html
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: word-to-markdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-05-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: reverse_markdown
|
@@ -16,147 +16,167 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.
|
19
|
+
version: '0.5'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.
|
26
|
+
version: '0.5'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: descriptive_statistics
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.1
|
33
|
+
version: '1.1'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.1
|
40
|
+
version: '1.1'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: premailer
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '1.8'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '1.8'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: nokogiri-styles
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
61
|
+
version: '0.1'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
68
|
+
version: '0.1'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rake
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - "
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
75
|
+
version: '10.3'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - "
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
82
|
+
version: '10.3'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: shoulda
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - "
|
87
|
+
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
89
|
+
version: '3.5'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- - "
|
94
|
+
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
96
|
+
version: '3.5'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: rdoc
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- - "
|
101
|
+
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
103
|
+
version: '4.1'
|
104
104
|
type: :development
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
|
-
- - "
|
108
|
+
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
110
|
+
version: '4.1'
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
112
|
name: bundler
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
114
114
|
requirements:
|
115
|
-
- - "
|
115
|
+
- - "~>"
|
116
116
|
- !ruby/object:Gem::Version
|
117
|
-
version: '
|
117
|
+
version: '1.6'
|
118
118
|
type: :development
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
|
-
- - "
|
122
|
+
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
|
-
version: '
|
124
|
+
version: '1.6'
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
126
|
name: pry
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
128
128
|
requirements:
|
129
|
-
- - "
|
129
|
+
- - "~>"
|
130
130
|
- !ruby/object:Gem::Version
|
131
|
-
version: '0'
|
131
|
+
version: '0.9'
|
132
132
|
type: :development
|
133
133
|
prerelease: false
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
|
-
- - "
|
136
|
+
- - "~>"
|
137
137
|
- !ruby/object:Gem::Version
|
138
|
-
version: '0'
|
138
|
+
version: '0.9'
|
139
139
|
- !ruby/object:Gem::Dependency
|
140
|
-
name:
|
140
|
+
name: mocha
|
141
141
|
requirement: !ruby/object:Gem::Requirement
|
142
142
|
requirements:
|
143
|
-
- - "
|
143
|
+
- - "~>"
|
144
144
|
- !ruby/object:Gem::Version
|
145
|
-
version: '0'
|
145
|
+
version: '1.0'
|
146
146
|
type: :development
|
147
147
|
prerelease: false
|
148
148
|
version_requirements: !ruby/object:Gem::Requirement
|
149
149
|
requirements:
|
150
|
-
- - "
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '1.0'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: minitest
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '4.7'
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - "~>"
|
151
165
|
- !ruby/object:Gem::Version
|
152
|
-
version: '
|
166
|
+
version: '4.7'
|
153
167
|
description: Ruby Gem to convert Word documents to markdown.
|
154
168
|
email: ben.balter@github.com
|
155
|
-
executables:
|
169
|
+
executables:
|
170
|
+
- w2m
|
156
171
|
extensions: []
|
157
172
|
extra_rdoc_files: []
|
158
173
|
files:
|
174
|
+
- bin/w2m
|
175
|
+
- lib/nokogiri/xml/element.rb
|
159
176
|
- lib/word-to-markdown.rb
|
177
|
+
- lib/word-to-markdown/converter.rb
|
178
|
+
- lib/word-to-markdown/document.rb
|
179
|
+
- lib/word-to-markdown/version.rb
|
160
180
|
homepage: https://github.com/benbalter/word-to-markdown
|
161
181
|
licenses:
|
162
182
|
- MIT
|