word-to-markdown 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/w2m +8 -3
- data/lib/word-to-markdown/converter.rb +3 -4
- data/lib/word-to-markdown/document.rb +11 -9
- data/lib/word-to-markdown/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 08754da1501d6e4918d753a0727a6fc28fe17962
|
4
|
+
data.tar.gz: 4947e109b790a2c61575ce8349b0f11f51598320
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b70ac3cf257afc4eea9e591923bb43835d6f1a31b45bf0d6a39728acbbd55bc759e2d88f9f3d9456740d683552f4379aa046b321121cf5721f7114ab634d788c
|
7
|
+
data.tar.gz: f1e16e97e23ff20229f187474872496e1201409a5f606d25f5f62dba039ea2e8c66c20ead82278037861fe8d9a9256113f4064c894cf37015261d9b8635a83ca
|
data/bin/w2m
CHANGED
@@ -2,10 +2,15 @@
|
|
2
2
|
|
3
3
|
require 'word-to-markdown'
|
4
4
|
|
5
|
-
if ARGV.size != 1
|
5
|
+
if ARGV.size != 1 || ARGV[0] == "--help"
|
6
6
|
puts "Usage: bundle exec w2m path/to/document.docx"
|
7
7
|
exit 1
|
8
8
|
end
|
9
9
|
|
10
|
-
|
11
|
-
puts
|
10
|
+
if ARGV[0] == "--version"
|
11
|
+
puts "WordToMarkdown v#{WordToMarkdown::VERSION}"
|
12
|
+
puts "LibreOffice #{WordToMarkdown.soffice_version}"
|
13
|
+
else
|
14
|
+
doc = WordToMarkdown.new ARGV[0]
|
15
|
+
puts doc.to_s
|
16
|
+
end
|
data/lib/word-to-markdown/converter.rb
CHANGED
@@ -14,7 +14,6 @@ class WordToMarkdown
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def convert!
|
17
|
-
|
18
17
|
# Fonts and headings
|
19
18
|
semanticize_font_styles!
|
20
19
|
semanticize_headings!
|
@@ -95,18 +94,18 @@ class WordToMarkdown
|
|
95
94
|
|
96
95
|
def remove_unicode_bullets_from_list_items!
|
97
96
|
@document.tree.search("li span").each do |span|
|
98
|
-
span.
|
97
|
+
span.inner_html = span.inner_html.gsub /^([#{UNICODE_BULLETS.join("")}]+)/, ""
|
99
98
|
end
|
100
99
|
end
|
101
100
|
|
102
101
|
def remove_numbering_from_list_items!
|
103
102
|
@document.tree.search("li span").each do |span|
|
104
|
-
span.
|
103
|
+
span.inner_html = span.inner_html.gsub /^[a-zA-Z0-9]+\./m, ""
|
105
104
|
end
|
106
105
|
end
|
107
106
|
|
108
107
|
def remove_whitespace_from_list_items!
|
109
|
-
@document.tree.search("li span").each { |span| span.
|
108
|
+
@document.tree.search("li span").each { |span| span.inner_html.strip! }
|
110
109
|
end
|
111
110
|
|
112
111
|
def semanticize_table_headers!
|
data/lib/word-to-markdown/document.rb
CHANGED
@@ -17,7 +17,7 @@ class WordToMarkdown
|
|
17
17
|
|
18
18
|
def tree
|
19
19
|
@tree ||= begin
|
20
|
-
tree = Nokogiri::HTML(
|
20
|
+
tree = Nokogiri::HTML(normalized_html)
|
21
21
|
tree.css("title").remove
|
22
22
|
tree
|
23
23
|
end
|
@@ -38,8 +38,8 @@ class WordToMarkdown
|
|
38
38
|
# html - the raw html export
|
39
39
|
#
|
40
40
|
# Returns the encoding, defaulting to "UTF-8"
|
41
|
-
def encoding
|
42
|
-
match =
|
41
|
+
def encoding
|
42
|
+
match = raw_html.encode("UTF-8", :invalid => :replace, :replace => "").match(/charset=([^\"]+)/)
|
43
43
|
if match
|
44
44
|
match[1].sub("macintosh", "MacRoman")
|
45
45
|
else
|
@@ -54,9 +54,9 @@ class WordToMarkdown
|
|
54
54
|
# html - the raw html input from the export
|
55
55
|
#
|
56
56
|
# Returns the normalized html
|
57
|
-
def
|
58
|
-
|
59
|
-
html = html.
|
57
|
+
def normalized_html
|
58
|
+
html = raw_html.force_encoding(encoding)
|
59
|
+
html = html.encode("UTF-8", :invalid => :replace, :replace => "")
|
60
60
|
html = Premailer.new(html, :with_html_string => true, :input_encoding => "UTF-8").to_inline_css
|
61
61
|
html.gsub! /\n|\r/," " # Remove linebreaks
|
62
62
|
html.gsub! /“|”/, '"' # Straighten curly double quotes
|
@@ -71,9 +71,11 @@ class WordToMarkdown
|
|
71
71
|
#
|
72
72
|
# Returns the normalized markdown
|
73
73
|
def scrub_whitespace(string)
|
74
|
-
string.
|
75
|
-
string.sub!(
|
76
|
-
string.
|
74
|
+
string.gsub!("&nbsp;", " ") # HTML encoded spaces
|
75
|
+
string.sub!(/\A[[:space:]]+/,'') # document leading whitespace
|
76
|
+
string.sub!(/[[:space:]]+\z/,'') # document trailing whitespace
|
77
|
+
string.gsub!(/([ ]+)$/, '') # line trailing whitespace
|
78
|
+
string.gsub!(/\n\n\n\n/,"\n\n") # Quadruple line breaks
|
77
79
|
string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
|
78
80
|
string
|
79
81
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: word-to-markdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-03-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: reverse_markdown
|