word-to-markdown 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/word-to-markdown.rb +33 -2
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2f7734f2a7a7b2ca0f3ac11ea9350cb5b34afdb3
|
4
|
+
data.tar.gz: f8b438412256bad44a7ca1ef5c094ef045a202be
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de20ce24839bc2e32f2405bc7b2acb37c1b0a78ca5a7bab06d16a3467639f6e00cccc8193a370b1fc695277857c9d69e8be87eb8ae82953cede2fc6b13732c91
|
7
|
+
data.tar.gz: 335420b487bd2bb17a11d8ec80321053e615502ab0dcb7217068ca5bd0a61a62a20a49ed437244bde7ae8494b099a6699b4c8cd823b8e6d14296985466ce7d78
|
data/lib/word-to-markdown.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'reverse_markdown'
|
2
2
|
require 'descriptive_statistics'
|
3
|
+
require 'premailer'
|
3
4
|
|
4
5
|
class WordToMarkdown
|
5
6
|
|
@@ -34,9 +35,14 @@ class WordToMarkdown
|
|
34
35
|
end
|
35
36
|
|
36
37
|
# Perform pre-processing normalization
|
38
|
+
#
|
39
|
+
# html - the raw html input from the export
|
40
|
+
#
|
41
|
+
# Returns the normalized html
|
37
42
|
def normalize(html)
|
38
43
|
encoding = encoding(html)
|
39
44
|
html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
|
45
|
+
html = Premailer.new(html, :with_html_string => true, :input_encoding => "UTF-8").to_inline_css
|
40
46
|
html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
|
41
47
|
html.gsub! /\n|\r/," " # Remove linebreaks
|
42
48
|
html.gsub! /“|”/, '"' # Straighten curly double quotes
|
@@ -44,18 +50,26 @@ class WordToMarkdown
|
|
44
50
|
html
|
45
51
|
end
|
46
52
|
|
53
|
+
# Pretty print the class in console
|
47
54
|
def inspect
|
48
55
|
"<WordToMarkdown path=\"#{@path}\">"
|
49
56
|
end
|
50
57
|
|
58
|
+
# Returns the markdown representation of the document
|
51
59
|
def to_s
|
52
60
|
@markdown ||= scrub_whitespace(ReverseMarkdown.parse(html))
|
53
61
|
end
|
54
62
|
|
63
|
+
# Returns the html representation of the document
|
55
64
|
def html
|
56
65
|
doc.to_html
|
57
66
|
end
|
58
67
|
|
68
|
+
# Determine the document encoding
|
69
|
+
#
|
70
|
+
# html - the raw html export
|
71
|
+
#
|
72
|
+
# Returns the encoding, defaulting to "UTF-8"
|
59
73
|
def encoding(html)
|
60
74
|
match = html.encode("UTF-8", :invalid => :replace, :replace => "").match(/charset=([^\"]+)/)
|
61
75
|
if match
|
@@ -65,6 +79,11 @@ class WordToMarkdown
|
|
65
79
|
end
|
66
80
|
end
|
67
81
|
|
82
|
+
# Perform post-processing normalization of certain Word quirks
|
83
|
+
#
|
84
|
+
# string - the markdown representation of the document
|
85
|
+
#
|
86
|
+
# Returns the normalized markdown
|
68
87
|
def scrub_whitespace(string)
|
69
88
|
string.sub!(/\A[[:space:]]+/,'') # leading whitespace
|
70
89
|
string.sub!(/[[:space:]]+\z/,'') # trailing whitespace
|
@@ -73,6 +92,7 @@ class WordToMarkdown
|
|
73
92
|
string.gsub!(/^-[[:space:]·]*/,"- ") # Unnumbered lists
|
74
93
|
string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
|
75
94
|
string.gsub!(/^ /, "") # Leading spaces
|
95
|
+
string.gsub!(/^- (\d+)\./, "\\1.") # OL's wrapped in UL's see http://bit.ly/1ivqxy8
|
76
96
|
string
|
77
97
|
end
|
78
98
|
|
@@ -91,12 +111,18 @@ class WordToMarkdown
|
|
91
111
|
def font_sizes
|
92
112
|
@font_sizes ||= begin
|
93
113
|
sizes = []
|
94
|
-
|
95
|
-
|
114
|
+
doc.css("[style]").each do |element|
|
115
|
+
sizes.push element.font_size.round(-1) unless element.font_size.nil?
|
116
|
+
end
|
117
|
+
sizes.uniq.sort
|
96
118
|
end
|
97
119
|
end
|
98
120
|
|
99
121
|
# Given a Nokogiri node, guess what heading it represents, if any
|
122
|
+
#
|
123
|
+
# node - the Nokogiri node
|
124
|
+
#
|
125
|
+
# returns the heading tag (e.g., H1), or nil
|
100
126
|
def guess_heading(node)
|
101
127
|
return nil if node.font_size == nil
|
102
128
|
[*1...HEADING_DEPTH].each do |heading|
|
@@ -107,6 +133,10 @@ class WordToMarkdown
|
|
107
133
|
|
108
134
|
# Minimum font size required for a given heading
|
109
135
|
# e.g., H(2) would represent the minimum font size of an implicit h2
|
136
|
+
#
|
137
|
+
# n - the heading number, e.g., 1, 2
|
138
|
+
#
|
139
|
+
# returns the minimum font size as an integer
|
110
140
|
def h(n)
|
111
141
|
font_sizes.percentile ((HEADING_DEPTH-1)-n) * HEADING_STEP
|
112
142
|
end
|
@@ -133,6 +163,7 @@ module Nokogiri
|
|
133
163
|
|
134
164
|
FONT_SIZE_REGEX = /\bfont-size:\s?([0-9\.]+)pt;?\b/
|
135
165
|
|
166
|
+
# Extend nokogiri nodes to guess their font size where defined
|
136
167
|
def font_size
|
137
168
|
@font_size ||= begin
|
138
169
|
match = FONT_SIZE_REGEX.match attr("style")
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: word-to-markdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-03-
|
11
|
+
date: 2014-03-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: reverse_markdown
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 1.1.3
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: premailer
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rake
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|