word-to-markdown 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/word-to-markdown.rb +33 -2
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2f7734f2a7a7b2ca0f3ac11ea9350cb5b34afdb3
|
4
|
+
data.tar.gz: f8b438412256bad44a7ca1ef5c094ef045a202be
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de20ce24839bc2e32f2405bc7b2acb37c1b0a78ca5a7bab06d16a3467639f6e00cccc8193a370b1fc695277857c9d69e8be87eb8ae82953cede2fc6b13732c91
|
7
|
+
data.tar.gz: 335420b487bd2bb17a11d8ec80321053e615502ab0dcb7217068ca5bd0a61a62a20a49ed437244bde7ae8494b099a6699b4c8cd823b8e6d14296985466ce7d78
|
data/lib/word-to-markdown.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'reverse_markdown'
|
2
2
|
require 'descriptive_statistics'
|
3
|
+
require 'premailer'
|
3
4
|
|
4
5
|
class WordToMarkdown
|
5
6
|
|
@@ -34,9 +35,14 @@ class WordToMarkdown
|
|
34
35
|
end
|
35
36
|
|
36
37
|
# Perform pre-processing normalization
|
38
|
+
#
|
39
|
+
# html - the raw html input from the export
|
40
|
+
#
|
41
|
+
# Returns the normalized html
|
37
42
|
def normalize(html)
|
38
43
|
encoding = encoding(html)
|
39
44
|
html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
|
45
|
+
html = Premailer.new(html, :with_html_string => true, :input_encoding => "UTF-8").to_inline_css
|
40
46
|
html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
|
41
47
|
html.gsub! /\n|\r/," " # Remove linebreaks
|
42
48
|
html.gsub! /“|”/, '"' # Straighten curly double quotes
|
@@ -44,18 +50,26 @@ class WordToMarkdown
|
|
44
50
|
html
|
45
51
|
end
|
46
52
|
|
53
|
+
# Pretty print the class in console
|
47
54
|
def inspect
|
48
55
|
"<WordToMarkdown path=\"#{@path}\">"
|
49
56
|
end
|
50
57
|
|
58
|
+
# Returns the markdown representation of the document
|
51
59
|
def to_s
|
52
60
|
@markdown ||= scrub_whitespace(ReverseMarkdown.parse(html))
|
53
61
|
end
|
54
62
|
|
63
|
+
# Returns the html representation of the document
|
55
64
|
def html
|
56
65
|
doc.to_html
|
57
66
|
end
|
58
67
|
|
68
|
+
# Determine the document encoding
|
69
|
+
#
|
70
|
+
# html - the raw html export
|
71
|
+
#
|
72
|
+
# Returns the encoding, defaulting to "UTF-8"
|
59
73
|
def encoding(html)
|
60
74
|
match = html.encode("UTF-8", :invalid => :replace, :replace => "").match(/charset=([^\"]+)/)
|
61
75
|
if match
|
@@ -65,6 +79,11 @@ class WordToMarkdown
|
|
65
79
|
end
|
66
80
|
end
|
67
81
|
|
82
|
+
# Perform post-processing normalization of certain Word quirks
|
83
|
+
#
|
84
|
+
# string - the markdown representation of the document
|
85
|
+
#
|
86
|
+
# Returns the normalized markdown
|
68
87
|
def scrub_whitespace(string)
|
69
88
|
string.sub!(/\A[[:space:]]+/,'') # leading whitespace
|
70
89
|
string.sub!(/[[:space:]]+\z/,'') # trailing whitespace
|
@@ -73,6 +92,7 @@ class WordToMarkdown
|
|
73
92
|
string.gsub!(/^-[[:space:]·]*/,"- ") # Unnumbered lists
|
74
93
|
string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
|
75
94
|
string.gsub!(/^ /, "") # Leading spaces
|
95
|
+
string.gsub!(/^- (\d+)\./, "\\1.") # OL's wrapped in UL's see http://bit.ly/1ivqxy8
|
76
96
|
string
|
77
97
|
end
|
78
98
|
|
@@ -91,12 +111,18 @@ class WordToMarkdown
|
|
91
111
|
def font_sizes
|
92
112
|
@font_sizes ||= begin
|
93
113
|
sizes = []
|
94
|
-
|
95
|
-
|
114
|
+
doc.css("[style]").each do |element|
|
115
|
+
sizes.push element.font_size.round(-1) unless element.font_size.nil?
|
116
|
+
end
|
117
|
+
sizes.uniq.sort
|
96
118
|
end
|
97
119
|
end
|
98
120
|
|
99
121
|
# Given a Nokogiri node, guess what heading it represents, if any
|
122
|
+
#
|
123
|
+
# node - the Nokogiri node
|
124
|
+
#
|
125
|
+
# returns the heading tag (e.g., H1), or nil
|
100
126
|
def guess_heading(node)
|
101
127
|
return nil if node.font_size == nil
|
102
128
|
[*1...HEADING_DEPTH].each do |heading|
|
@@ -107,6 +133,10 @@ class WordToMarkdown
|
|
107
133
|
|
108
134
|
# Minimum font size required for a given heading
|
109
135
|
# e.g., H(2) would represent the minimum font size of an implicit h2
|
136
|
+
#
|
137
|
+
# n - the heading number, e.g., 1, 2
|
138
|
+
#
|
139
|
+
# returns the minimum font size as an integer
|
110
140
|
def h(n)
|
111
141
|
font_sizes.percentile ((HEADING_DEPTH-1)-n) * HEADING_STEP
|
112
142
|
end
|
@@ -133,6 +163,7 @@ module Nokogiri
|
|
133
163
|
|
134
164
|
FONT_SIZE_REGEX = /\bfont-size:\s?([0-9\.]+)pt;?\b/
|
135
165
|
|
166
|
+
# Extend nokogiri nodes to guess their font size where defined
|
136
167
|
def font_size
|
137
168
|
@font_size ||= begin
|
138
169
|
match = FONT_SIZE_REGEX.match attr("style")
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: word-to-markdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-03-
|
11
|
+
date: 2014-03-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: reverse_markdown
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 1.1.3
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: premailer
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rake
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|