word-to-markdown 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/word-to-markdown.rb +46 -9
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 64e2eeda272f608f1d1bfd5690abdeebdd1f31d8
|
4
|
+
data.tar.gz: ad2cd8d38fe5893463097ddb41fd973e001642df
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 37da10c45c34e29bd671c2d6833e81f754eaea14e07fbca693eb8b5d813db9a5a508d2bb483a0045f0d5df8de3d704db0ec8bfe25f2345c6358dd8d83eaea003
|
7
|
+
data.tar.gz: cff63c72bbc8fba7e994454875c01b56d9d239fa822762e30d317519d28e3ae7821f9414be923733f4a1cb48c888dcd7fd15c0c346c07d510687f11782507f55
|
data/lib/word-to-markdown.rb
CHANGED
@@ -5,27 +5,62 @@ class WordToMarkdown
|
|
5
5
|
|
6
6
|
HEADING_DEPTH = 6 # Number of headings to guess, e.g., h6
|
7
7
|
HEADING_STEP = 100/HEADING_DEPTH
|
8
|
+
MIN_HEADING_SIZE = 20
|
9
|
+
|
8
10
|
LI_SELECTORS = %w[
|
9
11
|
MsoListParagraphCxSpFirst
|
10
12
|
MsoListParagraphCxSpMiddle
|
11
13
|
MsoListParagraphCxSpLast
|
12
14
|
]
|
13
15
|
|
14
|
-
attr_reader :path, :doc
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
16
|
+
attr_reader :path, :doc
|
17
|
+
|
18
|
+
# Create a new WordToMarkdown object
|
19
|
+
#
|
20
|
+
# input - a HTML string or path to an HTML file
|
21
|
+
#
|
22
|
+
# Returns the WordToMarkdown object
|
23
|
+
def initialize(input)
|
24
|
+
path = File.expand_path input, Dir.pwd
|
25
|
+
if File.exist?(path)
|
26
|
+
html = File.open(path).read
|
27
|
+
@path = path
|
28
|
+
else
|
29
|
+
@path = String
|
30
|
+
html = input.to_s
|
31
|
+
end
|
32
|
+
@doc = Nokogiri::HTML normalize(html)
|
20
33
|
semanticize!
|
21
34
|
end
|
22
35
|
|
36
|
+
# Perform pre-processing normalization
|
37
|
+
def normalize(html)
|
38
|
+
encoding = encoding(html)
|
39
|
+
html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
|
40
|
+
html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
|
41
|
+
html.gsub! /\n|\r/," " # remove linebreaks
|
42
|
+
html
|
43
|
+
end
|
44
|
+
|
23
45
|
def inspect
|
24
46
|
"<WordToMarkdown path=\"#{@path}\">"
|
25
47
|
end
|
26
48
|
|
27
49
|
def to_s
|
28
|
-
@markdown ||= scrub_whitespace(ReverseMarkdown.parse(
|
50
|
+
@markdown ||= scrub_whitespace(ReverseMarkdown.parse(html))
|
51
|
+
end
|
52
|
+
|
53
|
+
def html
|
54
|
+
@doc.to_html
|
55
|
+
end
|
56
|
+
|
57
|
+
def encoding(html)
|
58
|
+
match = html.encode("UTF-8", :invalid => :replace, :replace => "").match(/charset=([^\"]+)/)
|
59
|
+
if match
|
60
|
+
match[1].sub("macintosh", "MacRoman")
|
61
|
+
else
|
62
|
+
"UTF-8"
|
63
|
+
end
|
29
64
|
end
|
30
65
|
|
31
66
|
def scrub_whitespace(string)
|
@@ -33,7 +68,9 @@ class WordToMarkdown
|
|
33
68
|
string.sub!(/[[:space:]]+\z/,'') # trailing whitespace
|
34
69
|
string.gsub!(/\n\n \n\n/,"\n\n") # Quadruple line breaks
|
35
70
|
string.gsub!(/^([0-9]+)\.[[:space:]]*/,"\\1. ") # Numbered lists
|
36
|
-
string.gsub!(/^-[[:space:]]*/,"- ")
|
71
|
+
string.gsub!(/^-[[:space:]·]*/,"- ") # Unnumbered lists
|
72
|
+
string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
|
73
|
+
string.gsub!(/^ /, "") # Leading spaces
|
37
74
|
string
|
38
75
|
end
|
39
76
|
|
@@ -42,7 +79,7 @@ class WordToMarkdown
|
|
42
79
|
@implicit_headings ||= begin
|
43
80
|
headings = []
|
44
81
|
@doc.css("[style]").each do |element|
|
45
|
-
headings.push element unless element.font_size.nil?
|
82
|
+
headings.push element unless element.font_size.nil? || element.font_size < MIN_HEADING_SIZE
|
46
83
|
end
|
47
84
|
headings
|
48
85
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: word-to-markdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-03-
|
11
|
+
date: 2014-03-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: reverse_markdown
|