word-to-markdown 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/word-to-markdown.rb +10 -5
- metadata +15 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 438abfb05468da472c652d87e12b80d40f714572
|
4
|
+
data.tar.gz: b43be781c4a35c7d5968bb97b8620af3cfd10bb8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0fe70f87fdb8524e85316fccf6784f23e43e60f614e6037e7289583804b9a0f0fb4930eb79c60c23b67e11c3e2c57c87aa16a36a67d07e3c1aba375938b50614
|
7
|
+
data.tar.gz: e0ae583d9d9e343b9b722e236018a108e27ea454aec521607dd807a2dc63a4f96ca2a8392eec0024af1c1f723aa30c83621456c75726559df7180cacb5c3f310
|
data/lib/word-to-markdown.rb
CHANGED
@@ -6,7 +6,7 @@ class WordToMarkdown
|
|
6
6
|
HEADING_DEPTH = 6 # Number of headings to guess, e.g., h6
|
7
7
|
HEADING_STEP = 100/HEADING_DEPTH
|
8
8
|
MIN_HEADING_SIZE = 20
|
9
|
-
|
9
|
+
|
10
10
|
LI_SELECTORS = %w[
|
11
11
|
MsoListParagraphCxSpFirst
|
12
12
|
MsoListParagraphCxSpMiddle
|
@@ -38,7 +38,9 @@ class WordToMarkdown
|
|
38
38
|
encoding = encoding(html)
|
39
39
|
html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
|
40
40
|
html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
|
41
|
-
html.gsub! /\n|\r/," " #
|
41
|
+
html.gsub! /\n|\r/," " # Remove linebreaks
|
42
|
+
html.gsub! /“|”/, '"' # Straighten curly double quotes
|
43
|
+
html.gsub! /‘|’/, "'" # Straighten curly single quotes
|
42
44
|
html
|
43
45
|
end
|
44
46
|
|
@@ -51,7 +53,7 @@ class WordToMarkdown
|
|
51
53
|
end
|
52
54
|
|
53
55
|
def html
|
54
|
-
|
56
|
+
doc.to_html
|
55
57
|
end
|
56
58
|
|
57
59
|
def encoding(html)
|
@@ -78,7 +80,7 @@ class WordToMarkdown
|
|
78
80
|
def implicit_headings
|
79
81
|
@implicit_headings ||= begin
|
80
82
|
headings = []
|
81
|
-
|
83
|
+
doc.css("[style]").each do |element|
|
82
84
|
headings.push element unless element.font_size.nil? || element.font_size < MIN_HEADING_SIZE
|
83
85
|
end
|
84
86
|
headings
|
@@ -112,13 +114,16 @@ class WordToMarkdown
|
|
112
114
|
# Try to make semantic markup explicit where implied by the export
|
113
115
|
def semanticize!
|
114
116
|
# Convert unnumbered list paragraphs to actual unnumbered lists
|
115
|
-
|
117
|
+
doc.css(".#{LI_SELECTORS.join(",.")}").each { |node| node.node_name = "li" }
|
116
118
|
|
117
119
|
# Try to guess heading where implicit based on font size
|
118
120
|
implicit_headings.each do |element|
|
119
121
|
heading = guess_heading element
|
120
122
|
element.node_name = heading unless heading.nil?
|
121
123
|
end
|
124
|
+
|
125
|
+
# Removes paragraphs from tables
|
126
|
+
doc.search("td p").each { |node| node.node_name = "span" }
|
122
127
|
end
|
123
128
|
end
|
124
129
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: word-to-markdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
@@ -108,6 +108,20 @@ dependencies:
|
|
108
108
|
- - ">="
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: rerun
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
111
125
|
description: Ruby Gem to convert Word documents to markdown.
|
112
126
|
email: ben.balter@github.com
|
113
127
|
executables: []
|