word-to-markdown 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/word-to-markdown.rb +10 -5
- metadata +15 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 438abfb05468da472c652d87e12b80d40f714572
|
4
|
+
data.tar.gz: b43be781c4a35c7d5968bb97b8620af3cfd10bb8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0fe70f87fdb8524e85316fccf6784f23e43e60f614e6037e7289583804b9a0f0fb4930eb79c60c23b67e11c3e2c57c87aa16a36a67d07e3c1aba375938b50614
|
7
|
+
data.tar.gz: e0ae583d9d9e343b9b722e236018a108e27ea454aec521607dd807a2dc63a4f96ca2a8392eec0024af1c1f723aa30c83621456c75726559df7180cacb5c3f310
|
data/lib/word-to-markdown.rb
CHANGED
@@ -6,7 +6,7 @@ class WordToMarkdown
|
|
6
6
|
HEADING_DEPTH = 6 # Number of headings to guess, e.g., h6
|
7
7
|
HEADING_STEP = 100/HEADING_DEPTH
|
8
8
|
MIN_HEADING_SIZE = 20
|
9
|
-
|
9
|
+
|
10
10
|
LI_SELECTORS = %w[
|
11
11
|
MsoListParagraphCxSpFirst
|
12
12
|
MsoListParagraphCxSpMiddle
|
@@ -38,7 +38,9 @@ class WordToMarkdown
|
|
38
38
|
encoding = encoding(html)
|
39
39
|
html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
|
40
40
|
html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
|
41
|
-
html.gsub! /\n|\r/," " #
|
41
|
+
html.gsub! /\n|\r/," " # Remove linebreaks
|
42
|
+
html.gsub! /“|”/, '"' # Straighten curly double quotes
|
43
|
+
html.gsub! /‘|’/, "'" # Straighten curly single quotes
|
42
44
|
html
|
43
45
|
end
|
44
46
|
|
@@ -51,7 +53,7 @@ class WordToMarkdown
|
|
51
53
|
end
|
52
54
|
|
53
55
|
def html
|
54
|
-
|
56
|
+
doc.to_html
|
55
57
|
end
|
56
58
|
|
57
59
|
def encoding(html)
|
@@ -78,7 +80,7 @@ class WordToMarkdown
|
|
78
80
|
def implicit_headings
|
79
81
|
@implicit_headings ||= begin
|
80
82
|
headings = []
|
81
|
-
|
83
|
+
doc.css("[style]").each do |element|
|
82
84
|
headings.push element unless element.font_size.nil? || element.font_size < MIN_HEADING_SIZE
|
83
85
|
end
|
84
86
|
headings
|
@@ -112,13 +114,16 @@ class WordToMarkdown
|
|
112
114
|
# Try to make semantic markup explicit where implied by the export
|
113
115
|
def semanticize!
|
114
116
|
# Convert unnumbered list paragraphs to actual unnumbered lists
|
115
|
-
|
117
|
+
doc.css(".#{LI_SELECTORS.join(",.")}").each { |node| node.node_name = "li" }
|
116
118
|
|
117
119
|
# Try to guess heading where implicit bassed on font size
|
118
120
|
implicit_headings.each do |element|
|
119
121
|
heading = guess_heading element
|
120
122
|
element.node_name = heading unless heading.nil?
|
121
123
|
end
|
124
|
+
|
125
|
+
# Removes paragraphs from tables
|
126
|
+
doc.search("td p").each { |node| node.node_name = "span" }
|
122
127
|
end
|
123
128
|
end
|
124
129
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: word-to-markdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
@@ -108,6 +108,20 @@ dependencies:
|
|
108
108
|
- - ">="
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: rerun
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
111
125
|
description: Ruby Gem to convert Word documents to markdown.
|
112
126
|
email: ben.balter@github.com
|
113
127
|
executables: []
|