word-to-markdown 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/word-to-markdown.rb +102 -0
- metadata +142 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 858e79fff023fe2b3150484359c1bccc480182bf
|
4
|
+
data.tar.gz: a6c2ac95b8be35efa54bd1938c31d08dc8bb0df0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: daa684a5bd4bda7eb465bc4c58c75dea0080d095943ead5d6e0696cd4c941cd80f268b3b9f638fc1e02f7875c50de757e3c85331054ee89bd173228cc650922f
|
7
|
+
data.tar.gz: 73466db83836034919b906dee2b549ced6077f8b6a343e9f19bb2488942056e618027873a3b43fa6bbaddcdb0a54acb9f010d6115cc1ad413269c5645ac775c8
|
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'reverse_markdown'
|
2
|
+
require 'descriptive_statistics'
|
3
|
+
|
4
|
+
# Converts an HTML export of a Word document into Markdown.
#
# The document at +path+ is read and parsed with Nokogiri, its implied
# structure (list paragraphs, font-size based headings) is rewritten into
# explicit elements, and the result is handed to ReverseMarkdown when the
# object is rendered with #to_s.
class WordToMarkdown

  HEADING_DEPTH = 6 # Number of headings to guess, e.g., h6
  HEADING_STEP = 100 / HEADING_DEPTH # Integer division: percentile width per heading level

  # CSS class names that mark list-item paragraphs in the exported HTML.
  LI_SELECTORS = %w[
    MsoListParagraphCxSpFirst
    MsoListParagraphCxSpMiddle
    MsoListParagraphCxSpLast
  ].freeze

  attr_reader :path, :doc, :html

  # Reads and parses the document, then makes its implied markup explicit.
  #
  # path - String path of the HTML file to convert.
  def initialize(path)
    @path = path
    # File.read closes the file handle once the contents have been read.
    # NOTE(review): String#encode is documented as a no-op when the source
    # and target encodings are the same, so invalid bytes in input that is
    # already tagged UTF-8 may survive this call - confirm, and consider
    # String#scrub if they must be dropped.
    @html = File.read(@path).encode("UTF-8", :invalid => :replace, :replace => "")
    @doc = Nokogiri::HTML @html
    semanticize!
  end

  # Short representation that shows only the source path.
  def inspect
    "<WordToMarkdown path=\"#{@path}\">"
  end

  # Returns the document as Markdown. The conversion runs once and the
  # result is cached.
  def to_s
    @markdown ||= scrub_whitespace(ReverseMarkdown.parse(@doc.to_html))
  end

  # Normalizes whitespace in converted Markdown.
  #
  # string - The String to clean. It is not modified, so frozen strings are
  #          accepted.
  #
  # Returns a new String.
  def scrub_whitespace(string)
    string
      .sub(/\A[[:space:]]+/, '')                     # leading whitespace
      .sub(/[[:space:]]+\z/, '')                     # trailing whitespace
      .gsub(/\n\n \n\n/, "\n\n")                     # Quadruple line breaks
      .gsub(/^([0-9]+)\.[[:space:]]*/, "\\1. ")      # Numbered lists
      .gsub(/^-[[:space:]]*/, "- ")                  # Unnumbered lists
  end

  # Returns an array of Nokogiri nodes that are implicit headings, i.e.
  # elements whose style attribute yields a font size.
  def implicit_headings
    @implicit_headings ||= @doc.css("[style]").to_a.reject do |element|
      element.font_size.nil?
    end
  end

  # Returns an array of font-sizes for implicit headings in the document
  def font_sizes
    @font_sizes ||= implicit_headings.map(&:font_size)
  end

  # Given a Nokogiri node, guess what heading it represents, if any.
  #
  # Levels are tried from h1 down to h(HEADING_DEPTH - 1); the first level
  # whose minimum size the node reaches wins.
  #
  # Returns a tag name such as "h2", or nil when the node has no font size
  # or reaches no level.
  #
  # NOTE(review): the last level asks for the 0th percentile, which is
  # presumably the smallest size present, so every sized element would match
  # some level - confirm this is intended.
  def guess_heading(node)
    size = node.font_size
    return nil if size.nil?

    (1...HEADING_DEPTH).each do |level|
      return "h#{level}" if size >= h(level)
    end
    nil
  end

  # Minimum font size required for a given heading
  # e.g., H(2) would represent the minimum font size of an implicit h2
  def h(n)
    font_sizes.percentile(((HEADING_DEPTH - 1) - n) * HEADING_STEP)
  end

  # Try to make semantic markup explicit where implied by the export
  def semanticize!
    # Convert unnumbered list paragraphs to actual unnumbered lists
    list_selector = LI_SELECTORS.map { |css_class| ".#{css_class}" }.join(",")
    @doc.css(list_selector).each { |node| node.node_name = "li" }

    # Try to guess heading where implicit based on font size
    implicit_headings.each do |element|
      heading = guess_heading(element)
      element.node_name = heading unless heading.nil?
    end
  end
end
|
87
|
+
|
88
|
+
# Extends Nokogiri elements with a helper that reads the point size out of
# an inline style attribute.
module Nokogiri
  module XML
    class Element

      # Matches a "font-size: <number>pt" declaration and captures the number.
      FONT_SIZE_REGEX = /\bfont-size:\s?([0-9\.]+)pt;?\b/

      # Font size declared in this element's style attribute.
      #
      # The result is computed once per element and cached, including the
      # "no font size" case, so repeated calls do not re-run the regex.
      #
      # Returns the size in points as an Integer (a fractional value such as
      # "10.5" is truncated by String#to_i), or nil when the element has no
      # style attribute or no matching font-size declaration.
      def font_size
        # `||=` would not cache a nil result; check for the ivar instead.
        return @font_size if defined?(@font_size)

        match = FONT_SIZE_REGEX.match(attr("style").to_s)
        @font_size = match && match[1].to_i
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: word-to-markdown
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ben Balter
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-03-22 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: reverse_markdown
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.4.7
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.4.7
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: descriptive_statistics
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.1.3
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.1.3
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: shoulda
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rdoc
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: bundler
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: pry
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
description: Ruby Gem to convert Word documents to markdown.
|
112
|
+
email: ben.balter@github.com
|
113
|
+
executables: []
|
114
|
+
extensions: []
|
115
|
+
extra_rdoc_files: []
|
116
|
+
files:
|
117
|
+
- lib/word-to-markdown.rb
|
118
|
+
homepage: https://github.com/benbalter/word-to-markdown
|
119
|
+
licenses:
|
120
|
+
- MIT
|
121
|
+
metadata: {}
|
122
|
+
post_install_message:
|
123
|
+
rdoc_options: []
|
124
|
+
require_paths:
|
125
|
+
- lib
|
126
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
127
|
+
requirements:
|
128
|
+
- - ">="
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: '0'
|
131
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: '0'
|
136
|
+
requirements: []
|
137
|
+
rubyforge_project:
|
138
|
+
rubygems_version: 2.2.0
|
139
|
+
signing_key:
|
140
|
+
specification_version: 4
|
141
|
+
summary: Ruby Gem to convert Word documents to markdown
|
142
|
+
test_files: []
|