word-to-markdown 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/word-to-markdown.rb +102 -0
  3. metadata +142 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 858e79fff023fe2b3150484359c1bccc480182bf
4
+ data.tar.gz: a6c2ac95b8be35efa54bd1938c31d08dc8bb0df0
5
+ SHA512:
6
+ metadata.gz: daa684a5bd4bda7eb465bc4c58c75dea0080d095943ead5d6e0696cd4c941cd80f268b3b9f638fc1e02f7875c50de757e3c85331054ee89bd173228cc650922f
7
+ data.tar.gz: 73466db83836034919b906dee2b549ced6077f8b6a343e9f19bb2488942056e618027873a3b43fa6bbaddcdb0a54acb9f010d6115cc1ad413269c5645ac775c8
@@ -0,0 +1,102 @@
1
+ require 'reverse_markdown'
2
+ require 'descriptive_statistics'
3
+
4
# Converts an HTML file (as exported by Word) into Markdown.
#
# The HTML is parsed with Nokogiri, rewritten in place so that markup implied
# by the export becomes explicit (see #semanticize!), and then handed to
# ReverseMarkdown when #to_s is called.
class WordToMarkdown

  HEADING_DEPTH = 6 # Number of headings to guess, e.g., h6
  HEADING_STEP = 100/HEADING_DEPTH # Percentile distance between two heading levels
  # CSS classes of the paragraphs that get renamed to <li> by #semanticize!
  LI_SELECTORS = %w[
    MsoListParagraphCxSpFirst
    MsoListParagraphCxSpMiddle
    MsoListParagraphCxSpLast
  ].freeze

  attr_reader :path, :doc, :html

  # Reads and parses the file at +path+, then makes implicit markup explicit.
  #
  # path - String path of the HTML file to convert.
  def initialize(path)
    @path = path
    # File.read closes the file handle itself; File.open(...).read left it
    # open until garbage collection.
    @html = File.read(@path).encode("UTF-8", :invalid => :replace, :replace => "")
    @doc = Nokogiri::HTML @html
    semanticize!
  end

  # Short, human readable representation that only shows the source path.
  def inspect
    "<WordToMarkdown path=\"#{@path}\">"
  end

  # Returns the document as a Markdown String. The result is memoized.
  def to_s
    @markdown ||= scrub_whitespace(ReverseMarkdown.parse(@doc.to_html))
  end

  # Normalizes the whitespace of a Markdown string.
  #
  # string - The String to clean up. It is not modified, so frozen strings
  #          are accepted as well.
  #
  # Returns a new String.
  def scrub_whitespace(string)
    string.
      sub(/\A[[:space:]]+/, '').                      # leading whitespace
      sub(/[[:space:]]+\z/, '').                      # trailing whitespace
      gsub(/\n\n \n\n/, "\n\n").                      # Quadruple line breaks
      gsub(/^([0-9]+)\.[[:space:]]*/, "\\1. ").       # Numbered lists
      gsub(/^-[[:space:]]*/, "- ")                    # Unnumbered lists
  end

  # Returns an array of Nokogiri nodes that are implicit headings, i.e. every
  # node with a style attribute for which font_size is not nil.
  def implicit_headings
    @implicit_headings ||= @doc.css("[style]").to_a.reject do |element|
      element.font_size.nil?
    end
  end

  # Returns an array of font-sizes for implicit headings in the document
  def font_sizes
    @font_sizes ||= implicit_headings.map { |element| element.font_size }
  end

  # Given a Nokogiri node, guess what heading it represents, if any.
  #
  # Levels 1 up to (but excluding) HEADING_DEPTH are tried in order and the
  # first one whose minimum font size the node reaches wins.
  #
  # Returns a tag name such as "h2", or nil when the node has no font size or
  # no level matches.
  def guess_heading(node)
    size = node.font_size
    return nil if size.nil?

    level = (1...HEADING_DEPTH).find { |heading| size >= h(heading) }
    "h#{level}" if level
  end

  # Minimum font size required for a given heading
  # e.g., H(2) would represent the minimum font size of an implicit h2
  #
  # The threshold is the ((HEADING_DEPTH - 1) - n) * HEADING_STEP percentile
  # of #font_sizes. It is cached per level because computing a percentile on
  # every comparison is needlessly repeated work.
  def h(n)
    @heading_thresholds ||= {}
    @heading_thresholds[n] ||= font_sizes.percentile(((HEADING_DEPTH - 1) - n) * HEADING_STEP)
  end

  # Try to make semantic markup explicit where implied by the export
  def semanticize!
    # Rename list paragraphs (matched by the LI_SELECTORS classes) to <li>
    @doc.css(".#{LI_SELECTORS.join(",.")}").each { |node| node.node_name = "li" }

    # Try to guess heading where implicit based on font size
    implicit_headings.each do |element|
      heading = guess_heading element
      element.node_name = heading unless heading.nil?
    end
  end
end
87
+
88
# Reopens Nokogiri's element class to add a reader for inline font sizes.
module Nokogiri
  module XML
    class Element

      # Captures the numeric part of a "font-size: <number>pt" declaration
      FONT_SIZE_REGEX = /\bfont-size:\s?([0-9\.]+)pt;?\b/

      # Font size, in points, taken from the element's style attribute.
      #
      # The captured number is converted with to_i, so "10.5pt" yields 10.
      # The result is memoized on first call, including a nil result, so the
      # style attribute is only inspected once per element.
      #
      # Returns an Integer, or nil when the style attribute holds no font
      # size given in points.
      def font_size
        # `||=` would recompute every time the answer is nil
        return @font_size if defined?(@font_size)

        match = FONT_SIZE_REGEX.match attr("style")
        @font_size = match && match[1].to_i
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,142 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: word-to-markdown
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Ben Balter
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-03-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: reverse_markdown
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.4.7
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.4.7
27
+ - !ruby/object:Gem::Dependency
28
+ name: descriptive_statistics
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.1.3
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.1.3
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: shoulda
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rdoc
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: bundler
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: pry
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: Ruby Gem to convert Word documents to markdown.
112
+ email: ben.balter@github.com
113
+ executables: []
114
+ extensions: []
115
+ extra_rdoc_files: []
116
+ files:
117
+ - lib/word-to-markdown.rb
118
+ homepage: https://github.com/benbalter/word-to-markdown
119
+ licenses:
120
+ - MIT
121
+ metadata: {}
122
+ post_install_message:
123
+ rdoc_options: []
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project:
138
+ rubygems_version: 2.2.0
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: Ruby Gem to convert Word documents to markdown
142
+ test_files: []