word-to-markdown 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/word-to-markdown.rb +102 -0
  3. metadata +142 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 858e79fff023fe2b3150484359c1bccc480182bf
4
+ data.tar.gz: a6c2ac95b8be35efa54bd1938c31d08dc8bb0df0
5
+ SHA512:
6
+ metadata.gz: daa684a5bd4bda7eb465bc4c58c75dea0080d095943ead5d6e0696cd4c941cd80f268b3b9f638fc1e02f7875c50de757e3c85331054ee89bd173228cc650922f
7
+ data.tar.gz: 73466db83836034919b906dee2b549ced6077f8b6a343e9f19bb2488942056e618027873a3b43fa6bbaddcdb0a54acb9f010d6115cc1ad413269c5645ac775c8
@@ -0,0 +1,102 @@
1
+ require 'reverse_markdown'
2
+ require 'descriptive_statistics'
3
+
4
# Converts an HTML document exported from Word into Markdown.
#
# On construction the file at +path+ is read, parsed with Nokogiri and
# "semanticized": list paragraphs are renamed to <li> and elements that carry
# an inline font size are renamed to heading tags. #to_s then hands the
# rewritten HTML to ReverseMarkdown and tidies the whitespace of the result.
class WordToMarkdown
  HEADING_DEPTH = 6 # Number of headings to guess, e.g., h6

  # Width, in percentile points, of each heading band.
  # Integer division: 100 / 6 == 16.
  HEADING_STEP = 100 / HEADING_DEPTH

  # CSS class names of the paragraphs that are converted to list items.
  LI_SELECTORS = %w[
    MsoListParagraphCxSpFirst
    MsoListParagraphCxSpMiddle
    MsoListParagraphCxSpLast
  ].freeze

  attr_reader :path, :doc, :html

  # path - location of the HTML file to convert.
  #
  # The file is read in one call (so the handle is closed immediately) and
  # encoded to UTF-8 with the invalid/replace options used below.
  def initialize(path)
    @path = path
    @html = File.read(@path).encode("UTF-8", invalid: :replace, replace: "")
    @doc = Nokogiri::HTML(@html)
    semanticize!
  end

  def inspect
    "<WordToMarkdown path=\"#{@path}\">"
  end

  # Returns the document as Markdown; the conversion is memoized.
  def to_s
    @markdown ||= scrub_whitespace(ReverseMarkdown.parse(@doc.to_html))
  end

  # Returns a cleaned-up copy of +string+. The argument itself is never
  # modified, so frozen strings are accepted.
  def scrub_whitespace(string)
    string
      .sub(/\A[[:space:]]+/, '')                  # leading whitespace
      .sub(/[[:space:]]+\z/, '')                  # trailing whitespace
      .gsub(/\n\n \n\n/, "\n\n")                  # Quadruple line breaks
      .gsub(/^([0-9]+)\.[[:space:]]*/, '\1. ')    # Numbered lists
      .gsub(/^-[[:space:]]*/, '- ')               # Unnumbered lists
  end

  # Returns an array of Nokogiri nodes that are implicit headings, i.e. every
  # element with a style attribute for which #font_size is not nil.
  def implicit_headings
    @implicit_headings ||= @doc.css("[style]").select { |element| !element.font_size.nil? }
  end

  # Returns an array of font-sizes for implicit headings in the document
  def font_sizes
    @font_sizes ||= implicit_headings.map { |element| element.font_size }
  end

  # Given a Nokogiri node, guess what heading it represents, if any.
  #
  # Levels 1 through HEADING_DEPTH - 1 are tried in order and the first level
  # whose minimum size (see #h) the node reaches wins. Returns the tag name
  # as a String (e.g. "h2"), or nil when the node has no font size or reaches
  # no threshold.
  def guess_heading(node)
    size = node.font_size
    return nil if size.nil?

    level = (1...HEADING_DEPTH).find { |heading| size >= h(heading) }
    level && "h#{level}"
  end

  # Minimum font size required for a given heading
  # e.g., h(2) would represent the minimum font size of an implicit h2
  #
  # The threshold is the ((HEADING_DEPTH - 1) - n) * HEADING_STEP percentile
  # of #font_sizes; `percentile` is not a core Array method and is presumably
  # supplied by the descriptive_statistics gem this file requires.
  def h(n)
    font_sizes.percentile(((HEADING_DEPTH - 1) - n) * HEADING_STEP)
  end

  # Try to make semantic markup explicit where implied by the export
  def semanticize!
    # Convert unnumbered list paragraphs to actual unnumbered lists
    li_selector = LI_SELECTORS.map { |css_class| ".#{css_class}" }.join(",")
    @doc.css(li_selector).each { |node| node.node_name = "li" }

    # Try to guess heading where implicit based on font size
    implicit_headings.each do |element|
      heading = guess_heading(element)
      element.node_name = heading unless heading.nil?
    end
  end
end
87
+
88
# Reopens Nokogiri's element class to expose the inline font size of a node.
module Nokogiri
  module XML
    class Element
      # Matches a `font-size` declaration given in points inside a style
      # attribute and captures the number.
      #
      # The lookbehind rejects a `font-size` that is only the tail of a longer
      # property name (e.g. `mso-bidi-font-size`). A plain `\b` is not enough
      # because a word boundary also sits between "-" and "f".
      FONT_SIZE_REGEX = /(?<![\w-])font-size:\s?([0-9\.]+)pt;?\b/

      # Returns the point size declared in this element's style attribute as
      # an Integer (fractional sizes are truncated by to_i), or nil when the
      # element has no style attribute or no matching declaration.
      # A non-nil result is memoized on the node.
      def font_size
        @font_size ||= begin
          match = FONT_SIZE_REGEX.match(attr("style"))
          match[1].to_i if match
        end
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,142 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: word-to-markdown
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Ben Balter
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-03-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: reverse_markdown
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.4.7
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.4.7
27
+ - !ruby/object:Gem::Dependency
28
+ name: descriptive_statistics
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.1.3
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.1.3
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: shoulda
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rdoc
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: bundler
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: pry
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: Ruby Gem to convert Word documents to markdown.
112
+ email: ben.balter@github.com
113
+ executables: []
114
+ extensions: []
115
+ extra_rdoc_files: []
116
+ files:
117
+ - lib/word-to-markdown.rb
118
+ homepage: https://github.com/benbalter/word-to-markdown
119
+ licenses:
120
+ - MIT
121
+ metadata: {}
122
+ post_install_message:
123
+ rdoc_options: []
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project:
138
+ rubygems_version: 2.2.0
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: Ruby Gem to convert Word documents to markdown
142
+ test_files: []