word-to-markdown 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/word-to-markdown.rb +46 -9
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 858e79fff023fe2b3150484359c1bccc480182bf
4
- data.tar.gz: a6c2ac95b8be35efa54bd1938c31d08dc8bb0df0
3
+ metadata.gz: 64e2eeda272f608f1d1bfd5690abdeebdd1f31d8
4
+ data.tar.gz: ad2cd8d38fe5893463097ddb41fd973e001642df
5
5
  SHA512:
6
- metadata.gz: daa684a5bd4bda7eb465bc4c58c75dea0080d095943ead5d6e0696cd4c941cd80f268b3b9f638fc1e02f7875c50de757e3c85331054ee89bd173228cc650922f
7
- data.tar.gz: 73466db83836034919b906dee2b549ced6077f8b6a343e9f19bb2488942056e618027873a3b43fa6bbaddcdb0a54acb9f010d6115cc1ad413269c5645ac775c8
6
+ metadata.gz: 37da10c45c34e29bd671c2d6833e81f754eaea14e07fbca693eb8b5d813db9a5a508d2bb483a0045f0d5df8de3d704db0ec8bfe25f2345c6358dd8d83eaea003
7
+ data.tar.gz: cff63c72bbc8fba7e994454875c01b56d9d239fa822762e30d317519d28e3ae7821f9414be923733f4a1cb48c888dcd7fd15c0c346c07d510687f11782507f55
@@ -5,27 +5,62 @@ class WordToMarkdown
5
5
 
6
6
  HEADING_DEPTH = 6 # Number of headings to guess, e.g., h6
7
7
  HEADING_STEP = 100/HEADING_DEPTH
8
+ MIN_HEADING_SIZE = 20
9
+
8
10
  LI_SELECTORS = %w[
9
11
  MsoListParagraphCxSpFirst
10
12
  MsoListParagraphCxSpMiddle
11
13
  MsoListParagraphCxSpLast
12
14
  ]
13
15
 
14
- attr_reader :path, :doc, :html
15
-
16
- def initialize(path)
17
- @path = path
18
- @html = File.open(@path).read.encode("UTF-8", :invalid => :replace, :replace => "")
19
- @doc = Nokogiri::HTML @html
16
+ attr_reader :path, :doc
17
+
18
+ # Create a new WordToMarkdown object
19
+ #
20
+ # input - a HTML string or path to an HTML file
21
+ #
22
+ # Returns the WordToMarkdown object
23
+ def initialize(input)
24
+ path = File.expand_path input, Dir.pwd
25
+ if File.exist?(path)
26
+ html = File.open(path).read
27
+ @path = path
28
+ else
29
+ @path = String
30
+ html = input.to_s
31
+ end
32
+ @doc = Nokogiri::HTML normalize(html)
20
33
  semanticize!
21
34
  end
22
35
 
36
+ # Perform pre-processing normalization
37
+ def normalize(html)
38
+ encoding = encoding(html)
39
+ html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
40
+ html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
41
+ html.gsub! /\n|\r/," " # remove linebreaks
42
+ html
43
+ end
44
+
23
45
  def inspect
24
46
  "<WordToMarkdown path=\"#{@path}\">"
25
47
  end
26
48
 
27
49
  def to_s
28
- @markdown ||= scrub_whitespace(ReverseMarkdown.parse(@doc.to_html))
50
+ @markdown ||= scrub_whitespace(ReverseMarkdown.parse(html))
51
+ end
52
+
53
+ def html
54
+ @doc.to_html
55
+ end
56
+
57
+ def encoding(html)
58
+ match = html.encode("UTF-8", :invalid => :replace, :replace => "").match(/charset=([^\"]+)/)
59
+ if match
60
+ match[1].sub("macintosh", "MacRoman")
61
+ else
62
+ "UTF-8"
63
+ end
29
64
  end
30
65
 
31
66
  def scrub_whitespace(string)
@@ -33,7 +68,9 @@ class WordToMarkdown
33
68
  string.sub!(/[[:space:]]+\z/,'') # trailing whitespace
34
69
  string.gsub!(/\n\n \n\n/,"\n\n") # Quadruple line breaks
35
70
  string.gsub!(/^([0-9]+)\.[[:space:]]*/,"\\1. ") # Numbered lists
36
- string.gsub!(/^-[[:space:]]*/,"- ") # Unnumbered lists
71
+ string.gsub!(/^-[[:space:]·]*/,"- ") # Unnumbered lists
72
+ string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
73
+ string.gsub!(/^ /, "") # Leading spaces
37
74
  string
38
75
  end
39
76
 
@@ -42,7 +79,7 @@ class WordToMarkdown
42
79
  @implicit_headings ||= begin
43
80
  headings = []
44
81
  @doc.css("[style]").each do |element|
45
- headings.push element unless element.font_size.nil?
82
+ headings.push element unless element.font_size.nil? || element.font_size < MIN_HEADING_SIZE
46
83
  end
47
84
  headings
48
85
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: word-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-22 00:00:00.000000000 Z
11
+ date: 2014-03-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: reverse_markdown