word-to-markdown 0.0.1 → 0.0.2

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/word-to-markdown.rb +46 -9
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 858e79fff023fe2b3150484359c1bccc480182bf
4
- data.tar.gz: a6c2ac95b8be35efa54bd1938c31d08dc8bb0df0
3
+ metadata.gz: 64e2eeda272f608f1d1bfd5690abdeebdd1f31d8
4
+ data.tar.gz: ad2cd8d38fe5893463097ddb41fd973e001642df
5
5
  SHA512:
6
- metadata.gz: daa684a5bd4bda7eb465bc4c58c75dea0080d095943ead5d6e0696cd4c941cd80f268b3b9f638fc1e02f7875c50de757e3c85331054ee89bd173228cc650922f
7
- data.tar.gz: 73466db83836034919b906dee2b549ced6077f8b6a343e9f19bb2488942056e618027873a3b43fa6bbaddcdb0a54acb9f010d6115cc1ad413269c5645ac775c8
6
+ metadata.gz: 37da10c45c34e29bd671c2d6833e81f754eaea14e07fbca693eb8b5d813db9a5a508d2bb483a0045f0d5df8de3d704db0ec8bfe25f2345c6358dd8d83eaea003
7
+ data.tar.gz: cff63c72bbc8fba7e994454875c01b56d9d239fa822762e30d317519d28e3ae7821f9414be923733f4a1cb48c888dcd7fd15c0c346c07d510687f11782507f55
data/lib/word-to-markdown.rb CHANGED
@@ -5,27 +5,62 @@ class WordToMarkdown
5
5
 
6
6
  HEADING_DEPTH = 6 # Number of headings to guess, e.g., h6
7
7
  HEADING_STEP = 100/HEADING_DEPTH
8
+ MIN_HEADING_SIZE = 20
9
+
8
10
  LI_SELECTORS = %w[
9
11
  MsoListParagraphCxSpFirst
10
12
  MsoListParagraphCxSpMiddle
11
13
  MsoListParagraphCxSpLast
12
14
  ]
13
15
 
14
- attr_reader :path, :doc, :html
15
-
16
- def initialize(path)
17
- @path = path
18
- @html = File.open(@path).read.encode("UTF-8", :invalid => :replace, :replace => "")
19
- @doc = Nokogiri::HTML @html
16
+ attr_reader :path, :doc
17
+
18
+ # Create a new WordToMarkdown object
19
+ #
20
+ # input - a HTML string or path to an HTML file
21
+ #
22
+ # Returns the WordToMarkdown object
23
+ def initialize(input)
24
+ path = File.expand_path input, Dir.pwd
25
+ if File.exist?(path)
26
+ html = File.open(path).read
27
+ @path = path
28
+ else
29
+ @path = String
30
+ html = input.to_s
31
+ end
32
+ @doc = Nokogiri::HTML normalize(html)
20
33
  semanticize!
21
34
  end
22
35
 
36
+ # Perform pre-processing normalization
37
+ def normalize(html)
38
+ encoding = encoding(html)
39
+ html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
40
+ html.gsub! /\<\/?o:[^>]+>/, "" # Strip everything in the office namespace
41
+ html.gsub! /\n|\r/," " # remove linebreaks
42
+ html
43
+ end
44
+
23
45
  def inspect
24
46
  "<WordToMarkdown path=\"#{@path}\">"
25
47
  end
26
48
 
27
49
  def to_s
28
- @markdown ||= scrub_whitespace(ReverseMarkdown.parse(@doc.to_html))
50
+ @markdown ||= scrub_whitespace(ReverseMarkdown.parse(html))
51
+ end
52
+
53
+ def html
54
+ @doc.to_html
55
+ end
56
+
57
+ def encoding(html)
58
+ match = html.encode("UTF-8", :invalid => :replace, :replace => "").match(/charset=([^\"]+)/)
59
+ if match
60
+ match[1].sub("macintosh", "MacRoman")
61
+ else
62
+ "UTF-8"
63
+ end
29
64
  end
30
65
 
31
66
  def scrub_whitespace(string)
@@ -33,7 +68,9 @@ class WordToMarkdown
33
68
  string.sub!(/[[:space:]]+\z/,'') # trailing whitespace
34
69
  string.gsub!(/\n\n \n\n/,"\n\n") # Quadruple line breaks
35
70
  string.gsub!(/^([0-9]+)\.[[:space:]]*/,"\\1. ") # Numbered lists
36
- string.gsub!(/^-[[:space:]]*/,"- ") # Unnumbered lists
71
+ string.gsub!(/^-[[:space:]·]*/,"- ") # Unnumbered lists
72
+ string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
73
+ string.gsub!(/^ /, "") # Leading spaces
37
74
  string
38
75
  end
39
76
 
@@ -42,7 +79,7 @@ class WordToMarkdown
42
79
  @implicit_headings ||= begin
43
80
  headings = []
44
81
  @doc.css("[style]").each do |element|
45
- headings.push element unless element.font_size.nil?
82
+ headings.push element unless element.font_size.nil? || element.font_size < MIN_HEADING_SIZE
46
83
  end
47
84
  headings
48
85
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: word-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-22 00:00:00.000000000 Z
11
+ date: 2014-03-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: reverse_markdown