word-to-markdown 1.1.1 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8826b727f290781e7713325662d056482018d730
4
- data.tar.gz: 8692363359366aac70359bf251ccba409f80bba7
3
+ metadata.gz: 08754da1501d6e4918d753a0727a6fc28fe17962
4
+ data.tar.gz: 4947e109b790a2c61575ce8349b0f11f51598320
5
5
  SHA512:
6
- metadata.gz: 8db1e766ad6bfc341de71a4fc82d14f8a3d109005f2bda9d779af2bd7b0f722d160cf873bff98cf56f9fa7929ab9282e945d249fc220fa56beafe46bd962cf9c
7
- data.tar.gz: fd84315c249e983a6bdf073a8573d2fbca82886ed05da15b8325ceca4f655b3181cee6ee20cf888c933a4b10794a5c5145ad86a8f9ebf8f9550163de9ce284da
6
+ metadata.gz: b70ac3cf257afc4eea9e591923bb43835d6f1a31b45bf0d6a39728acbbd55bc759e2d88f9f3d9456740d683552f4379aa046b321121cf5721f7114ab634d788c
7
+ data.tar.gz: f1e16e97e23ff20229f187474872496e1201409a5f606d25f5f62dba039ea2e8c66c20ead82278037861fe8d9a9256113f4064c894cf37015261d9b8635a83ca
data/bin/w2m CHANGED
@@ -2,10 +2,15 @@
2
2
 
3
3
  require 'word-to-markdown'
4
4
 
5
- if ARGV.size != 1
5
+ if ARGV.size != 1 || ARGV[0] == "--help"
6
6
  puts "Usage: bundle exec w2m path/to/document.docx"
7
7
  exit 1
8
8
  end
9
9
 
10
- doc = WordToMarkdown.new ARGV[0]
11
- puts doc.to_s
10
+ if ARGV[0] == "--version"
11
+ puts "WordToMarkdown v#{WordToMarkdown::VERSION}"
12
+ puts "LibreOffice #{WordToMarkdown.soffice_version}"
13
+ else
14
+ doc = WordToMarkdown.new ARGV[0]
15
+ puts doc.to_s
16
+ end
@@ -14,7 +14,6 @@ class WordToMarkdown
14
14
  end
15
15
 
16
16
  def convert!
17
-
18
17
  # Fonts and headings
19
18
  semanticize_font_styles!
20
19
  semanticize_headings!
@@ -95,18 +94,18 @@ class WordToMarkdown
95
94
 
96
95
  def remove_unicode_bullets_from_list_items!
97
96
  @document.tree.search("li span").each do |span|
98
- span.content = span.content.gsub /^([#{UNICODE_BULLETS.join("")}]+)/, ""
97
+ span.inner_html = span.inner_html.gsub /^([#{UNICODE_BULLETS.join("")}]+)/, ""
99
98
  end
100
99
  end
101
100
 
102
101
  def remove_numbering_from_list_items!
103
102
  @document.tree.search("li span").each do |span|
104
- span.content = span.content.gsub /^[a-zA-Z0-9]+\./m, ""
103
+ span.inner_html = span.inner_html.gsub /^[a-zA-Z0-9]+\./m, ""
105
104
  end
106
105
  end
107
106
 
108
107
  def remove_whitespace_from_list_items!
109
- @document.tree.search("li span").each { |span| span.content.strip! }
108
+ @document.tree.search("li span").each { |span| span.inner_html.strip! }
110
109
  end
111
110
 
112
111
  def semanticize_table_headers!
@@ -17,7 +17,7 @@ class WordToMarkdown
17
17
 
18
18
  def tree
19
19
  @tree ||= begin
20
- tree = Nokogiri::HTML(normalize(raw_html))
20
+ tree = Nokogiri::HTML(normalized_html)
21
21
  tree.css("title").remove
22
22
  tree
23
23
  end
@@ -38,8 +38,8 @@ class WordToMarkdown
38
38
  # html - the raw html export
39
39
  #
40
40
  # Returns the encoding, defaulting to "UTF-8"
41
- def encoding(html)
42
- match = html.encode("UTF-8", :invalid => :replace, :replace => "").match(/charset=([^\"]+)/)
41
+ def encoding
42
+ match = raw_html.encode("UTF-8", :invalid => :replace, :replace => "").match(/charset=([^\"]+)/)
43
43
  if match
44
44
  match[1].sub("macintosh", "MacRoman")
45
45
  else
@@ -54,9 +54,9 @@ class WordToMarkdown
54
54
  # html - the raw html input from the export
55
55
  #
56
56
  # Returns the normalized html
57
- def normalize(html)
58
- encoding = encoding(html)
59
- html = html.force_encoding(encoding).encode("UTF-8", :invalid => :replace, :replace => "")
57
+ def normalized_html
58
+ html = raw_html.force_encoding(encoding)
59
+ html = html.encode("UTF-8", :invalid => :replace, :replace => "")
60
60
  html = Premailer.new(html, :with_html_string => true, :input_encoding => "UTF-8").to_inline_css
61
61
  html.gsub! /\n|\r/," " # Remove linebreaks
62
62
  html.gsub! /“|”/, '"' # Straighten curly double quotes
@@ -71,9 +71,11 @@ class WordToMarkdown
71
71
  #
72
72
  # Returns the normalized markdown
73
73
  def scrub_whitespace(string)
74
- string.sub!(/\A[[:space:]]+/,'') # leading whitespace
75
- string.sub!(/[[:space:]]+\z/,'') # trailing whitespace
76
- string.gsub!(/\n\n \n\n/,"\n\n") # Quadruple line breaks
74
+ string.gsub!("&nbsp;", " ") # HTML encoded spaces
75
+ string.sub!(/\A[[:space:]]+/,'') # document leading whitespace
76
+ string.sub!(/[[:space:]]+\z/,'') # document trailing whitespace
77
+ string.gsub!(/([ ]+)$/, '') # line trailing whitespace
78
+ string.gsub!(/\n\n\n\n/,"\n\n") # Quadruple line breaks
77
79
  string.gsub!(/\u00A0/, "") # Unicode non-breaking spaces, injected as tabs
78
80
  string
79
81
  end
@@ -1,3 +1,3 @@
1
1
  class WordToMarkdown
2
- VERSION = "1.1.1"
2
+ VERSION = "1.1.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: word-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-10 00:00:00.000000000 Z
11
+ date: 2015-03-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: reverse_markdown