html2doc 1.3.1 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 56d8c42bd609845f35a5a994fed43d12ebc9fb0d8d303fd60f9a064f4da26a7b
4
- data.tar.gz: e9310883dbc5991640e66a1c085d6bcb2ca87155449326b7076489e78d64d187
3
+ metadata.gz: df61b49c5ba557bf2742f1d7240d5990322f2be01019dc6dd712eeecc0752e61
4
+ data.tar.gz: d85fdda65fcc3c7ed6bdcd5a38501549abc03d536ab00ac9bcb28f061790a3fd
5
5
  SHA512:
6
- metadata.gz: 8d7076b196634dc81a3942a59155c7c80da21b9eb68721dab437170c54876f970b80448fa31f520648145eca9ace1fea0c7751be04021f9c1f95fe0bf3fa64ce
7
- data.tar.gz: 532b022bda9cc4fb88eafeb467c7d6d26ba8dc5ea21f5553ba251e8b92469de6e906f5947a3fdb3bbee241cbaea8805f3477978bbe78af359a6bb7140399a971
6
+ metadata.gz: 6173d729141614e61dfd5502c3ab0e6192b6c4fdf95b8689d882b14f97401086eab3188ac0c7a5154464a0c735226fa762041c926fe2827e8b936143e422ac29
7
+ data.tar.gz: dcc3a65d88d7ded0855930ac56f1848c72fa4ac45fa86187f2c2df4af9af593ec0f292a16e3df73a60a99d515dfff9dfaba7205c1bf633a425f54ead25b87760
data/README.adoc CHANGED
@@ -58,14 +58,14 @@ There there are two other Microsoft Word vendors in the Ruby ecosystem.
58
58
  --
59
59
  require "html2doc"
60
60
 
61
- Html2Doc.process(result, filename: filename, imagedir: imagedir, stylesheet: stylesheet, header_filename: header_filename, dir: dir, asciimathdelims: asciimathdelims, liststyles: liststyles)
61
+ Html2Doc.new(filename: filename, imagedir: imagedir, stylesheet: stylesheet, header_file: header_filename, dir: dir, asciimathdelims: asciimathdelims, liststyles: liststyles).process(result)
62
62
  --
63
63
 
64
64
  result:: is the Html document to be converted into Word, as a string.
65
65
  filename:: is the name the document is to be saved as, without a file suffix
66
66
  imagedir:: base directory for local image file names in source XML
67
67
  stylesheet:: is the full path filename of the CSS stylesheet for Microsoft Word-specific styles. If this is not provided, the program will use the default stylesheet included in the gem, `lib/html2doc/wordstyle.css`. The stylesheet provided must match this stylesheet; you can obtain one by saving a Word document with your desired styles to HTML, and extracting the style definitions from the HTML document header.
68
- header_filename:: is the filename of the HTML document containing header and footer for the document, as well as footnote/endnote separators; if there is none, use nil. To generate your own such document, save a Word document with headers/footers and/or footnote/endnote separators as an HTML document; the `header.html` will be in the `{filename}.fld` folder generated along with the HTML. A sample file is available at https://github.com/metanorma/metanorma-iso/blob/master/lib/asciidoctor/iso/word/header.html
68
+ header_file:: is the filename of the HTML document containing header and footer for the document, as well as footnote/endnote separators; if there is none, use nil. To generate your own such document, save a Word document with headers/footers and/or footnote/endnote separators as an HTML document; the `header.html` will be in the `{filename}.fld` folder generated along with the HTML. A sample file is available at https://github.com/metanorma/metanorma-iso/blob/master/lib/asciidoctor/iso/word/header.html
69
69
  dir:: is the folder that any ancillary files (images, headers, filelist) are to be saved to. If not provided, it will be created as `{filename}_files`. Anything in the directory will be attached to the Word document; so this folder should only contain the images that accompany the document. (If the images are elsewhere on the local drive, the gem will move them into the folder. External URL images are left alone, and are not downloaded.)
70
70
  asciimathdelims:: are the AsciiMath delimiters used in the text (an array of an opening and a closing delimiter). If none are provided, no AsciiMath conversion is attempted.
71
71
  liststyles:: a hash of list style labels in Word CSS, which are used to define the behaviour of list item labels (e.g. _i)_ vs _i._). The gem recognises the hash keys `ul`, `ol`. So if the appearance of an ordered list's item labels in the supplied stylesheet is governed by style `@list l1` (e.g. `@list l1:level1 {mso-level-text:"%1\)";}` appears in the stylesheet), call the method with `liststyles:{ol: "l1"}`. The lists that the `ul` and `ol` list styles are applied to are assumed not to have any CSS class. If there are any additional hash keys, they are assumed to be classes applied to the topmost ordered or unordered list; e.g. `liststyles:{steps: "l5"}` means that any list with class `steps` at the topmost level has the list style `l5` recursively applied to it. Any top-level lists without a class named in liststyles will be treated like lists with no CSS class.
data/bin/html2doc CHANGED
@@ -21,8 +21,7 @@ if ARGV.length < 1
21
21
  end
22
22
 
23
23
  Html2Doc.process(
24
- File.read(ARGV[0], encoding: "utf-8"),
25
24
  filename: ARGV[0].gsub(/\.html?$/, ""),
26
25
  stylesheet: options[:stylesheet],
27
26
  header: options[:header],
28
- )
27
+ ).process(File.read(ARGV[0], encoding: "utf-8"))
data/lib/html2doc/base.rb CHANGED
@@ -4,27 +4,40 @@ require "htmlentities"
4
4
  require "nokogiri"
5
5
  require "fileutils"
6
6
 
7
- module Html2Doc
8
- def self.process(result, hash)
9
- hash[:dir1] = create_dir(hash[:filename], hash[:dir])
10
- result = process_html(result, hash)
11
- process_header(hash[:header_file], hash)
12
- generate_filelist(hash[:filename], hash[:dir1])
13
- File.open("#{hash[:filename]}.htm", "w:UTF-8") { |f| f.write(result) }
14
- mime_package result, hash[:filename], hash[:dir1]
15
- rm_temp_files(hash[:filename], hash[:dir], hash[:dir1]) unless hash[:debug]
16
- end
17
-
18
- def self.process_header(headerfile, hash)
7
+ class Html2Doc
8
+ def initialize(hash)
9
+ @filename = hash[:filename]
10
+ @dir = hash[:dir]
11
+ @dir1 = create_dir(@filename, @dir)
12
+ @header_file = hash[:header_file]
13
+ @asciimathdelims = hash[:asciimathdelims]
14
+ @imagedir = hash[:imagedir]
15
+ @debug = hash[:debug]
16
+ @liststyles = hash[:liststyles]
17
+ @xsltemplate =
18
+ Nokogiri::XSLT(File.read(File.join(File.dirname(__FILE__), "mml2omml.xsl"),
19
+ encoding: "utf-8"))
20
+ end
21
+
22
+ def process(result)
23
+ result = process_html(result)
24
+ process_header(@header_file)
25
+ generate_filelist(@filename, @dir1)
26
+ File.open("#{@filename}.htm", "w:UTF-8") { |f| f.write(result) }
27
+ mime_package result, @filename, @dir1
28
+ rm_temp_files(@filename, @dir, @dir1) unless @debug
29
+ end
30
+
31
+ def process_header(headerfile)
19
32
  return if headerfile.nil?
20
33
 
21
34
  doc = File.read(headerfile, encoding: "utf-8")
22
- doc = header_image_cleanup(doc, hash[:dir1], hash[:filename],
23
- File.dirname(hash[:filename]))
24
- File.open("#{hash[:dir1]}/header.html", "w:UTF-8") { |f| f.write(doc) }
35
+ doc = header_image_cleanup(doc, @dir1, @filename,
36
+ File.dirname(@filename))
37
+ File.open("#{@dir1}/header.html", "w:UTF-8") { |f| f.write(doc) }
25
38
  end
26
39
 
27
- def self.clear_dir(dir)
40
+ def clear_dir(dir)
28
41
  Dir.foreach(dir) do |f|
29
42
  fn = File.join(dir, f)
30
43
  File.delete(fn) if f != "." && f != ".."
@@ -32,30 +45,30 @@ module Html2Doc
32
45
  dir
33
46
  end
34
47
 
35
- def self.create_dir(filename, dir)
48
+ def create_dir(filename, dir)
36
49
  dir and return clear_dir(dir)
37
50
  dir = "#{filename}_files"
38
51
  Dir.mkdir(dir) unless File.exists?(dir)
39
52
  clear_dir(dir)
40
53
  end
41
54
 
42
- def self.process_html(result, hash)
43
- docxml = to_xhtml(asciimath_to_mathml(result, hash[:asciimathdelims]))
44
- define_head(cleanup(docxml, hash), hash)
55
+ def process_html(result)
56
+ docxml = to_xhtml(asciimath_to_mathml(result, @asciimathdelims))
57
+ define_head(cleanup(docxml))
45
58
  msword_fix(from_xhtml(docxml))
46
59
  end
47
60
 
48
- def self.rm_temp_files(filename, dir, dir1)
61
+ def rm_temp_files(filename, dir, dir1)
49
62
  FileUtils.rm "#{filename}.htm"
50
63
  FileUtils.rm_f "#{dir1}/header.html"
51
64
  FileUtils.rm_r dir1 unless dir
52
65
  end
53
66
 
54
- def self.cleanup(docxml, hash)
67
+ def cleanup(docxml)
55
68
  namespace(docxml.root)
56
- image_cleanup(docxml, hash[:dir1], hash[:imagedir])
69
+ image_cleanup(docxml, @dir1, @imagedir)
57
70
  mathml_to_ooml(docxml)
58
- lists(docxml, hash[:liststyles])
71
+ lists(docxml, @liststyles)
59
72
  footnotes(docxml)
60
73
  bookmarks(docxml)
61
74
  msonormal(docxml)
@@ -70,7 +83,7 @@ module Html2Doc
70
83
  <body> </body> </html>
71
84
  HERE
72
85
 
73
- def self.to_xhtml(xml)
86
+ def to_xhtml(xml)
74
87
  xml.gsub!(/<\?xml[^>]*>/, "")
75
88
  unless /<!DOCTYPE /.match? xml
76
89
  xml = '<!DOCTYPE html SYSTEM
@@ -85,7 +98,7 @@ module Html2Doc
85
98
  <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
86
99
  DOCTYPE
87
100
 
88
- def self.from_xhtml(xml)
101
+ def from_xhtml(xml)
89
102
  xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
90
103
  .sub(DOCTYPE, "").gsub(%{ />}, "/>")
91
104
  .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
@@ -93,7 +106,7 @@ module Html2Doc
93
106
  .gsub("\n--&gt;\n", "\n-->\n")
94
107
  end
95
108
 
96
- def self.msword_fix(doc)
109
+ def msword_fix(doc)
97
110
  # brain damage in MSWord parser
98
111
  doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
99
112
  "<w:DoNotOptimizeForBrowser/>")
@@ -133,7 +146,7 @@ module Html2Doc
133
146
  <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
134
147
  XML
135
148
 
136
- def self.define_head1(docxml, _dir)
149
+ def define_head1(docxml, _dir)
137
150
  docxml.xpath("//*[local-name() = 'head']").each do |h|
138
151
  h.children.first.add_previous_sibling <<~XML
139
152
  #{PRINT_VIEW}
@@ -142,7 +155,7 @@ module Html2Doc
142
155
  end
143
156
  end
144
157
 
145
- def self.filename_substitute(head, header_filename)
158
+ def filename_substitute(head, header_filename)
146
159
  return if header_filename.nil?
147
160
 
148
161
  head.xpath(".//*[local-name() = 'style']").each do |s|
@@ -153,30 +166,30 @@ module Html2Doc
153
166
  end
154
167
  end
155
168
 
156
- def self.stylesheet(_filename, _header_filename, cssname)
169
+ def stylesheet(_filename, _header_filename, cssname)
157
170
  (cssname.nil? || cssname.empty?) and
158
171
  cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
159
172
  stylesheet = File.read(cssname, encoding: "UTF-8")
160
173
  xml = Nokogiri::XML("<style/>")
161
- #s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
162
- #xml.children.first << Nokogiri::XML::Comment.new(xml, s)
174
+ # s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
175
+ # xml.children.first << Nokogiri::XML::Comment.new(xml, s)
163
176
  xml.children.first << Nokogiri::XML::CDATA
164
177
  .new(xml, "\n<!--\n#{stylesheet}\n-->\n")
165
178
 
166
179
  xml.root.to_s
167
180
  end
168
181
 
169
- def self.define_head(docxml, hash)
182
+ def define_head(docxml)
170
183
  title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
171
184
  head = docxml.at("//*[local-name() = 'head']")
172
- css = stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])
185
+ css = stylesheet(@filename, @header_file, @stylesheet)
173
186
  add_stylesheet(head, title, css)
174
- filename_substitute(head, hash[:header_file])
175
- define_head1(docxml, hash[:dir1])
187
+ filename_substitute(head, @header_file)
188
+ define_head1(docxml, @dir1)
176
189
  rootnamespace(docxml.root)
177
190
  end
178
191
 
179
- def self.add_stylesheet(head, title, css)
192
+ def add_stylesheet(head, title, css)
180
193
  if head.children.empty?
181
194
  head.add_child css
182
195
  elsif title.nil?
@@ -186,7 +199,7 @@ module Html2Doc
186
199
  end
187
200
  end
188
201
 
189
- def self.namespace(root)
202
+ def namespace(root)
190
203
  {
191
204
  o: "urn:schemas-microsoft-com:office:office",
192
205
  w: "urn:schemas-microsoft-com:office:word",
@@ -195,11 +208,11 @@ module Html2Doc
195
208
  }.each { |k, v| root.add_namespace_definition(k.to_s, v) }
196
209
  end
197
210
 
198
- def self.rootnamespace(root)
211
+ def rootnamespace(root)
199
212
  root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
200
213
  end
201
214
 
202
- def self.bookmarks(docxml)
215
+ def bookmarks(docxml)
203
216
  docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
204
217
  .each do |x|
205
218
  next if x["id"].empty? ||
@@ -212,7 +225,7 @@ module Html2Doc
212
225
  end
213
226
  end
214
227
 
215
- def self.msonormal(docxml)
228
+ def msonormal(docxml)
216
229
  docxml.xpath("//*[local-name() = 'p'][not(self::*[@class])]").each do |p|
217
230
  p["class"] = "MsoNormal"
218
231
  end
@@ -3,8 +3,8 @@ require "asciimath"
3
3
  require "htmlentities"
4
4
  require "nokogiri"
5
5
 
6
- module Html2Doc
7
- def self.style_list(elem, level, liststyle, listnumber)
6
+ class Html2Doc
7
+ def style_list(elem, level, liststyle, listnumber)
8
8
  return unless liststyle
9
9
 
10
10
  if elem["style"]
@@ -15,7 +15,7 @@ module Html2Doc
15
15
  elem["style"] += "mso-list:#{liststyle} level#{level} lfo#{listnumber};"
16
16
  end
17
17
 
18
- def self.list_add1(elem, liststyles, listtype, level)
18
+ def list_add1(elem, liststyles, listtype, level)
19
19
  if %i[ul ol].include? listtype
20
20
  list_add(elem.xpath(".//ul") - elem.xpath(".//ul//ul | .//ol//ul"),
21
21
  liststyles, :ul, level + 1)
@@ -29,7 +29,7 @@ module Html2Doc
29
29
  end
30
30
  end
31
31
 
32
- def self.list_add(xpath, liststyles, listtype, level)
32
+ def list_add(xpath, liststyles, listtype, level)
33
33
  xpath.each_with_index do |l, _i|
34
34
  @listnumber += 1 if level == 1
35
35
  l["seen"] = true if level == 1
@@ -46,7 +46,7 @@ module Html2Doc
46
46
  end
47
47
  end
48
48
 
49
- def self.list2para(list)
49
+ def list2para(list)
50
50
  return if list.xpath("./li").empty?
51
51
 
52
52
  list.xpath("./li").first["class"] ||= "MsoListParagraphCxSpFirst"
@@ -63,7 +63,7 @@ module Html2Doc
63
63
 
64
64
  TOPLIST = "[not(ancestor::ul) and not(ancestor::ol)]".freeze
65
65
 
66
- def self.lists1(docxml, liststyles, style)
66
+ def lists1(docxml, liststyles, style)
67
67
  case style
68
68
  when :ul then list_add(docxml.xpath("//ul[not(@class)]#{TOPLIST}"),
69
69
  liststyles, :ul, 1)
@@ -76,7 +76,7 @@ module Html2Doc
76
76
  end
77
77
  end
78
78
 
79
- def self.lists_unstyled(docxml, liststyles)
79
+ def lists_unstyled(docxml, liststyles)
80
80
  liststyles.has_key?(:ul) and
81
81
  list_add(docxml.xpath("//ul#{TOPLIST}[not(@seen)]"),
82
82
  liststyles, :ul, 1)
@@ -88,7 +88,7 @@ module Html2Doc
88
88
  end
89
89
  end
90
90
 
91
- def self.lists(docxml, liststyles)
91
+ def lists(docxml, liststyles)
92
92
  return if liststyles.nil?
93
93
 
94
94
  @listnumber = 0
data/lib/html2doc/math.rb CHANGED
@@ -4,12 +4,8 @@ require "htmlentities"
4
4
  require "nokogiri"
5
5
  require "plane1converter"
6
6
 
7
- module Html2Doc
8
- @xsltemplate =
9
- Nokogiri::XSLT(File.read(File.join(File.dirname(__FILE__), "mml2omml.xsl"),
10
- encoding: "utf-8"))
11
-
12
- def self.asciimath_to_mathml1(expr)
7
+ class Html2Doc
8
+ def asciimath_to_mathml1(expr)
13
9
  AsciiMath::MathMLBuilder.new(msword: true).append_expression(
14
10
  AsciiMath.parse(HTMLEntities.new.decode(expr)).ast,
15
11
  ).to_s
@@ -20,7 +16,7 @@ module Html2Doc
20
16
  raise e
21
17
  end
22
18
 
23
- def self.asciimath_to_mathml(doc, delims)
19
+ def asciimath_to_mathml(doc, delims)
24
20
  return doc if delims.nil? || delims.size < 2
25
21
 
26
22
  m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
@@ -31,13 +27,13 @@ module Html2Doc
31
27
  end.join
32
28
  end
33
29
 
34
- def self.progress_conv(idx, step, total, threshold, msg)
30
+ def progress_conv(idx, step, total, threshold, msg)
35
31
  return unless (idx % step).zero? && total > threshold && idx.positive?
36
32
 
37
33
  warn "#{msg} #{idx} of #{total}"
38
34
  end
39
35
 
40
- def self.unwrap_accents(doc)
36
+ def unwrap_accents(doc)
41
37
  doc.xpath("//*[@accent = 'true']").each do |x|
42
38
  x.elements.length > 1 or next
43
39
  x.elements[1].name == "mrow" and
@@ -47,7 +43,7 @@ module Html2Doc
47
43
  end
48
44
 
49
45
  # random fixes to MathML input that OOXML needs to render properly
50
- def self.ooxml_cleanup(math, docnamespaces)
46
+ def ooxml_cleanup(math, docnamespaces)
51
47
  math = unwrap_accents(
52
48
  mathml_preserve_space(
53
49
  mathml_insert_rows(math, docnamespaces), docnamespaces
@@ -57,7 +53,7 @@ module Html2Doc
57
53
  math
58
54
  end
59
55
 
60
- def self.mathml_insert_rows(math, docnamespaces)
56
+ def mathml_insert_rows(math, docnamespaces)
61
57
  math.xpath(%w(msup msub msubsup munder mover munderover)
62
58
  .map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
63
59
  next unless x.next_element && x.next_element != "mrow"
@@ -67,7 +63,7 @@ module Html2Doc
67
63
  math
68
64
  end
69
65
 
70
- def self.mathml_preserve_space(math, docnamespaces)
66
+ def mathml_preserve_space(math, docnamespaces)
71
67
  math.xpath(".//xmlns:mtext", docnamespaces).each do |x|
72
68
  x.children = x.children.to_xml.gsub(/^\s/, "&#xA0;").gsub(/\s$/, "&#xA0;")
73
69
  end
@@ -76,7 +72,7 @@ module Html2Doc
76
72
 
77
73
  HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
78
74
 
79
- def self.unitalic(math)
75
+ def unitalic(math)
80
76
  math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x|
81
77
  x.wrap("<span #{HTML_NS} style='font-style:normal;'></span>")
82
78
  end
@@ -122,7 +118,7 @@ module Html2Doc
122
118
  math
123
119
  end
124
120
 
125
- def self.to_plane1(xml, font)
121
+ def to_plane1(xml, font)
126
122
  xml.traverse do |n|
127
123
  next unless n.text?
128
124
 
@@ -131,7 +127,7 @@ module Html2Doc
131
127
  xml
132
128
  end
133
129
 
134
- def self.mathml_to_ooml(docxml)
130
+ def mathml_to_ooml(docxml)
135
131
  docnamespaces = docxml.collect_namespaces
136
132
  m = docxml.xpath("//*[local-name() = 'math']")
137
133
  m.each_with_index do |x, i|
@@ -144,14 +140,14 @@ module Html2Doc
144
140
  # namespaces.
145
141
  # We will end up stripping them out again under Nokogiri 1.11, which correctly
146
142
  # insists on inheriting namespace from parent.
147
- def self.ooml_clean(xml)
143
+ def ooml_clean(xml)
148
144
  xml.to_s
149
145
  .gsub(/<\?[^>]+>\s*/, "")
150
146
  .gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
151
147
  .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
152
148
  end
153
149
 
154
- def self.mathml_to_ooml1(xml, docnamespaces)
150
+ def mathml_to_ooml1(xml, docnamespaces)
155
151
  doc = Nokogiri::XML::Document::new
156
152
  doc.root = ooxml_cleanup(xml, docnamespaces)
157
153
  ooxml = ooml_clean(unitalic(esc_space(accent_tr(@xsltemplate.transform(doc)))))
@@ -159,7 +155,7 @@ module Html2Doc
159
155
  xml.swap(ooxml)
160
156
  end
161
157
 
162
- def self.accent_tr(xml)
158
+ def accent_tr(xml)
163
159
  xml.xpath(".//*[local-name()='accPr']/*[local-name()='chr']").each do |x|
164
160
  x["m:val"] &&= accent_tr1(x["m:val"])
165
161
  x["val"] &&= accent_tr1(x["val"])
@@ -167,7 +163,7 @@ module Html2Doc
167
163
  xml
168
164
  end
169
165
 
170
- def self.accent_tr1(accent)
166
+ def accent_tr1(accent)
171
167
  case accent
172
168
  when "\u2192" then "\u20D7"
173
169
  when "^" then "\u0302"
@@ -178,7 +174,7 @@ module Html2Doc
178
174
 
179
175
  # escape space as &#x32;; we are removing any spaces generated by
180
176
  # XML indentation
181
- def self.esc_space(xml)
177
+ def esc_space(xml)
182
178
  xml.traverse do |n|
183
179
  next unless n.text?
184
180
 
@@ -189,7 +185,7 @@ module Html2Doc
189
185
 
190
186
  # if oomml has no siblings, by default it is centered; override this with
191
187
  # left/right if parent is so tagged
192
- def self.uncenter(math, ooxml)
188
+ def uncenter(math, ooxml)
193
189
  alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
194
190
  "local-name() = 'div' or local-name() = 'td']/@style")
195
191
  return ooxml unless alignnode && (math.next == nil && math.previous == nil)
data/lib/html2doc/mime.rb CHANGED
@@ -4,8 +4,8 @@ require "mime/types"
4
4
  require "image_size"
5
5
  require "fileutils"
6
6
 
7
- module Html2Doc
8
- def self.mime_preamble(boundary, filename, result)
7
+ class Html2Doc
8
+ def mime_preamble(boundary, filename, result)
9
9
  <<~"PREAMBLE"
10
10
  MIME-Version: 1.0
11
11
  Content-Type: multipart/related; boundary="#{boundary}"
@@ -20,7 +20,7 @@ module Html2Doc
20
20
  PREAMBLE
21
21
  end
22
22
 
23
- def self.mime_attachment(boundary, _filename, item, dir)
23
+ def mime_attachment(boundary, _filename, item, dir)
24
24
  content_type = mime_type(item)
25
25
  text_mode = %w[text application].any? { |p| content_type.start_with? p }
26
26
 
@@ -40,19 +40,19 @@ module Html2Doc
40
40
  FILE
41
41
  end
42
42
 
43
- def self.mime_type(item)
43
+ def mime_type(item)
44
44
  types = MIME::Types.type_for(item)
45
45
  type = types ? types.first.to_s : 'text/plain; charset="utf-8"'
46
46
  type = %(#{type} charset="utf-8") if /^text/.match(type) && types
47
47
  type
48
48
  end
49
49
 
50
- def self.mime_boundary
50
+ def mime_boundary
51
51
  salt = UUIDTools::UUID.random_create.to_s.gsub(/-/, ".")[0..17]
52
52
  "----=_NextPart_#{salt}"
53
53
  end
54
54
 
55
- def self.mime_package(result, filename, dir)
55
+ def mime_package(result, filename, dir)
56
56
  boundary = mime_boundary
57
57
  mhtml = mime_preamble(boundary, "#{filename}.htm", result)
58
58
  mhtml += mime_attachment(boundary, "#{filename}.htm", "filelist.xml", dir)
@@ -66,7 +66,7 @@ module Html2Doc
66
66
  File.open("#{filename}.doc", "w:UTF-8") { |f| f.write contentid(mhtml) }
67
67
  end
68
68
 
69
- def self.contentid(mhtml)
69
+ def contentid(mhtml)
70
70
  mhtml.gsub %r{(<img[^>]*?src=")([^\"']+)(['"])}m do |m|
71
71
  repl = "#{$1}cid:#{File.basename($2)}#{$3}"
72
72
  /^data:|^https?:/.match($2) ? m : repl
@@ -77,7 +77,7 @@ module Html2Doc
77
77
  end
78
78
 
79
79
  # max width for Word document is 400, max height is 680
80
- def self.image_resize(img, path, maxheight, maxwidth)
80
+ def image_resize(img, path, maxheight, maxwidth)
81
81
  realsize = ImageSize.path(path).size
82
82
  s = [img["width"].to_i, img["height"].to_i]
83
83
  s = realsize if s[0].zero? && s[1].zero?
@@ -92,20 +92,20 @@ module Html2Doc
92
92
 
93
93
  IMAGE_PATH = "//*[local-name() = 'img' or local-name() = 'imagedata']".freeze
94
94
 
95
- def self.mkuuid
95
+ def mkuuid
96
96
  UUIDTools::UUID.random_create.to_s
97
97
  end
98
98
 
99
- def self.warnsvg(src)
99
+ def warnsvg(src)
100
100
  warn "#{src}: SVG not supported" if /\.svg$/i.match?(src)
101
101
  end
102
102
 
103
- def self.localname(src, localdir)
103
+ def localname(src, localdir)
104
104
  %r{^([A-Z]:)?/}.match?(src) ? src : File.join(localdir, src)
105
105
  end
106
106
 
107
107
  # only processes locally stored images
108
- def self.image_cleanup(docxml, dir, localdir)
108
+ def image_cleanup(docxml, dir, localdir)
109
109
  docxml.traverse do |i|
110
110
  src = i["src"]
111
111
  next unless i.element? && %w(img v:imagedata).include?(i.name)
@@ -123,13 +123,13 @@ module Html2Doc
123
123
 
124
124
  # do not parse the header through Nokogiri, since it will contain
125
125
  # non-XML like <![if !supportFootnotes]>
126
- def self.header_image_cleanup(doc, dir, filename, localdir)
126
+ def header_image_cleanup(doc, dir, filename, localdir)
127
127
  doc.split(%r{(<img [^>]*>|<v:imagedata [^>]*>)}).each_slice(2).map do |a|
128
128
  header_image_cleanup1(a, dir, filename, localdir)
129
129
  end.join
130
130
  end
131
131
 
132
- def self.header_image_cleanup1(a, dir, _filename, localdir)
132
+ def header_image_cleanup1(a, dir, _filename, localdir)
133
133
  if a.size == 2 && !(/ src="https?:/.match a[1]) &&
134
134
  !(%r{ src="data:(image|application)/[^;]+;base64}.match a[1])
135
135
  m = / src=['"](?<src>[^"']+)['"]/.match a[1]
@@ -141,7 +141,7 @@ module Html2Doc
141
141
  a.join
142
142
  end
143
143
 
144
- def self.generate_filelist(filename, dir)
144
+ def generate_filelist(filename, dir)
145
145
  File.open(File.join(dir, "filelist.xml"), "w") do |f|
146
146
  f.write %{<xml xmlns:o="urn:schemas-microsoft-com:office:office">
147
147
  <o:MainFile HRef="../#{filename}.htm"/>}
@@ -1,7 +1,7 @@
1
1
  require "uuidtools"
2
2
 
3
- module Html2Doc
4
- def self.footnotes(docxml)
3
+ class Html2Doc
4
+ def footnotes(docxml)
5
5
  i = 1
6
6
  fn = []
7
7
  docxml.xpath("//a").each do |a|
@@ -12,7 +12,7 @@ module Html2Doc
12
12
  process_footnote_texts(docxml, fn)
13
13
  end
14
14
 
15
- def self.process_footnote_texts(docxml, footnotes)
15
+ def process_footnote_texts(docxml, footnotes)
16
16
  body = docxml.at("//body")
17
17
  list = body.add_child("<div style='mso-element:footnote-list'/>")
18
18
  footnotes.each_with_index do |f, i|
@@ -23,7 +23,7 @@ module Html2Doc
23
23
  footnote_cleanup(docxml)
24
24
  end
25
25
 
26
- def self.footnote_div_to_p(elem)
26
+ def footnote_div_to_p(elem)
27
27
  if %w{div aside}.include? elem.name
28
28
  if elem.at(".//p")
29
29
  elem.replace(elem.children)
@@ -37,7 +37,7 @@ module Html2Doc
37
37
  FN = "<span class='MsoFootnoteReference'>"\
38
38
  "<span style='mso-special-character:footnote'/></span>".freeze
39
39
 
40
- def self.footnote_container(docxml, idx)
40
+ def footnote_container(docxml, idx)
41
41
  ref = docxml&.at("//a[@href='#_ftn#{idx}']")&.children&.to_xml(indent: 0)
42
42
  &.gsub(/>\n</, "><") || FN
43
43
  <<~DIV
@@ -47,7 +47,7 @@ module Html2Doc
47
47
  DIV
48
48
  end
49
49
 
50
- def self.process_footnote_link(docxml, elem, idx, footnote)
50
+ def process_footnote_link(docxml, elem, idx, footnote)
51
51
  return false unless footnote?(elem)
52
52
 
53
53
  href = elem["href"].gsub(/^#/, "")
@@ -62,7 +62,7 @@ module Html2Doc
62
62
  footnote << transform_footnote_text(note)
63
63
  end
64
64
 
65
- def self.process_footnote_link1(elem)
65
+ def process_footnote_link1(elem)
66
66
  elem.children.each do |c|
67
67
  if c.name == "span" && c["class"] == "MsoFootnoteReference"
68
68
  c.replace(FN)
@@ -72,7 +72,7 @@ module Html2Doc
72
72
  end
73
73
  end
74
74
 
75
- def self.transform_footnote_text(note)
75
+ def transform_footnote_text(note)
76
76
  note["id"] = ""
77
77
  note.xpath(".//div").each { |div| div.replace(div.children) }
78
78
  note.xpath(".//aside | .//p").each do |p|
@@ -82,12 +82,12 @@ module Html2Doc
82
82
  note.remove
83
83
  end
84
84
 
85
- def self.footnote?(elem)
85
+ def footnote?(elem)
86
86
  elem["epub:type"]&.casecmp("footnote")&.zero? ||
87
87
  elem["class"]&.casecmp("footnote")&.zero?
88
88
  end
89
89
 
90
- def self.set_footnote_link_attrs(elem, idx)
90
+ def set_footnote_link_attrs(elem, idx)
91
91
  elem["style"] = "mso-footnote-id:ftn#{idx}"
92
92
  elem["href"] = "#_ftn#{idx}"
93
93
  elem["name"] = "_ftnref#{idx}"
@@ -99,7 +99,7 @@ module Html2Doc
99
99
  # to p). We do not expect any <a name> or links back to text; if they
100
100
  # are present in the HTML, they need to have been cleaned out before
101
101
  # passing to this gem
102
- def self.footnote_cleanup(docxml)
102
+ def footnote_cleanup(docxml)
103
103
  docxml.xpath('//div[@style="mso-element:footnote"]/a')
104
104
  .each do |x|
105
105
  n = x.next_element
@@ -1,3 +1,3 @@
1
- module Html2Doc
2
- VERSION = "1.3.1".freeze
1
+ class Html2Doc
2
+ VERSION = "1.4.0".freeze
3
3
  end
@@ -76,7 +76,7 @@ WORD_FTR1 = <<~FTR.freeze
76
76
  Content-ID: <filelist.xml>
77
77
  Content-Disposition: inline; filename="filelist.xml"
78
78
  Content-Transfer-Encoding: base64
79
- Content-Type: #{Html2Doc::mime_type('filelist.xml')}
79
+ Content-Type: #{Html2Doc.new({}).mime_type('filelist.xml')}
80
80
 
81
81
  PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
82
82
  ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9ImZp
@@ -90,7 +90,7 @@ WORD_FTR2 = <<~FTR.freeze
90
90
  Content-ID: <filelist.xml>
91
91
  Content-Disposition: inline; filename="filelist.xml"
92
92
  Content-Transfer-Encoding: base64
93
- Content-Type: #{Html2Doc::mime_type('filelist.xml')}
93
+ Content-Type: #{Html2Doc.new({}).mime_type('filelist.xml')}
94
94
  PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
95
95
  ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9ImZp
96
96
  bGVsaXN0LnhtbCIvPgogIDxvOkZpbGUgSFJlZj0iaGVhZGVyLmh0bWwiLz4KPC94bWw+Cg==
@@ -102,7 +102,7 @@ WORD_FTR3 = <<~FTR.freeze
102
102
  Content-ID: <filelist.xml>
103
103
  Content-Disposition: inline; filename="filelist.xml"
104
104
  Content-Transfer-Encoding: base64
105
- Content-Type: #{Html2Doc::mime_type('filelist.xml')}
105
+ Content-Type: #{Html2Doc.new({}).mime_type('filelist.xml')}
106
106
 
107
107
  PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
108
108
  ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9IjFh
@@ -278,18 +278,18 @@ RSpec.describe Html2Doc do
278
278
  end
279
279
 
280
280
  it "preserves Word HTML directives" do
281
- Html2Doc.process(html_input(%[A<!--[if gte mso 9]>X<![endif]-->B]), filename: "test")
281
+ Html2Doc.new(filename: "test").process(html_input(%[A<!--[if gte mso 9]>X<![endif]-->B]))
282
282
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
283
283
  .to match_fuzzy(<<~OUTPUT)
284
284
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
285
285
  #{word_body(%{A<!--[if gte mso 9]>X<![endif]-->B},
286
- '<div style="mso-element:footnote-list"/>')}
286
+ '<div style="mso-element:footnote-list"/>')}
287
287
  #{WORD_FTR1}
288
288
  OUTPUT
289
289
  end
290
290
 
291
291
  it "processes a blank document" do
292
- Html2Doc.process(html_input(""), filename: "test")
292
+ Html2Doc.new(filename: "test").process(html_input(""))
293
293
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
294
294
  .to match_fuzzy(<<~OUTPUT)
295
295
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -299,15 +299,15 @@ RSpec.describe Html2Doc do
299
299
 
300
300
  it "removes any temp files" do
301
301
  File.delete("test.doc")
302
- Html2Doc.process(html_input(""), filename: "test")
302
+ Html2Doc.new(filename: "test").process(html_input(""))
303
303
  expect(File.exist?("test.doc")).to be true
304
304
  expect(File.exist?("test.htm")).to be false
305
305
  expect(File.exist?("test_files")).to be false
306
306
  end
307
307
 
308
308
  it "processes a stylesheet in an HTML document with a title" do
309
- Html2Doc.process(html_input(""),
310
- filename: "test", stylesheet: "lib/html2doc/wordstyle.css")
309
+ Html2Doc.new(filename: "test", stylesheet: "lib/html2doc/wordstyle.css")
310
+ .process(html_input(""))
311
311
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
312
312
  .to match_fuzzy(<<~OUTPUT)
313
313
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -316,9 +316,11 @@ RSpec.describe Html2Doc do
316
316
  end
317
317
 
318
318
  it "processes a stylesheet in an HTML document without a title" do
319
- Html2Doc.process(html_input_no_title(""),
320
- filename: "test", stylesheet: "lib/html2doc/wordstyle.css")
321
- expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
319
+ Html2Doc.new(filename: "test",
320
+ stylesheet: "lib/html2doc/wordstyle.css")
321
+ .process(html_input_no_title(""))
322
+ expect(guid_clean(File.read("test.doc",
323
+ encoding: "utf-8")))
322
324
  .to match_fuzzy(<<~OUTPUT)
323
325
  #{WORD_HDR.sub('<title>blank</title>', '')}
324
326
  #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -327,12 +329,14 @@ RSpec.describe Html2Doc do
327
329
  end
328
330
 
329
331
  it "processes a stylesheet in an HTML document with an empty head" do
330
- Html2Doc.process(html_input_empty_head(""),
331
- filename: "test", stylesheet: "lib/html2doc/wordstyle.css")
332
+ Html2Doc.new(filename: "test",
333
+ stylesheet: "lib/html2doc/wordstyle.css")
334
+ .process(html_input_empty_head(""))
332
335
  word_hdr_end = WORD_HDR_END
333
336
  .sub(%(<meta name="Originator" content="Me"/>\n), "")
334
337
  .sub("</style>\n</head>", "</style></head>")
335
- expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
338
+ expect(guid_clean(File.read("test.doc",
339
+ encoding: "utf-8")))
336
340
  .to match_fuzzy(<<~OUTPUT)
337
341
  #{WORD_HDR.sub('<title>blank</title>', '')}
338
342
  #{DEFAULT_STYLESHEET}
@@ -342,8 +346,9 @@ RSpec.describe Html2Doc do
342
346
  end
343
347
 
344
348
  it "processes a header" do
345
- Html2Doc.process(html_input(""),
346
- filename: "test", header_file: "spec/header.html")
349
+ Html2Doc.new(filename: "test",
350
+ header_file: "spec/header.html")
351
+ .process(html_input(""))
347
352
  html = guid_clean(File.read("test.doc", encoding: "utf-8"))
348
353
  hdr = Base64.decode64(
349
354
  html
@@ -365,8 +370,9 @@ RSpec.describe Html2Doc do
365
370
  end
366
371
 
367
372
  it "processes a header with an image" do
368
- Html2Doc.process(html_input(""),
369
- filename: "test", header_file: "spec/header_img.html")
373
+ Html2Doc.new(filename: "test",
374
+ header_file: "spec/header_img.html")
375
+ .process(html_input(""))
370
376
  doc = guid_clean(File.read("test.doc", encoding: "utf-8"))
371
377
  expect(doc).to match(%r{Content-Type: image/png})
372
378
  expect(doc).to match(%r{iVBORw0KGgoAAAANSUhEUgAAA5cAAAN7CAYAAADRE24cAAAgAElEQVR4XuydB5gUxdaGC65gTogB})
@@ -381,8 +387,9 @@ RSpec.describe Html2Doc do
381
387
  "19160-6.png"))),
382
388
  )
383
389
  end
384
- Html2Doc.process(html_input(""),
385
- filename: "test", header_file: "spec/header_img1.html")
390
+ Html2Doc.new(filename: "test",
391
+ header_file: "spec/header_img1.html")
392
+ .process(html_input(""))
386
393
  doc = guid_clean(File.read("test.doc", encoding: "utf-8"))
387
394
  expect(doc).to match(%r{Content-Type: image/png})
388
395
  expect(doc).to match(%r{iVBORw0KGgoAAAANSUhEUgAAA5cAAAN7CAYAAADRE24cAAAgAElEQVR4XuydB5gUxdaGC65gTogB})
@@ -391,7 +398,7 @@ RSpec.describe Html2Doc do
391
398
  it "processes a populated document" do
392
399
  simple_body = "<h1>Hello word!</h1>
393
400
  <div>This is a very simple document</div>"
394
- Html2Doc.process(html_input(simple_body), filename: "test")
401
+ Html2Doc.new(filename: "test").process(html_input(simple_body))
395
402
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
396
403
  .to match_fuzzy(<<~OUTPUT)
397
404
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -401,9 +408,11 @@ RSpec.describe Html2Doc do
401
408
  end
402
409
 
403
410
  it "processes AsciiMath" do
404
- Html2Doc.process(html_input(%[<div>{{sum_(i=1)^n i^3=((n(n+1))/2)^2 text("integer"))}}</div>]),
405
- filename: "test", asciimathdelims: ["{{", "}}"])
406
- expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
411
+ Html2Doc.new(filename: "test",
412
+ asciimathdelims: ["{{", "}}"])
413
+ .process(html_input(%[<div>{{sum_(i=1)^n i^3=((n(n+1))/2)^2 text("integer"))}}</div>]))
414
+ expect(guid_clean(File.read("test.doc",
415
+ encoding: "utf-8")))
407
416
  .to match_fuzzy(<<~OUTPUT)
408
417
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
409
418
  #{word_body(%{
@@ -416,8 +425,8 @@ RSpec.describe Html2Doc do
416
425
  end
417
426
 
418
427
  it "processes mstyle" do
419
- Html2Doc.process(html_input(%[<div>{{bb (-log_2 (p_u)) bb "BB" bbb "BBB" cc "CC" bcc "BCC" tt "TT" fr "FR" bfr "BFR" sf "SF" bsf "BSFα" sfi "SFI" sfbi "SFBIα" bii "BII" ii "II"}}</div>]),
420
- filename: "test", asciimathdelims: ["{{", "}}"])
428
+ Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
429
+ .process(html_input(%[<div>{{bb (-log_2 (p_u)) bb "BB" bbb "BBB" cc "CC" bcc "BCC" tt "TT" fr "FR" bfr "BFR" sf "SF" bsf "BSFα" sfi "SFI" sfbi "SFBIα" bii "BII" ii "II"}}</div>]))
421
430
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
422
431
  .to match_fuzzy(<<~OUTPUT)
423
432
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -431,8 +440,8 @@ RSpec.describe Html2Doc do
431
440
  end
432
441
 
433
442
  it "processes spaces in AsciiMath" do
434
- Html2Doc.process(html_input(%[<div>{{text " integer ")}}</div>]),
435
- filename: "test", asciimathdelims: ["{{", "}}"])
443
+ Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
444
+ .process(html_input(%[<div>{{text " integer ")}}</div>]))
436
445
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
437
446
  .to match_fuzzy(<<~OUTPUT)
438
447
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -446,10 +455,10 @@ RSpec.describe Html2Doc do
446
455
  end
447
456
 
448
457
  it "processes spaces in MathML mtext" do
449
- Html2Doc.process(html_input("<div><math xmlns='http://www.w3.org/1998/Math/MathML'>
458
+ Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
459
+ .process(html_input("<div><math xmlns='http://www.w3.org/1998/Math/MathML'>
450
460
  <mrow><mi>H</mi><mtext> original </mtext><mi>J</mi></mrow>
451
- </math></div>"),
452
- filename: "test", asciimathdelims: ["{{", "}}"])
461
+ </math></div>"))
453
462
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
454
463
  .to match_fuzzy(<<~OUTPUT)
455
464
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -462,9 +471,10 @@ RSpec.describe Html2Doc do
462
471
  end
463
472
 
464
473
  it "unwraps and converts accent in MathML" do
465
- Html2Doc.process(html_input("<div><math xmlns='http://www.w3.org/1998/Math/MathML'>
474
+ Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
475
+ .process(html_input("<div><math xmlns='http://www.w3.org/1998/Math/MathML'>
466
476
  <mover accent='true'><mrow><mi>p</mi></mrow><mrow><mo>^</mo></mrow></mover>
467
- </math></div>"), filename: "test", asciimathdelims: ["{{", "}}"])
477
+ </math></div>"))
468
478
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
469
479
  .to match_fuzzy(<<~OUTPUT)
470
480
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -477,8 +487,8 @@ RSpec.describe Html2Doc do
477
487
  end
478
488
 
479
489
  it "left-aligns AsciiMath" do
480
- Html2Doc.process(html_input("<div style='text-align:left;'>{{sum_(i=1)^n i^3=((n(n+1))/2)^2}}</div>"),
481
- filename: "test", asciimathdelims: ["{{", "}}"])
490
+ Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
491
+ .process(html_input("<div style='text-align:left;'>{{sum_(i=1)^n i^3=((n(n+1))/2)^2}}</div>"))
482
492
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
483
493
  .to match_fuzzy(<<~OUTPUT)
484
494
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -492,9 +502,11 @@ RSpec.describe Html2Doc do
492
502
  end
493
503
 
494
504
  it "right-aligns AsciiMath" do
495
- Html2Doc.process(html_input("<div style='text-align:right;'>{{sum_(i=1)^n i^3=((n(n+1))/2)^2}}</div>"),
496
- filename: "test", asciimathdelims: ["{{", "}}"])
497
- expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
505
+ Html2Doc.new(filename: "test",
506
+ asciimathdelims: ["{{", "}}"])
507
+ .process(html_input("<div style='text-align:right;'>{{sum_(i=1)^n i^3=((n(n+1))/2)^2}}</div>"))
508
+ expect(guid_clean(File.read("test.doc",
509
+ encoding: "utf-8")))
498
510
  .to match_fuzzy(<<~OUTPUT)
499
511
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
500
512
  #{word_body(%{
@@ -509,21 +521,21 @@ RSpec.describe Html2Doc do
509
521
  it "raises error in processing of broken AsciiMath" do
510
522
  begin
511
523
  expect do
512
- Html2Doc.process(html_input(%[<div style='text-align:right;'>{{u_c = 6.6"unitsml(kHz)}}</div>]),
513
- filename: "test", asciimathdelims: ["{{", "}}"])
524
+ Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
525
+ .process(html_input(%[<div style='text-align:right;'>{{u_c = 6.6"unitsml(kHz)}}</div>]))
514
526
  end.to output('parsing: u_c = 6.6"unitsml(kHz)').to_stderr
515
527
  rescue StandardError
516
528
  end
517
529
  expect do
518
- Html2Doc.process(html_input(%[<div style='text-align:right;'>{{u_c = 6.6"unitsml(kHz)}}</div>]),
519
- filename: "test", asciimathdelims: ["{{", "}}"])
530
+ Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
531
+ .process(html_input(%[<div style='text-align:right;'>{{u_c = 6.6"unitsml(kHz)}}</div>]))
520
532
  end.to raise_error(StandardError)
521
533
  end
522
534
 
523
535
  it "wraps msup after munderover in MathML" do
524
- Html2Doc.process(html_input("<div><math xmlns='http://www.w3.org/1998/Math/MathML'>
525
- <munderover><mo>&#x2211;</mo><mrow><mi>i</mi><mo>=</mo><mn>0</mn></mrow><mrow><mi>n</mi></mrow></munderover><msup><mn>2</mn><mrow><mi>i</mi></mrow></msup></math></div>"),
526
- filename: "test", asciimathdelims: ["{{", "}}"])
536
+ Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
537
+ .process(html_input("<div><math xmlns='http://www.w3.org/1998/Math/MathML'>
538
+ <munderover><mo>&#x2211;</mo><mrow><mi>i</mi><mo>=</mo><mn>0</mn></mrow><mrow><mi>n</mi></mrow></munderover><msup><mn>2</mn><mrow><mi>i</mi></mrow></msup></math></div>"))
527
539
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
528
540
  .to match_fuzzy(<<~OUTPUT)
529
541
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -537,7 +549,7 @@ RSpec.describe Html2Doc do
537
549
  it "processes tabs" do
538
550
  simple_body = "<h1>Hello word!</h1>
539
551
  <div>This is a very &tab; simple document</div>"
540
- Html2Doc.process(html_input(simple_body), filename: "test")
552
+ Html2Doc.new(filename: "test").process(html_input(simple_body))
541
553
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
542
554
  .to match_fuzzy(<<~OUTPUT)
543
555
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -550,7 +562,7 @@ RSpec.describe Html2Doc do
550
562
  simple_body = '<h1>Hello word!</h1>
551
563
  <p>This is a very simple document</p>
552
564
  <p class="x">This style stays</p>'
553
- Html2Doc.process(html_input(simple_body), filename: "test")
565
+ Html2Doc.new(filename: "test").process(html_input(simple_body))
554
566
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
555
567
  .to match_fuzzy(<<~OUTPUT)
556
568
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -565,7 +577,7 @@ RSpec.describe Html2Doc do
565
577
  <li>This is a very simple document</li>
566
578
  <li class="x">This style stays</li>
567
579
  </ul>'
568
- Html2Doc.process(html_input(simple_body), filename: "test")
580
+ Html2Doc.new(filename: "test").process(html_input(simple_body))
569
581
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
570
582
  .to match_fuzzy(<<~OUTPUT)
571
583
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -576,8 +588,8 @@ RSpec.describe Html2Doc do
576
588
 
577
589
  it "resizes images for height, in a file in a subdirectory" do
578
590
  simple_body = '<img src="19160-6.png">'
579
- Html2Doc.process(html_input(simple_body), filename: "spec/test",
580
- imagedir: "spec")
591
+ Html2Doc.new(filename: "spec/test", imagedir: "spec")
592
+ .process(html_input(simple_body))
581
593
  testdoc = File.read("spec/test.doc", encoding: "utf-8")
582
594
  expect(testdoc).to match(%r{Content-Type: image/png})
583
595
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
@@ -589,7 +601,8 @@ RSpec.describe Html2Doc do
589
601
 
590
602
  it "resizes images for width" do
591
603
  simple_body = '<img src="spec/19160-7.gif">'
592
- Html2Doc.process(html_input(simple_body), filename: "test", imagedir: ".")
604
+ Html2Doc.new(filename: "test", imagedir: ".")
605
+ .process(html_input(simple_body))
593
606
  testdoc = File.read("test.doc", encoding: "utf-8")
594
607
  expect(testdoc).to match(%r{Content-Type: image/gif})
595
608
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
@@ -601,7 +614,8 @@ RSpec.describe Html2Doc do
601
614
 
602
615
  it "resizes images for height" do
603
616
  simple_body = '<img src="spec/19160-8.jpg">'
604
- Html2Doc.process(html_input(simple_body), filename: "test", imagedir: ".")
617
+ Html2Doc.new(filename: "test", imagedir: ".")
618
+ .process(html_input(simple_body))
605
619
  testdoc = File.read("test.doc", encoding: "utf-8")
606
620
  expect(testdoc).to match(%r{Content-Type: image/jpeg})
607
621
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
@@ -613,48 +627,49 @@ RSpec.describe Html2Doc do
613
627
 
614
628
  it "resizes images with missing or auto sizes" do
615
629
  image = { "src" => "spec/19160-8.jpg" }
616
- expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
630
+ expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
617
631
  .to eq [30, 100]
618
632
  image["width"] = "20"
619
- expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
633
+ expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
620
634
  .to eq [20, 65]
621
635
  image.delete("width")
622
636
  image["height"] = "50"
623
- expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
637
+ expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
624
638
  .to eq [15, 50]
625
639
  image.delete("height")
626
640
  image["width"] = "500"
627
- expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
641
+ expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
628
642
  .to eq [30, 100]
629
643
  image.delete("width")
630
644
  image["height"] = "500"
631
- expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
645
+ expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
632
646
  .to eq [30, 100]
633
647
  image["width"] = "20"
634
648
  image["height"] = "auto"
635
- expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
649
+ expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
636
650
  .to eq [20, 65]
637
651
  image["width"] = "auto"
638
652
  image["height"] = "50"
639
- expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
653
+ expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
640
654
  .to eq [15, 50]
641
655
  image["width"] = "500"
642
656
  image["height"] = "auto"
643
- expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
657
+ expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
644
658
  .to eq [30, 100]
645
659
  image["width"] = "auto"
646
660
  image["height"] = "500"
647
- expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
661
+ expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
648
662
  .to eq [30, 100]
649
663
  image["width"] = "auto"
650
664
  image["height"] = "auto"
651
- expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
665
+ expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
652
666
  .to eq [30, 100]
653
667
  end
654
668
 
655
669
  it "does not move images if they are external URLs" do
656
670
  simple_body = '<img src="https://example.com/19160-6.png">'
657
- Html2Doc.process(html_input(simple_body), filename: "test", imagedir: ".")
671
+ Html2Doc.new(filename: "test", imagedir: ".")
672
+ .process(html_input(simple_body))
658
673
  testdoc = File.read("test.doc", encoding: "utf-8")
659
674
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
660
675
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -665,8 +680,8 @@ RSpec.describe Html2Doc do
665
680
 
666
681
  it "deals with absolute image locations" do
667
682
  simple_body = %{<img src="#{__dir__}/19160-6.png">}
668
- Html2Doc.process(html_input(simple_body), filename: "spec/test",
669
- imagedir: ".")
683
+ Html2Doc.new(filename: "spec/test", imagedir: ".")
684
+ .process(html_input(simple_body))
670
685
  testdoc = File.read("spec/test.doc", encoding: "utf-8")
671
686
  expect(testdoc).to match(%r{Content-Type: image/png})
672
687
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
@@ -687,7 +702,7 @@ RSpec.describe Html2Doc do
687
702
  document<a epub:type="footnote" href="#a1">1</a> allegedly<a epub:type="footnote" href="#a2">2</a></div>
688
703
  <aside id="a1">Footnote</aside>
689
704
  <aside id="a2">Other Footnote</aside>'
690
- Html2Doc.process(html_input(simple_body), filename: "test")
705
+ Html2Doc.new(filename: "test").process(html_input(simple_body))
691
706
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
692
707
  .to match_fuzzy(<<~OUTPUT)
693
708
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -707,7 +722,7 @@ RSpec.describe Html2Doc do
707
722
  document<a class="footnote" href="#a1">1</a> allegedly<a class="footnote" href="#a2">2</a></div>
708
723
  <aside id="a1">Footnote</aside>
709
724
  <aside id="a2">Other Footnote</aside>'
710
- Html2Doc.process(html_input(simple_body), filename: "test")
725
+ Html2Doc.new(filename: "test").process(html_input(simple_body))
711
726
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
712
727
  .to match_fuzzy(<<~OUTPUT)
713
728
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -727,7 +742,7 @@ RSpec.describe Html2Doc do
727
742
  document<a class="footnote" href="#a1">(<span class="MsoFootnoteReference">1</span>)</a> allegedly<a class="footnote" href="#a2">2</a></div>
728
743
  <aside id="a1">Footnote</aside>
729
744
  <aside id="a2">Other Footnote</aside>'
730
- Html2Doc.process(html_input(simple_body), filename: "test")
745
+ Html2Doc.new(filename: "test").process(html_input(simple_body))
731
746
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
732
747
  .to match_fuzzy(<<~OUTPUT)
733
748
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -747,7 +762,7 @@ RSpec.describe Html2Doc do
747
762
  document<a class="footnote" href="#a1">1</a> allegedly<a class="footnote" href="#a2">2</a></div>
748
763
  <aside id="a1"><p>Footnote</p></aside>
749
764
  <div id="a2"><p>Other Footnote</p></div>'
750
- Html2Doc.process(html_input(simple_body), filename: "test")
765
+ Html2Doc.new(filename: "test").process(html_input(simple_body))
751
766
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
752
767
  .to match_fuzzy(<<~OUTPUT)
753
768
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -767,8 +782,8 @@ RSpec.describe Html2Doc do
767
782
  <div><ul id="0">
768
783
  <li><div><p><ol id="1"><li><ul id="2"><li><p><ol id="3"><li><ol id="4"><li>A</li><li><p>B</p><p>B2</p></li><li>C</li></ol></li></ol></p></li></ul></li></ol></p></div></li><div><ul id="5"><li>C</li></ul></div>
769
784
  BODY
770
- Html2Doc.process(html_input(simple_body),
771
- filename: "test", liststyles: { ul: "l1", ol: "l2" })
785
+ Html2Doc.new(filename: "test", liststyles: { ul: "l1", ol: "l2" })
786
+ .process(html_input(simple_body))
772
787
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
773
788
  .to match_fuzzy(<<~OUTPUT)
774
789
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -786,8 +801,8 @@ RSpec.describe Html2Doc do
786
801
  <ol id="1"><li><div><p><ol id="2"><li><ul id="3"><li><p><ol id="4"><li><ol id="5"><li>A</li></ol></li></ol></p></li></ul></li></ol></p></div></li></ol>
787
802
  <ol id="6"><li><div><p><ol id="7"><li><ul id="8"><li><p><ol id="9"><li><ol id="10"><li>A</li></ol></li></ol></p></li></ul></li></ol></p></div></li></ol></div>
788
803
  BODY
789
- Html2Doc.process(html_input(simple_body),
790
- filename: "test", liststyles: { ul: "l1", ol: "l2" })
804
+ Html2Doc.new(filename: "test", liststyles: { ul: "l1", ol: "l2" })
805
+ .process(html_input(simple_body))
791
806
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
792
807
  .to match_fuzzy(<<~OUTPUT)
793
808
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -808,9 +823,10 @@ RSpec.describe Html2Doc do
808
823
  <div><ul class="other" id="10">
809
824
  <li><div><p><ol id="11"><li><ul id="12"><li><p><ol id="13"><li><ol id="14"><li>A</li><li><p>B</p><p>B2</p></li><li>C</li></ol></li></ol></p></li></ul></li></ol></p></div></li></ul></div>
810
825
  BODY
811
- Html2Doc.process(html_input(simple_body),
812
- filename: "test",
813
- liststyles: { ul: "l1", ol: "l2", steps: "l3" })
826
+ Html2Doc.new(filename: "test",
827
+ liststyles: { ul: "l1", ol: "l2",
828
+ steps: "l3" })
829
+ .process(html_input(simple_body))
814
830
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
815
831
  .to match_fuzzy(<<~OUTPUT)
816
832
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -832,8 +848,8 @@ RSpec.describe Html2Doc do
832
848
  <p id="b"/>
833
849
  </div>
834
850
  BODY
835
- Html2Doc.process(html_input(simple_body),
836
- filename: "test", liststyles: { ul: "l1", ol: "l2" })
851
+ Html2Doc.new(filename: "test", liststyles: { ul: "l1", ol: "l2" })
852
+ .process(html_input(simple_body))
837
853
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
838
854
  .to match_fuzzy(<<~OUTPUT)
839
855
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -848,8 +864,8 @@ RSpec.describe Html2Doc do
848
864
 
849
865
  it "test image base64 image encoding" do
850
866
  simple_body = '<img src="19160-6.png">'
851
- Html2Doc.process(html_input(simple_body),
852
- filename: "spec/test", debug: true, imagedir: "spec")
867
+ Html2Doc.new(filename: "spec/test", debug: true, imagedir: "spec")
868
+ .process(html_input(simple_body))
853
869
  testdoc = File.read("spec/test.doc", encoding: "utf-8")
854
870
  base64_image = testdoc[/image\/png\n\n(.*?)\n\n----/m, 1].gsub!("\n", "")
855
871
  base64_image_basename = testdoc[%r{Content-ID: <([0-9a-z\-]+)\.png}m, 1]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.1
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-02-06 00:00:00.000000000 Z
11
+ date: 2022-05-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: asciimath
@@ -334,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
334
334
  - !ruby/object:Gem::Version
335
335
  version: '0'
336
336
  requirements: []
337
- rubygems_version: 3.2.32
337
+ rubygems_version: 3.3.9
338
338
  signing_key:
339
339
  specification_version: 4
340
340
  summary: Convert HTML document to Microsoft Word document