html2doc 1.5.4 → 1.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 74b05f46f1fd365f9ff0766e95d884bd2959c01b92c70d4a080651adfc2e8d3c
4
- data.tar.gz: f70eb009e705ff767b34922fc0444740be8dde80da8b78c503784e02be0e4560
3
+ metadata.gz: 47535bf46876ee49a732b6c136f78b58a9ac009f880b95c5a73c8770293f3735
4
+ data.tar.gz: a052e0c3ba3ee27ca208b2624d7a832cba67bcc959f1ec5da36f9a7049c26c35
5
5
  SHA512:
6
- metadata.gz: e3d93501d63bd27ed6e5245cb18dbc49013fcecd83bc57acf3a5d3c797636b928b91e148e33e8326f10f77f9b94a7175d85294eb86a1b4b2261aafb7dfe9d7a4
7
- data.tar.gz: 4cbb8887089e622b9d9d1fd82dc4e5fd4e8e81a28a59dcf02ccce22cb3c9e6e7c4c7802177259557c268d697ced17ec09e4181e82c0dc851a613553e7f5b58c1
6
+ metadata.gz: '096dc5a7fe4b35e5afdec632f37b28f9980fcbcba4222e1ec1eb81fe4653a62cc00c5d9b90ed38ae54d4320ea8bf7e0fd0698625045504d371bcb90fb6247a54'
7
+ data.tar.gz: 7aeebef3892dc2273bc4ab9899624fc113b1989c3af097204b4f73eb250a7d52e8b3cfe62439e84fb179c1e9bbc96563af669d87f56b15dd82b4fc99953a2227
data/lib/html2doc/base.rb CHANGED
@@ -30,8 +30,7 @@ class Html2Doc
30
30
  end
31
31
 
32
32
  def process_header(headerfile)
33
- return if headerfile.nil?
34
-
33
+ headerfile.nil? and return
35
34
  doc = File.read(headerfile, encoding: "utf-8")
36
35
  doc = header_image_cleanup(doc, @dir1, @filename,
37
36
  File.dirname(@filename))
@@ -54,7 +53,7 @@ class Html2Doc
54
53
  end
55
54
 
56
55
  def process_html(result)
57
- docxml = to_xhtml(asciimath_to_mathml(result, @asciimathdelims))
56
+ docxml = to_xhtml(result)
58
57
  define_head(cleanup(docxml))
59
58
  msword_fix(from_xhtml(docxml))
60
59
  end
@@ -66,6 +65,7 @@ class Html2Doc
66
65
  end
67
66
 
68
67
  def cleanup(docxml)
68
+ locate_landscape(docxml)
69
69
  namespace(docxml.root)
70
70
  image_cleanup(docxml, @dir1, @imagedir)
71
71
  mathml_to_ooml(docxml)
@@ -76,76 +76,11 @@ class Html2Doc
76
76
  docxml
77
77
  end
78
78
 
79
- NOKOHEAD = <<~HERE.freeze
80
- <!DOCTYPE html SYSTEM
81
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
82
- <html xmlns="http://www.w3.org/1999/xhtml">
83
- <head> <title></title> <meta charset="UTF-8" /> </head>
84
- <body> </body> </html>
85
- HERE
86
-
87
- def to_xhtml(xml)
88
- xml.gsub!(/<\?xml[^>]*>/, "")
89
- unless /<!DOCTYPE /.match? xml
90
- xml = '<!DOCTYPE html SYSTEM
91
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
92
- end
93
- xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
94
- .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
95
- Nokogiri::XML.parse(xml)
96
- end
97
-
98
- DOCTYPE = <<~"DOCTYPE".freeze
99
- <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
100
- DOCTYPE
101
-
102
- def from_xhtml(xml)
103
- xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
104
- .sub(DOCTYPE, "").gsub(%{ />}, "/>")
105
- .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
106
- .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
107
- .gsub("\n--&gt;\n", "\n-->\n")
108
- end
109
-
110
- def msword_fix(doc)
111
- # brain damage in MSWord parser
112
- doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
113
- "<w:DoNotOptimizeForBrowser/>")
114
- doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
115
- '<span style="mso-special-character:footnote"></span>')
116
- doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
117
- '<div style="mso-element:footnote-list"/>')
118
- doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
119
- doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
120
- doc.gsub!(%r{<meta http-equiv="Content-Type"},
121
- "<meta http-equiv=Content-Type")
122
- doc.gsub!(%r{></m:jc>}, "/>")
123
- doc.gsub!(%r{></v:stroke>}, "/>")
124
- doc.gsub!(%r{></v:f>}, "/>")
125
- doc.gsub!(%r{></v:path>}, "/>")
126
- doc.gsub!(%r{></o:lock>}, "/>")
127
- doc.gsub!(%r{></v:imagedata>}, "/>")
128
- doc.gsub!(%r{></w:wrap>}, "/>")
129
- doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
130
- doc.gsub!(%r{&tab;|&amp;tab;},
131
- '<span style="mso-tab-count:1">&#xA0; </span>')
132
- doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
133
- a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
134
- a
135
- end.join
136
- end
137
-
138
- PRINT_VIEW = <<~XML.freeze
139
-
140
- <xml>
141
- <w:WordDocument>
142
- <w:View>Print</w:View>
143
- <w:Zoom>100</w:Zoom>
144
- <w:DoNotOptimizeForBrowser/>
145
- </w:WordDocument>
146
- </xml>
147
- <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
148
- XML
79
+ def locate_landscape(_docxml)
80
+ css = read_stylesheet(@stylesheet)
81
+ @landscape = css.scan(/div\.\S+\s+\{\s*page:\s*[^;]+L;\s*\}/m)
82
+ .map { |e| e.sub(/^div\.(\S+).*$/m, "\\1") }
83
+ end
149
84
 
150
85
  def define_head1(docxml, _dir)
151
86
  docxml.xpath("//*[local-name() = 'head']").each do |h|
@@ -174,7 +109,6 @@ class Html2Doc
174
109
  # xml.children.first << Nokogiri::XML::Comment.new(xml, s)
175
110
  xml.children.first << Nokogiri::XML::CDATA
176
111
  .new(xml, "\n<!--\n#{stylesheet}\n-->\n")
177
-
178
112
  xml.root.to_s
179
113
  end
180
114
 
@@ -199,30 +133,15 @@ class Html2Doc
199
133
  head.add_child css
200
134
  elsif title.nil?
201
135
  head.children.first.add_previous_sibling css
202
- else
203
- title.add_next_sibling css
136
+ else title.add_next_sibling css
204
137
  end
205
138
  end
206
139
 
207
- def namespace(root)
208
- {
209
- o: "urn:schemas-microsoft-com:office:office",
210
- w: "urn:schemas-microsoft-com:office:word",
211
- v: "urn:schemas-microsoft-com:vml",
212
- m: "http://schemas.microsoft.com/office/2004/12/omml",
213
- }.each { |k, v| root.add_namespace_definition(k.to_s, v) }
214
- end
215
-
216
- def rootnamespace(root)
217
- root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
218
- end
219
-
220
140
  def bookmarks(docxml)
221
141
  docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
222
142
  .each do |x|
223
- next if x["id"].empty? ||
224
- %w(shapetype v:shapetype shape v:shape).include?(x.name)
225
-
143
+ (x["id"].empty? ||
144
+ %w(shapetype v:shapetype shape v:shape).include?(x.name)) and next
226
145
  if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
227
146
  else x.children.first.previous = "<a name='#{x['id']}'></a>"
228
147
  end
data/lib/html2doc/math.rb CHANGED
@@ -5,29 +5,6 @@ require "nokogiri"
5
5
  require "plane1converter"
6
6
 
7
7
  class Html2Doc
8
- def asciimath_to_mathml1(expr, retain_asciimath)
9
- ret = Plurimath::Math.parse(HTMLEntities.new.decode(expr), "asciimath").to_mathml
10
- .gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>").strip
11
- retain_asciimath and
12
- ret += "<asciimath>#{@c.encode(@c.decode(expr), :basic)}</asciimath>"
13
- ret
14
- rescue StandardError => e
15
- puts "parsing: #{expr}"
16
- puts e.message
17
- raise e
18
- end
19
-
20
- def asciimath_to_mathml(doc, delims, retain_asciimath: false)
21
- return doc if delims.nil? || delims.size < 2
22
-
23
- m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
24
- m.each_slice(4).map.with_index do |(*a), i|
25
- progress_conv(i, 500, (m.size / 4).floor, 1000, "AsciiMath")
26
- a[2].nil? or a[2] = asciimath_to_mathml1(a[2], retain_asciimath)
27
- a.size > 1 ? a[0] + a[2] : a[0]
28
- end.join
29
- end
30
-
31
8
  def progress_conv(idx, step, total, threshold, msg)
32
9
  return unless (idx % step).zero? && total > threshold && idx.positive?
33
10
 
data/lib/html2doc/mime.rb CHANGED
@@ -78,8 +78,7 @@ class Html2Doc
78
78
 
79
79
  def image_resize(img, path, maxheight, maxwidth)
80
80
  s, realsize = get_image_size(img, path)
81
- return s if s[0] == nil && s[1] == nil
82
-
81
+ s[0] == nil && s[1] == nil and return s
83
82
  if img.name == "svg" && !img["viewBox"]
84
83
  img["viewBox"] = "0 0 #{s[0]} #{s[1]}"
85
84
  end
@@ -118,12 +117,24 @@ class Html2Doc
118
117
  docxml.traverse do |i|
119
118
  skip_image_cleanup?(i) and next
120
119
  local_filename = rename_image(i, dir, localdir)
121
- i["width"], i["height"] = image_resize(i, local_filename, maxheight,
122
- maxwidth)
120
+ i["width"], i["height"] =
121
+ if landscape?(i)
122
+ image_resize(i, local_filename, maxwidth, maxheight)
123
+ else
124
+ image_resize(i, local_filename, maxheight, maxwidth)
125
+ end
123
126
  end
124
127
  docxml
125
128
  end
126
129
 
130
+ def landscape?(img)
131
+ img.ancestors.each do |a|
132
+ a.name == "div" or next
133
+ @landscape.include?(a["class"]) and return true
134
+ end
135
+ false
136
+ end
137
+
127
138
  def rename_image(img, dir, localdir)
128
139
  local_filename = localname(img["src"], localdir)
129
140
  new_filename = "#{mkuuid}#{File.extname(img['src'])}"
@@ -134,10 +145,9 @@ class Html2Doc
134
145
 
135
146
  def skip_image_cleanup?(img)
136
147
  src = img["src"]
137
- return true unless img.element? && %w(img v:imagedata).include?(img.name)
138
- return true if src.nil? || src.empty? || /^http/.match?(src) ||
139
- %r{^data:(image|application)/[^;]+;base64}.match?(src)
140
-
148
+ (img.element? && %w(img v:imagedata).include?(img.name)) or return true
149
+ (src.nil? || src.empty? || /^http/.match?(src) ||
150
+ %r{^data:(image|application)/[^;]+;base64}.match?(src)) and return true
141
151
  false
142
152
  end
143
153
 
@@ -222,8 +232,7 @@ class Html2Doc
222
232
  f.write %{<xml xmlns:o="urn:schemas-microsoft-com:office:office">
223
233
  <o:MainFile HRef="../#{filename}.htm"/>}
224
234
  Dir.entries(dir).sort.each do |item|
225
- next if item == "." || item == ".." || /^\./.match(item)
226
-
235
+ (item == "." || item == ".." || /^\./.match(item)) and next
227
236
  f.write %{ <o:File HRef="#{item}"/>\n}
228
237
  end
229
238
  f.write("</xml>\n")
@@ -1,3 +1,3 @@
1
1
  class Html2Doc
2
- VERSION = "1.5.4".freeze
2
+ VERSION = "1.6.0".freeze
3
3
  end
@@ -0,0 +1,83 @@
1
+ class Html2Doc
2
+ NOKOHEAD = <<~HERE.freeze
3
+ <!DOCTYPE html SYSTEM
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
5
+ <html xmlns="http://www.w3.org/1999/xhtml">
6
+ <head> <title></title> <meta charset="UTF-8" /> </head>
7
+ <body> </body> </html>
8
+ HERE
9
+
10
+ def to_xhtml(xml)
11
+ xml.gsub!(/<\?xml[^>]*>/, "")
12
+ unless /<!DOCTYPE /.match? xml
13
+ xml = '<!DOCTYPE html SYSTEM
14
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
15
+ end
16
+ xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
17
+ .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
18
+ Nokogiri::XML.parse(xml)
19
+ end
20
+
21
+ DOCTYPE = <<~DOCTYPE.freeze
22
+ <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
23
+ DOCTYPE
24
+
25
+ def from_xhtml(xml)
26
+ xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
27
+ .sub(DOCTYPE, "").gsub(%{ />}, "/>")
28
+ .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
29
+ .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
30
+ .gsub("\n--&gt;\n", "\n-->\n")
31
+ end
32
+
33
+ def msword_fix(doc)
34
+ # brain damage in MSWord parser
35
+ doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
36
+ "<w:DoNotOptimizeForBrowser/>")
37
+ doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
38
+ '<span style="mso-special-character:footnote"></span>')
39
+ doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
40
+ '<div style="mso-element:footnote-list"/>')
41
+ doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
42
+ doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
43
+ doc.gsub!(%r{<meta http-equiv="Content-Type"},
44
+ "<meta http-equiv=Content-Type")
45
+ doc.gsub!(%r{></m:jc>}, "/>")
46
+ doc.gsub!(%r{></v:stroke>}, "/>")
47
+ doc.gsub!(%r{></v:f>}, "/>")
48
+ doc.gsub!(%r{></v:path>}, "/>")
49
+ doc.gsub!(%r{></o:lock>}, "/>")
50
+ doc.gsub!(%r{></v:imagedata>}, "/>")
51
+ doc.gsub!(%r{></w:wrap>}, "/>")
52
+ doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
53
+ doc.gsub!(%r{&tab;|&amp;tab;},
54
+ '<span style="mso-tab-count:1">&#xA0; </span>')
55
+ doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
56
+ a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
57
+ a
58
+ end.join
59
+ end
60
+
61
+ PRINT_VIEW = <<~XML.freeze
62
+
63
+ <xml>
64
+ <w:WordDocument>
65
+ <w:View>Print</w:View>
66
+ <w:Zoom>100</w:Zoom>
67
+ <w:DoNotOptimizeForBrowser/>
68
+ </w:WordDocument>
69
+ </xml>
70
+ <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
71
+ XML
72
+
73
+ def namespace(root)
74
+ { o: "urn:schemas-microsoft-com:office:office",
75
+ w: "urn:schemas-microsoft-com:office:word",
76
+ v: "urn:schemas-microsoft-com:vml",
77
+ m: "http://schemas.microsoft.com/office/2004/12/omml" }.each { |k, v| root.add_namespace_definition(k.to_s, v) }
78
+ end
79
+
80
+ def rootnamespace(root)
81
+ root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
82
+ end
83
+ end
data/lib/html2doc.rb CHANGED
@@ -4,3 +4,4 @@ require_relative "html2doc/mime"
4
4
  require_relative "html2doc/notes"
5
5
  require_relative "html2doc/math"
6
6
  require_relative "html2doc/lists"
7
+ require_relative "html2doc/xml"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.5.4
4
+ version: 1.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-05-19 00:00:00.000000000 Z
11
+ date: 2023-08-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -293,6 +293,7 @@ files:
293
293
  - lib/html2doc/notes.rb
294
294
  - lib/html2doc/version.rb
295
295
  - lib/html2doc/wordstyle.css
296
+ - lib/html2doc/xml.rb
296
297
  homepage: https://github.com/metanorma/html2doc
297
298
  licenses:
298
299
  - CC-BY-SA-3.0