html2doc 1.0.5 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6616c37575b4bd09b8b7bf7a89beffece7388f3e2a4039610825a030ffb72318
4
- data.tar.gz: f29a3348e0b9260c4178af9a7f939e8ec39c2401cdc321346e5a6321e862cbad
3
+ metadata.gz: b92a72c9d0ee6005e38ab8de1f0cbc48455819955d34eb349fef1244504a9971
4
+ data.tar.gz: 52b07e9c1720bc2bf7a7196e0f650c62b81bf534535795bb2a136f2e79829416
5
5
  SHA512:
6
- metadata.gz: 4dee0b6541178293833caf2d55e1ec382ea16692b4efe4455ff0fd627e89ef5e67be747248cefd6c56ff151d8f9ea46f54633241d90996055e3574604eed175e
7
- data.tar.gz: da5201fa1568f34e0638947d6dcfb5df1b7cf4303de1f74323122fcf92f6060cc7d5ddcc483cbfbfcb70798d12afc95c89b48d0274a34114fd8b18d7c1d58692
6
+ metadata.gz: af4b84183859fd83ac500c6c1ac28e76d49ec7fb48c8a4cf49bc3543909b913e78c8293918b34e21e9e68f2dc78a500824337677ae1aea54bcf7cbecebd9363d
7
+ data.tar.gz: 9e0bd48390458ec57dcc9650643a9332a549148a1297b37962e37399453d95132e39c69cb94ea13fcc70f9d6bc273fccf742da61e6f4a3025710182df1d65144
@@ -0,0 +1,42 @@
1
+ # Auto-generated by Cimas: Do not edit it manually!
2
+ # See https://github.com/metanorma/cimas
3
+ name: rake
4
+
5
+ on:
6
+ push:
7
+ branches: [ master, main ]
8
+ tags: [ v* ]
9
+ pull_request:
10
+
11
+ jobs:
12
+ rake:
13
+ name: Test on Ruby ${{ matrix.ruby }} ${{ matrix.os }}
14
+ runs-on: ${{ matrix.os }}
15
+ continue-on-error: ${{ matrix.experimental }}
16
+ strategy:
17
+ fail-fast: false
18
+ matrix:
19
+ ruby: [ '2.7', '2.6', '2.5', '2.4' ]
20
+ os: [ ubuntu-latest, windows-latest, macos-latest ]
21
+ experimental: [ false ]
22
+ include:
23
+ - ruby: '3.0'
24
+ os: 'ubuntu-latest'
25
+ experimental: true
26
+ - ruby: '3.0'
27
+ os: 'windows-latest'
28
+ experimental: true
29
+ - ruby: '3.0'
30
+ os: 'macos-latest'
31
+ experimental: true
32
+ steps:
33
+ - uses: actions/checkout@v2
34
+ with:
35
+ submodules: true
36
+
37
+ - uses: ruby/setup-ruby@v1
38
+ with:
39
+ ruby-version: ${{ matrix.ruby }}
40
+ bundler-cache: true
41
+
42
+ - run: bundle exec rake
data/.gitignore CHANGED
@@ -9,3 +9,5 @@
9
9
 
10
10
  # rspec failure tracking
11
11
  .rspec_status
12
+
13
+ .rubocop-https--*
data/.rubocop.yml CHANGED
@@ -1,10 +1,12 @@
1
1
  # This project follows the Ribose OSS style guide.
2
2
  # https://github.com/riboseinc/oss-guides
3
3
  # All project-specific additions and overrides should be specified in this file.
4
-
5
4
  inherit_from:
6
5
  - https://raw.githubusercontent.com/riboseinc/oss-guides/master/ci/rubocop.yml
6
+
7
+ # local repo-specific modifications
8
+
7
9
  AllCops:
8
- TargetRubyVersion: 2.3
9
- Rails:
10
- Enabled: true
10
+ DisplayCopNames: false
11
+ StyleGuideCopsOnly: false
12
+ TargetRubyVersion: 2.4
data/Gemfile CHANGED
@@ -10,6 +10,6 @@ end
10
10
 
11
11
  gemspec
12
12
 
13
- if File.exist? 'Gemfile.devel'
14
- eval File.read('Gemfile.devel'), nil, 'Gemfile.devel' # rubocop:disable Security/Eval
13
+ if File.exist? "Gemfile.devel"
14
+ eval File.read("Gemfile.devel"), nil, "Gemfile.devel" # rubocop:disable Security/Eval
15
15
  end
data/README.adoc CHANGED
@@ -3,9 +3,7 @@
3
3
  https://github.com/metanorma/html2doc/workflows/main/badge.svg
4
4
 
5
5
  image:https://img.shields.io/gem/v/html2doc.svg["Gem Version", link="https://rubygems.org/gems/html2doc"]
6
- image:https://github.com/metanorma/html2doc/workflows/ubuntu/badge.svg["Ubuntu Build Status", link="https://github.com/metanorma/html2doc/actions?query=workflow%3Aubuntu"]
7
- image:https://github.com/metanorma/html2doc/workflows/macos/badge.svg["OSX Build Status", link="https://github.com/metanorma/html2doc/actions?query=workflow%3Amacos"]
8
- image:https://github.com/metanorma/html2doc/workflows/windows/badge.svg["Windows Build Status", link="https://github.com/metanorma/html2doc/actions?query=workflow%3Awindows"]
6
+ image:https://github.com/metanorma/html2doc/workflows/rake/badge.svg["Build Status", link="https://github.com/metanorma/html2doc/actions?workflow=rake"]
9
7
  image:https://codeclimate.com/github/metanorma/html2doc/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/html2doc"]
10
8
  image:https://img.shields.io/github/issues-pr-raw/metanorma/html2doc.svg["Pull Requests", link="https://github.com/metanorma/html2doc/pulls"]
11
9
  image:https://img.shields.io/github/commits-since/metanorma/html2doc/latest.svg["Commits since latest",link="https://github.com/metanorma/html2doc/releases"]
data/Rakefile CHANGED
@@ -3,4 +3,4 @@ require "rspec/core/rake_task"
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
- task :default => :spec
6
+ task default: :spec
data/bin/html2doc CHANGED
@@ -21,9 +21,8 @@ if ARGV.length < 1
21
21
  end
22
22
 
23
23
  Html2Doc.process(
24
- File.read(ARGV[0], encoding: "utf-8"),
24
+ File.read(ARGV[0], encoding: "utf-8"),
25
25
  filename: ARGV[0].gsub(/\.html?$/, ""),
26
26
  stylesheet: options[:stylesheet],
27
- header: options[:header],
27
+ header: options[:header]
28
28
  )
29
-
data/html2doc.gemspec CHANGED
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
28
28
  spec.add_dependency "htmlentities", "~> 4.3.4"
29
29
  spec.add_dependency "image_size"
30
30
  spec.add_dependency "mime-types"
31
- spec.add_dependency "nokogiri", ">= 1.10.4"
31
+ spec.add_dependency "nokogiri", "~> 1.10.4"
32
32
  spec.add_dependency "thread_safe"
33
33
  spec.add_dependency "uuidtools"
34
- spec.add_dependency "asciimath", "~> 2.0.0"
34
+ spec.add_dependency "asciimath", "~> 2.0.2"
35
+ spec.add_dependency "plane1converter", "~> 0.0.1"
35
36
 
36
37
  spec.add_development_dependency "byebug", "~> 9.1"
37
38
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
@@ -39,7 +40,7 @@ Gem::Specification.new do |spec|
39
40
  spec.add_development_dependency "guard-rspec", "~> 4.7"
40
41
  spec.add_development_dependency "rake", "~> 12.0"
41
42
  spec.add_development_dependency "rspec", "~> 3.6"
42
- spec.add_development_dependency "rubocop", "= 0.54.0"
43
+ spec.add_development_dependency "rubocop", "~> 1.5.2"
43
44
  spec.add_development_dependency "simplecov", "~> 0.15"
44
45
  spec.add_development_dependency "timecop", "~> 0.9"
45
46
  spec.add_development_dependency "rspec-match_fuzzy", "~> 0.1.3"
data/lib/html2doc/base.rb CHANGED
@@ -2,8 +2,6 @@ require "uuidtools"
2
2
  require "asciimath"
3
3
  require "htmlentities"
4
4
  require "nokogiri"
5
- #require "xml/xslt"
6
- require "pp"
7
5
  require "fileutils"
8
6
 
9
7
  module Html2Doc
@@ -19,16 +17,26 @@ module Html2Doc
19
17
 
20
18
  def self.process_header(headerfile, hash)
21
19
  return if headerfile.nil?
20
+
22
21
  doc = File.read(headerfile, encoding: "utf-8")
23
- doc = header_image_cleanup(doc, hash[:dir1], hash[:filename], File.dirname(hash[:filename]))
22
+ doc = header_image_cleanup(doc, hash[:dir1], hash[:filename],
23
+ File.dirname(hash[:filename]))
24
24
  File.open("#{hash[:dir1]}/header.html", "w:UTF-8") { |f| f.write(doc) }
25
25
  end
26
26
 
27
+ def self.clear_dir(dir)
28
+ Dir.foreach(dir) do |f|
29
+ fn = File.join(dir, f)
30
+ File.delete(fn) if f != "." && f != ".."
31
+ end
32
+ dir
33
+ end
34
+
27
35
  def self.create_dir(filename, dir)
28
- return dir if dir
36
+ dir and return clear_dir(dir)
29
37
  dir = "#{filename}_files"
30
38
  Dir.mkdir(dir) unless File.exists?(dir)
31
- dir
39
+ clear_dir(dir)
32
40
  end
33
41
 
34
42
  def self.process_html(result, hash)
@@ -64,7 +72,7 @@ module Html2Doc
64
72
 
65
73
  def self.to_xhtml(xml)
66
74
  xml.gsub!(/<\?xml[^>]*>/, "")
67
- unless /<!DOCTYPE /.match xml
75
+ unless /<!DOCTYPE /.match? xml
68
76
  xml = '<!DOCTYPE html SYSTEM
69
77
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
70
78
  end
@@ -76,34 +84,34 @@ module Html2Doc
76
84
  DOCTYPE
77
85
 
78
86
  def self.from_xhtml(xml)
79
- xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "").
80
- sub(DOCTYPE, "").
81
- gsub(%{ />}, "/>")
87
+ xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
88
+ .sub(DOCTYPE, "")
89
+ .gsub(%{ />}, "/>")
82
90
  end
83
91
 
84
- def self.msword_fix(r)
92
+ def self.msword_fix(doc)
85
93
  # brain damage in MSWord parser
86
- r.gsub!(%r{<span style="mso-special-character:footnote"/>},
87
- '<span style="mso-special-character:footnote"></span>')
88
- r.gsub!(%r{<div style="mso-element:footnote-list"></div>},
89
- '<div style="mso-element:footnote-list"/>')
90
- r.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
91
- r.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
92
- r.gsub!(%r{<meta http-equiv="Content-Type"},
93
- "<meta http-equiv=Content-Type")
94
- r.gsub!(%r{></m:jc>}, "/>")
95
- r.gsub!(%r{></v:stroke>}, "/>")
96
- r.gsub!(%r{></v:f>}, "/>")
97
- r.gsub!(%r{></v:path>}, "/>")
98
- r.gsub!(%r{></o:lock>}, "/>")
99
- r.gsub!(%r{></v:imagedata>}, "/>")
100
- r.gsub!(%r{></w:wrap>}, "/>")
101
- r.gsub!(%r{&tab;|&amp;tab;}, '<span style="mso-tab-count:1">&#xA0; </span>')
102
- r = r.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
94
+ doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
95
+ '<span style="mso-special-character:footnote"></span>')
96
+ doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
97
+ '<div style="mso-element:footnote-list"/>')
98
+ doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
99
+ doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
100
+ doc.gsub!(%r{<meta http-equiv="Content-Type"},
101
+ "<meta http-equiv=Content-Type")
102
+ doc.gsub!(%r{></m:jc>}, "/>")
103
+ doc.gsub!(%r{></v:stroke>}, "/>")
104
+ doc.gsub!(%r{></v:f>}, "/>")
105
+ doc.gsub!(%r{></v:path>}, "/>")
106
+ doc.gsub!(%r{></o:lock>}, "/>")
107
+ doc.gsub!(%r{></v:imagedata>}, "/>")
108
+ doc.gsub!(%r{></w:wrap>}, "/>")
109
+ doc.gsub!(%r{&tab;|&amp;tab;},
110
+ '<span style="mso-tab-count:1">&#xA0; </span>')
111
+ doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
103
112
  a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
104
113
  a
105
114
  end.join
106
- r
107
115
  end
108
116
 
109
117
  PRINT_VIEW = <<~XML.freeze
@@ -122,26 +130,27 @@ module Html2Doc
122
130
  def self.define_head1(docxml, dir)
123
131
  docxml.xpath("//*[local-name() = 'head']").each do |h|
124
132
  h.children.first.add_previous_sibling <<~XML
125
- #{PRINT_VIEW}
126
- <link rel="File-List" href="#{File.basename(dir)}/filelist.xml"/>
133
+ #{PRINT_VIEW}
134
+ <link rel="File-List" href="cid:filelist.xml"/>
127
135
  XML
128
136
  end
129
137
  end
130
138
 
131
- def self.filename_substitute(stylesheet, header_filename, filename)
132
- if header_filename.nil?
133
- stylesheet.gsub!(/\n[^\n]*FILENAME[^\n]*i\n/, "\n")
134
- else
135
- stylesheet.gsub!(/FILENAME/, File.basename(filename))
139
+ def self.filename_substitute(head, header_filename)
140
+ return if header_filename.nil?
141
+
142
+ head.xpath(".//*[local-name() = 'style']").each do |s|
143
+ s1 = s.to_xml.gsub(/url\("[^"]+"\)/) do |m|
144
+ /FILENAME/.match?(m) ? "url(cid:header.html)" : m
145
+ end
146
+ s.replace(s1)
136
147
  end
137
- stylesheet
138
148
  end
139
149
 
140
150
  def self.stylesheet(filename, header_filename, fn)
141
- (fn.nil? || fn.empty?) &&
151
+ (fn.nil? || fn.empty?) and
142
152
  fn = File.join(File.dirname(__FILE__), "wordstyle.css")
143
153
  stylesheet = File.read(fn, encoding: "UTF-8")
144
- stylesheet = filename_substitute(stylesheet, header_filename, filename)
145
154
  xml = Nokogiri::XML("<style/>")
146
155
  xml.children.first << Nokogiri::XML::Comment.new(xml, "\n#{stylesheet}\n")
147
156
  xml.root.to_s
@@ -152,6 +161,7 @@ module Html2Doc
152
161
  head = docxml.at("//*[local-name() = 'head']")
153
162
  css = stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])
154
163
  add_stylesheet(head, title, css)
164
+ filename_substitute(head, hash[:header_file])
155
165
  define_head1(docxml, hash[:dir1])
156
166
  rootnamespace(docxml.root)
157
167
  end
@@ -180,13 +190,13 @@ module Html2Doc
180
190
  end
181
191
 
182
192
  def self.bookmarks(docxml)
183
- docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]").each do |x|
184
- next if x["id"].empty?
185
- next if %w(shapetype v:shapetype shape v:shape).include? x.name
186
- if x.children.empty?
187
- x.add_child("<a name='#{x["id"]}'></a>")
188
- else
189
- x.children.first.previous = "<a name='#{x["id"]}'></a>"
193
+ docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
194
+ .each do |x|
195
+ next if x["id"].empty? ||
196
+ %w(shapetype v:shapetype shape v:shape).include?(x.name)
197
+
198
+ if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
199
+ else x.children.first.previous = "<a name='#{x['id']}'></a>"
190
200
  end
191
201
  x.delete("id")
192
202
  end
@@ -7,6 +7,7 @@ require "uuidtools"
7
7
  module Html2Doc
8
8
  def self.style_list(li, level, liststyle, listnumber)
9
9
  return unless liststyle
10
+
10
11
  if li["style"]
11
12
  li["style"] += ";"
12
13
  else
@@ -16,37 +17,39 @@ module Html2Doc
16
17
  end
17
18
 
18
19
  def self.list_add1(li, liststyles, listtype, level)
19
- if [:ul, :ol].include? listtype
20
- list_add(li.xpath(".//ul") - li.xpath(".//ul//ul | .//ol//ul"),
21
- liststyles, :ul, level + 1)
22
- list_add(li.xpath(".//ol") - li.xpath(".//ul//ol | .//ol//ol"),
23
- liststyles, :ol, level + 1)
24
- else
25
- list_add(li.xpath(".//ul") - li.xpath(".//ul//ul | .//ol//ul"),
26
- liststyles, listtype, level + 1)
27
- list_add(li.xpath(".//ol") - li.xpath(".//ul//ol | .//ol//ol"),
28
- liststyles, listtype, level + 1)
29
- end
20
+ if %i[ul ol].include? listtype
21
+ list_add(li.xpath(".//ul") - li.xpath(".//ul//ul | .//ol//ul"),
22
+ liststyles, :ul, level + 1)
23
+ list_add(li.xpath(".//ol") - li.xpath(".//ul//ol | .//ol//ol"),
24
+ liststyles, :ol, level + 1)
25
+ else
26
+ list_add(li.xpath(".//ul") - li.xpath(".//ul//ul | .//ol//ul"),
27
+ liststyles, listtype, level + 1)
28
+ list_add(li.xpath(".//ol") - li.xpath(".//ul//ol | .//ol//ol"),
29
+ liststyles, listtype, level + 1)
30
+ end
30
31
  end
31
32
 
32
33
  def self.list_add(xpath, liststyles, listtype, level)
33
- xpath.each_with_index do |list, i|
34
+ xpath.each_with_index do |l, _i|
34
35
  @listnumber += 1 if level == 1
35
- list["seen"] = true if level == 1
36
- list["id"] ||= UUIDTools::UUID.random_create
37
- (list.xpath(".//li") - list.xpath(".//ol//li | .//ul//li")).each do |li|
36
+ l["seen"] = true if level == 1
37
+ l["id"] ||= UUIDTools::UUID.random_create
38
+ (l.xpath(".//li") - l.xpath(".//ol//li | .//ul//li")).each do |li|
38
39
  style_list(li, level, liststyles[listtype], @listnumber)
39
40
  list_add1(li, liststyles, listtype, level)
40
41
  end
41
- list.xpath(".//ul[not(ancestor::li/ancestor::*/@id = '#{list['id']}')] | "\
42
- ".//ol[not(ancestor::li/ancestor::*/@id = '#{list['id']}')]").each do |li|
43
- list_add1(li.parent, liststyles, listtype, level-1)
42
+ l.xpath(".//ul[not(ancestor::li/ancestor::*/@id = '#{l['id']}')] | "\
43
+ ".//ol[not(ancestor::li/ancestor::*/@id = '#{l['id']}')]")
44
+ .each do |li|
45
+ list_add1(li.parent, liststyles, listtype, level - 1)
44
46
  end
45
47
  end
46
48
  end
47
49
 
48
50
  def self.list2para(u)
49
51
  return if u.xpath("./li").empty?
52
+
50
53
  u.xpath("./li").first["class"] ||= "MsoListParagraphCxSpFirst"
51
54
  u.xpath("./li").last["class"] ||= "MsoListParagraphCxSpLast"
52
55
  u.xpath("./li/p").each { |p| p["class"] ||= "MsoListParagraphCxSpMiddle" }
@@ -64,21 +67,25 @@ module Html2Doc
64
67
  def self.lists1(docxml, liststyles, k)
65
68
  case k
66
69
  when :ul then list_add(docxml.xpath("//ul[not(@class)]#{TOPLIST}"),
67
- liststyles, :ul, 1)
70
+ liststyles, :ul, 1)
68
71
  when :ol then list_add(docxml.xpath("//ol[not(@class)]#{TOPLIST}"),
69
72
  liststyles, :ol, 1)
70
73
  else
71
- list_add(docxml.xpath("//ol[@class = '#{k.to_s}']#{TOPLIST} | "\
72
- "//ul[@class = '#{k.to_s}']#{TOPLIST}"),
74
+ list_add(docxml.xpath("//ol[@class = '#{k}']#{TOPLIST} | "\
75
+ "//ul[@class = '#{k}']#{TOPLIST}"),
73
76
  liststyles, k, 1)
74
77
  end
75
78
  end
76
79
 
77
80
  def self.lists_unstyled(docxml, liststyles)
78
- list_add(docxml.xpath("//ul#{TOPLIST}[not(@seen)]"),
79
- liststyles, :ul, 1) if liststyles.has_key?(:ul)
80
- list_add(docxml.xpath("//ol#{TOPLIST}[not(@seen)]"),
81
- liststyles, :ul, 1) if liststyles.has_key?(:ol)
81
+ if liststyles.has_key?(:ul)
82
+ list_add(docxml.xpath("//ul#{TOPLIST}[not(@seen)]"),
83
+ liststyles, :ul, 1)
84
+ end
85
+ if liststyles.has_key?(:ol)
86
+ list_add(docxml.xpath("//ol#{TOPLIST}[not(@seen)]"),
87
+ liststyles, :ul, 1)
88
+ end
82
89
  docxml.xpath("//ul[@seen] | //ol[@seen]").each do |l|
83
90
  l.delete("seen")
84
91
  end
@@ -86,6 +93,7 @@ module Html2Doc
86
93
 
87
94
  def self.lists(docxml, liststyles)
88
95
  return if liststyles.nil?
96
+
89
97
  @listnumber = 0
90
98
  liststyles.each_key { |k| lists1(docxml, liststyles, k) }
91
99
  lists_unstyled(docxml, liststyles)
data/lib/html2doc/math.rb CHANGED
@@ -2,20 +2,27 @@ require "uuidtools"
2
2
  require "asciimath"
3
3
  require "htmlentities"
4
4
  require "nokogiri"
5
+ require "plane1converter"
5
6
 
6
7
  module Html2Doc
7
8
  @xsltemplate =
8
9
  Nokogiri::XSLT(File.read(File.join(File.dirname(__FILE__), "mml2omml.xsl"),
9
10
  encoding: "utf-8"))
10
11
 
11
- def self.asciimath_to_mathml1(x)
12
- AsciiMath::MathMLBuilder.new(:msword => true).append_expression(
13
- AsciiMath.parse(HTMLEntities.new.decode(x)).ast).to_s.
14
- gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
12
+ def self.asciimath_to_mathml1(expr)
13
+ AsciiMath::MathMLBuilder.new(msword: true).append_expression(
14
+ AsciiMath.parse(HTMLEntities.new.decode(expr)).ast,
15
+ ).to_s
16
+ .gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
17
+ rescue StandardError => e
18
+ puts "parsing: #{expr}"
19
+ puts e.message
20
+ raise e
15
21
  end
16
22
 
17
23
  def self.asciimath_to_mathml(doc, delims)
18
24
  return doc if delims.nil? || delims.size < 2
25
+
19
26
  m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
20
27
  m.each_slice(4).map.with_index do |(*a), i|
21
28
  i % 500 == 0 && m.size > 1000 && i > 0 and
@@ -25,43 +32,96 @@ module Html2Doc
25
32
  end.join
26
33
  end
27
34
 
35
+ def self.unwrap_accents(doc)
36
+ doc.xpath("//*[@accent = 'true']").each do |x|
37
+ x.elements.length > 1 or next
38
+ x.elements[1].name == "mrow" and
39
+ x.elements[1].replace(x.elements[1].children)
40
+ end
41
+ doc
42
+ end
43
+
28
44
  # random fixes to MathML input that OOXML needs to render properly
29
- def self.ooxml_cleanup(m, docnamespaces)
30
- m = mathml_preserve_space(mathml_insert_rows(m, docnamespaces), docnamespaces)
31
- m.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
32
- m
45
+ def self.ooxml_cleanup(math, docnamespaces)
46
+ math = unwrap_accents(
47
+ mathml_preserve_space(
48
+ mathml_insert_rows(math, docnamespaces), docnamespaces
49
+ ),
50
+ )
51
+ math.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
52
+ math
33
53
  end
34
54
 
35
- def self.mathml_insert_rows(m, docnamespaces)
36
- m.xpath(%w(msup msub msubsup munder mover munderover).
37
- map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
55
+ def self.mathml_insert_rows(math, docnamespaces)
56
+ math.xpath(%w(msup msub msubsup munder mover munderover)
57
+ .map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
38
58
  next unless x.next_element && x.next_element != "mrow"
59
+
39
60
  x.next_element.wrap("<mrow/>")
40
61
  end
41
- m
62
+ math
42
63
  end
43
64
 
44
- def self.mathml_preserve_space(m, docnamespaces)
45
- m.xpath(".//xmlns:mtext", docnamespaces).each do |x|
65
+ def self.mathml_preserve_space(math, docnamespaces)
66
+ math.xpath(".//xmlns:mtext", docnamespaces).each do |x|
46
67
  x.children = x.children.to_xml.gsub(/^\s/, "&#xA0;").gsub(/\s$/, "&#xA0;")
47
68
  end
48
- m
69
+ math
49
70
  end
50
71
 
51
- def self.unitalic(m)
52
- m.xpath(".//xmlns:r[xmlns:rPr/xmlns:sty[@m:val = 'p']]").each do |x|
72
+ def self.unitalic(math)
73
+ math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x|
53
74
  x.wrap("<span style='font-style:normal;'></span>")
54
75
  end
55
- m.xpath(".//xmlns:r[xmlns:rPr/xmlns:sty[@m:val = 'bi']]").each do |x|
56
- x.wrap("<span style='font-style:italic;font-weight:bold;'></span>")
76
+ math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'bi']]").each do |x|
77
+ x.wrap("<span class='nostem' style='font-weight:bold;'><em></em></span>")
57
78
  end
58
- m.xpath(".//xmlns:r[xmlns:rPr/xmlns:sty[@m:val = 'i']]").each do |x|
79
+ math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'i']]").each do |x|
59
80
  x.wrap("<span class='nostem'><em></em></span>")
60
81
  end
61
- m.xpath(".//xmlns:r[xmlns:rPr/xmlns:sty[@m:val = 'b']]").each do |x|
82
+ math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'b']]").each do |x|
62
83
  x.wrap("<span style='font-style:normal;font-weight:bold;'></span>")
63
84
  end
64
- m
85
+ math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'monospace']]").each do |x|
86
+ to_plane1(x, :monospace)
87
+ end
88
+ math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'double-struck']]").each do |x|
89
+ to_plane1(x, :doublestruck)
90
+ end
91
+ math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'script']]").each do |x|
92
+ to_plane1(x, :script)
93
+ end
94
+ math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'script']]").each do |x|
95
+ to_plane1(x, :scriptbold)
96
+ end
97
+ math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
98
+ to_plane1(x, :fraktur)
99
+ end
100
+ math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
101
+ to_plane1(x, :frakturbold)
102
+ end
103
+ math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
104
+ to_plane1(x, :sans)
105
+ end
106
+ math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
107
+ to_plane1(x, :sansbold)
108
+ end
109
+ math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'i']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
110
+ to_plane1(x, :sansitalic)
111
+ end
112
+ math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'bi']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
113
+ to_plane1(x, :sansbolditalic)
114
+ end
115
+ math
116
+ end
117
+
118
+ def self.to_plane1(xml, font)
119
+ xml.traverse do |n|
120
+ next unless n.text?
121
+
122
+ n.replace(Plane1Converter.conv(HTMLEntities.new.decode(n.text), font))
123
+ end
124
+ xml
65
125
  end
66
126
 
67
127
  def self.mathml_to_ooml(docxml)
@@ -71,22 +131,23 @@ module Html2Doc
71
131
  i % 100 == 0 && m.size > 500 && i > 0 and
72
132
  warn "Math OOXML #{i} of #{m.size}"
73
133
  element = ooxml_cleanup(x, docnamespaces)
74
- doc = Nokogiri::XML::Document::new()
134
+ doc = Nokogiri::XML::Document::new
75
135
  doc.root = element
76
- ooxml = (unitalic(esc_space(@xsltemplate.transform(doc)))).to_s.
77
- gsub(/<\?[^>]+>\s*/, "").
78
- gsub(/ xmlns(:[^=]+)?="[^"]+"/, "").
79
- gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
136
+ ooxml = unitalic(esc_space(@xsltemplate.transform(doc))).to_s
137
+ .gsub(/<\?[^>]+>\s*/, "")
138
+ .gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
139
+ .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
80
140
  ooxml = uncenter(x, ooxml)
81
141
  x.swap(ooxml)
82
142
  end
83
143
  end
84
144
 
85
- # escape space as &#x32;; we are removing any spaces generated by
145
+ # escape space as &#x32;; we are removing any spaces generated by
86
146
  # XML indentation
87
147
  def self.esc_space(xml)
88
148
  xml.traverse do |n|
89
149
  next unless n.text?
150
+
90
151
  n = n.text.gsub(/ /, "&#x32;")
91
152
  end
92
153
  xml
@@ -94,17 +155,15 @@ module Html2Doc
94
155
 
95
156
  # if oomml has no siblings, by default it is centered; override this with
96
157
  # left/right if parent is so tagged
97
- def self.uncenter(m, ooxml)
98
- if m.next == nil && m.previous == nil
99
- alignnode = m.at(".//ancestor::*[@style][local-name() = 'p' or "\
100
- "local-name() = 'div' or local-name() = 'td']/@style")
101
- return ooxml unless alignnode
102
- if alignnode.text.include? ("text-align:left")
103
- ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
104
- "m:val='left'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
105
- elsif alignnode.text.include? ("text-align:right")
158
+ def self.uncenter(math, ooxml)
159
+ alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
160
+ "local-name() = 'div' or local-name() = 'td']/@style")
161
+ return ooxml unless alignnode && (math.next == nil && math.previous == nil)
162
+
163
+ %w(left right).each do |dir|
164
+ if alignnode.text.include? ("text-align:#{dir}")
106
165
  ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
107
- "m:val='right'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
166
+ "m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
108
167
  end
109
168
  end
110
169
  ooxml