html2doc 1.0.5 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6616c37575b4bd09b8b7bf7a89beffece7388f3e2a4039610825a030ffb72318
4
- data.tar.gz: f29a3348e0b9260c4178af9a7f939e8ec39c2401cdc321346e5a6321e862cbad
3
+ metadata.gz: b92a72c9d0ee6005e38ab8de1f0cbc48455819955d34eb349fef1244504a9971
4
+ data.tar.gz: 52b07e9c1720bc2bf7a7196e0f650c62b81bf534535795bb2a136f2e79829416
5
5
  SHA512:
6
- metadata.gz: 4dee0b6541178293833caf2d55e1ec382ea16692b4efe4455ff0fd627e89ef5e67be747248cefd6c56ff151d8f9ea46f54633241d90996055e3574604eed175e
7
- data.tar.gz: da5201fa1568f34e0638947d6dcfb5df1b7cf4303de1f74323122fcf92f6060cc7d5ddcc483cbfbfcb70798d12afc95c89b48d0274a34114fd8b18d7c1d58692
6
+ metadata.gz: af4b84183859fd83ac500c6c1ac28e76d49ec7fb48c8a4cf49bc3543909b913e78c8293918b34e21e9e68f2dc78a500824337677ae1aea54bcf7cbecebd9363d
7
+ data.tar.gz: 9e0bd48390458ec57dcc9650643a9332a549148a1297b37962e37399453d95132e39c69cb94ea13fcc70f9d6bc273fccf742da61e6f4a3025710182df1d65144
@@ -0,0 +1,42 @@
1
+ # Auto-generated by Cimas: Do not edit it manually!
2
+ # See https://github.com/metanorma/cimas
3
+ name: rake
4
+
5
+ on:
6
+ push:
7
+ branches: [ master, main ]
8
+ tags: [ v* ]
9
+ pull_request:
10
+
11
+ jobs:
12
+ rake:
13
+ name: Test on Ruby ${{ matrix.ruby }} ${{ matrix.os }}
14
+ runs-on: ${{ matrix.os }}
15
+ continue-on-error: ${{ matrix.experimental }}
16
+ strategy:
17
+ fail-fast: false
18
+ matrix:
19
+ ruby: [ '2.7', '2.6', '2.5', '2.4' ]
20
+ os: [ ubuntu-latest, windows-latest, macos-latest ]
21
+ experimental: [ false ]
22
+ include:
23
+ - ruby: '3.0'
24
+ os: 'ubuntu-latest'
25
+ experimental: true
26
+ - ruby: '3.0'
27
+ os: 'windows-latest'
28
+ experimental: true
29
+ - ruby: '3.0'
30
+ os: 'macos-latest'
31
+ experimental: true
32
+ steps:
33
+ - uses: actions/checkout@v2
34
+ with:
35
+ submodules: true
36
+
37
+ - uses: ruby/setup-ruby@v1
38
+ with:
39
+ ruby-version: ${{ matrix.ruby }}
40
+ bundler-cache: true
41
+
42
+ - run: bundle exec rake
data/.gitignore CHANGED
@@ -9,3 +9,5 @@
9
9
 
10
10
  # rspec failure tracking
11
11
  .rspec_status
12
+
13
+ .rubocop-https--*
data/.rubocop.yml CHANGED
@@ -1,10 +1,12 @@
1
1
  # This project follows the Ribose OSS style guide.
2
2
  # https://github.com/riboseinc/oss-guides
3
3
  # All project-specific additions and overrides should be specified in this file.
4
-
5
4
  inherit_from:
6
5
  - https://raw.githubusercontent.com/riboseinc/oss-guides/master/ci/rubocop.yml
6
+
7
+ # local repo-specific modifications
8
+
7
9
  AllCops:
8
- TargetRubyVersion: 2.3
9
- Rails:
10
- Enabled: true
10
+ DisplayCopNames: false
11
+ StyleGuideCopsOnly: false
12
+ TargetRubyVersion: 2.4
data/Gemfile CHANGED
@@ -10,6 +10,6 @@ end
10
10
 
11
11
  gemspec
12
12
 
13
- if File.exist? 'Gemfile.devel'
14
- eval File.read('Gemfile.devel'), nil, 'Gemfile.devel' # rubocop:disable Security/Eval
13
+ if File.exist? "Gemfile.devel"
14
+ eval File.read("Gemfile.devel"), nil, "Gemfile.devel" # rubocop:disable Security/Eval
15
15
  end
data/README.adoc CHANGED
@@ -3,9 +3,7 @@
3
3
  https://github.com/metanorma/html2doc/workflows/main/badge.svg
4
4
 
5
5
  image:https://img.shields.io/gem/v/html2doc.svg["Gem Version", link="https://rubygems.org/gems/html2doc"]
6
- image:https://github.com/metanorma/html2doc/workflows/ubuntu/badge.svg["Ubuntu Build Status", link="https://github.com/metanorma/html2doc/actions?query=workflow%3Aubuntu"]
7
- image:https://github.com/metanorma/html2doc/workflows/macos/badge.svg["OSX Build Status", link="https://github.com/metanorma/html2doc/actions?query=workflow%3Amacos"]
8
- image:https://github.com/metanorma/html2doc/workflows/windows/badge.svg["Windows Build Status", link="https://github.com/metanorma/html2doc/actions?query=workflow%3Awindows"]
6
+ image:https://github.com/metanorma/html2doc/workflows/rake/badge.svg["Build Status", link="https://github.com/metanorma/html2doc/actions?workflow=rake"]
9
7
  image:https://codeclimate.com/github/metanorma/html2doc/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/html2doc"]
10
8
  image:https://img.shields.io/github/issues-pr-raw/metanorma/html2doc.svg["Pull Requests", link="https://github.com/metanorma/html2doc/pulls"]
11
9
  image:https://img.shields.io/github/commits-since/metanorma/html2doc/latest.svg["Commits since latest",link="https://github.com/metanorma/html2doc/releases"]
data/Rakefile CHANGED
@@ -3,4 +3,4 @@ require "rspec/core/rake_task"
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
- task :default => :spec
6
+ task default: :spec
data/bin/html2doc CHANGED
@@ -21,9 +21,8 @@ if ARGV.length < 1
21
21
  end
22
22
 
23
23
  Html2Doc.process(
24
- File.read(ARGV[0], encoding: "utf-8"),
24
+ File.read(ARGV[0], encoding: "utf-8"),
25
25
  filename: ARGV[0].gsub(/\.html?$/, ""),
26
26
  stylesheet: options[:stylesheet],
27
- header: options[:header],
27
+ header: options[:header]
28
28
  )
29
-
data/html2doc.gemspec CHANGED
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
28
28
  spec.add_dependency "htmlentities", "~> 4.3.4"
29
29
  spec.add_dependency "image_size"
30
30
  spec.add_dependency "mime-types"
31
- spec.add_dependency "nokogiri", ">= 1.10.4"
31
+ spec.add_dependency "nokogiri", "~> 1.10.4"
32
32
  spec.add_dependency "thread_safe"
33
33
  spec.add_dependency "uuidtools"
34
- spec.add_dependency "asciimath", "~> 2.0.0"
34
+ spec.add_dependency "asciimath", "~> 2.0.2"
35
+ spec.add_dependency "plane1converter", "~> 0.0.1"
35
36
 
36
37
  spec.add_development_dependency "byebug", "~> 9.1"
37
38
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
@@ -39,7 +40,7 @@ Gem::Specification.new do |spec|
39
40
  spec.add_development_dependency "guard-rspec", "~> 4.7"
40
41
  spec.add_development_dependency "rake", "~> 12.0"
41
42
  spec.add_development_dependency "rspec", "~> 3.6"
42
- spec.add_development_dependency "rubocop", "= 0.54.0"
43
+ spec.add_development_dependency "rubocop", "~> 1.5.2"
43
44
  spec.add_development_dependency "simplecov", "~> 0.15"
44
45
  spec.add_development_dependency "timecop", "~> 0.9"
45
46
  spec.add_development_dependency "rspec-match_fuzzy", "~> 0.1.3"
data/lib/html2doc/base.rb CHANGED
@@ -2,8 +2,6 @@ require "uuidtools"
2
2
  require "asciimath"
3
3
  require "htmlentities"
4
4
  require "nokogiri"
5
- #require "xml/xslt"
6
- require "pp"
7
5
  require "fileutils"
8
6
 
9
7
  module Html2Doc
@@ -19,16 +17,26 @@ module Html2Doc
19
17
 
20
18
  def self.process_header(headerfile, hash)
21
19
  return if headerfile.nil?
20
+
22
21
  doc = File.read(headerfile, encoding: "utf-8")
23
- doc = header_image_cleanup(doc, hash[:dir1], hash[:filename], File.dirname(hash[:filename]))
22
+ doc = header_image_cleanup(doc, hash[:dir1], hash[:filename],
23
+ File.dirname(hash[:filename]))
24
24
  File.open("#{hash[:dir1]}/header.html", "w:UTF-8") { |f| f.write(doc) }
25
25
  end
26
26
 
27
+ def self.clear_dir(dir)
28
+ Dir.foreach(dir) do |f|
29
+ fn = File.join(dir, f)
30
+ File.delete(fn) if f != "." && f != ".."
31
+ end
32
+ dir
33
+ end
34
+
27
35
  def self.create_dir(filename, dir)
28
- return dir if dir
36
+ dir and return clear_dir(dir)
29
37
  dir = "#{filename}_files"
30
38
  Dir.mkdir(dir) unless File.exists?(dir)
31
- dir
39
+ clear_dir(dir)
32
40
  end
33
41
 
34
42
  def self.process_html(result, hash)
@@ -64,7 +72,7 @@ module Html2Doc
64
72
 
65
73
  def self.to_xhtml(xml)
66
74
  xml.gsub!(/<\?xml[^>]*>/, "")
67
- unless /<!DOCTYPE /.match xml
75
+ unless /<!DOCTYPE /.match? xml
68
76
  xml = '<!DOCTYPE html SYSTEM
69
77
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
70
78
  end
@@ -76,34 +84,34 @@ module Html2Doc
76
84
  DOCTYPE
77
85
 
78
86
  def self.from_xhtml(xml)
79
- xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "").
80
- sub(DOCTYPE, "").
81
- gsub(%{ />}, "/>")
87
+ xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
88
+ .sub(DOCTYPE, "")
89
+ .gsub(%{ />}, "/>")
82
90
  end
83
91
 
84
- def self.msword_fix(r)
92
+ def self.msword_fix(doc)
85
93
  # brain damage in MSWord parser
86
- r.gsub!(%r{<span style="mso-special-character:footnote"/>},
87
- '<span style="mso-special-character:footnote"></span>')
88
- r.gsub!(%r{<div style="mso-element:footnote-list"></div>},
89
- '<div style="mso-element:footnote-list"/>')
90
- r.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
91
- r.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
92
- r.gsub!(%r{<meta http-equiv="Content-Type"},
93
- "<meta http-equiv=Content-Type")
94
- r.gsub!(%r{></m:jc>}, "/>")
95
- r.gsub!(%r{></v:stroke>}, "/>")
96
- r.gsub!(%r{></v:f>}, "/>")
97
- r.gsub!(%r{></v:path>}, "/>")
98
- r.gsub!(%r{></o:lock>}, "/>")
99
- r.gsub!(%r{></v:imagedata>}, "/>")
100
- r.gsub!(%r{></w:wrap>}, "/>")
101
- r.gsub!(%r{&tab;|&amp;tab;}, '<span style="mso-tab-count:1">&#xA0; </span>')
102
- r = r.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
94
+ doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
95
+ '<span style="mso-special-character:footnote"></span>')
96
+ doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
97
+ '<div style="mso-element:footnote-list"/>')
98
+ doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
99
+ doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
100
+ doc.gsub!(%r{<meta http-equiv="Content-Type"},
101
+ "<meta http-equiv=Content-Type")
102
+ doc.gsub!(%r{></m:jc>}, "/>")
103
+ doc.gsub!(%r{></v:stroke>}, "/>")
104
+ doc.gsub!(%r{></v:f>}, "/>")
105
+ doc.gsub!(%r{></v:path>}, "/>")
106
+ doc.gsub!(%r{></o:lock>}, "/>")
107
+ doc.gsub!(%r{></v:imagedata>}, "/>")
108
+ doc.gsub!(%r{></w:wrap>}, "/>")
109
+ doc.gsub!(%r{&tab;|&amp;tab;},
110
+ '<span style="mso-tab-count:1">&#xA0; </span>')
111
+ doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
103
112
  a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
104
113
  a
105
114
  end.join
106
- r
107
115
  end
108
116
 
109
117
  PRINT_VIEW = <<~XML.freeze
@@ -122,26 +130,27 @@ module Html2Doc
122
130
  def self.define_head1(docxml, dir)
123
131
  docxml.xpath("//*[local-name() = 'head']").each do |h|
124
132
  h.children.first.add_previous_sibling <<~XML
125
- #{PRINT_VIEW}
126
- <link rel="File-List" href="#{File.basename(dir)}/filelist.xml"/>
133
+ #{PRINT_VIEW}
134
+ <link rel="File-List" href="cid:filelist.xml"/>
127
135
  XML
128
136
  end
129
137
  end
130
138
 
131
- def self.filename_substitute(stylesheet, header_filename, filename)
132
- if header_filename.nil?
133
- stylesheet.gsub!(/\n[^\n]*FILENAME[^\n]*i\n/, "\n")
134
- else
135
- stylesheet.gsub!(/FILENAME/, File.basename(filename))
139
+ def self.filename_substitute(head, header_filename)
140
+ return if header_filename.nil?
141
+
142
+ head.xpath(".//*[local-name() = 'style']").each do |s|
143
+ s1 = s.to_xml.gsub(/url\("[^"]+"\)/) do |m|
144
+ /FILENAME/.match?(m) ? "url(cid:header.html)" : m
145
+ end
146
+ s.replace(s1)
136
147
  end
137
- stylesheet
138
148
  end
139
149
 
140
150
  def self.stylesheet(filename, header_filename, fn)
141
- (fn.nil? || fn.empty?) &&
151
+ (fn.nil? || fn.empty?) and
142
152
  fn = File.join(File.dirname(__FILE__), "wordstyle.css")
143
153
  stylesheet = File.read(fn, encoding: "UTF-8")
144
- stylesheet = filename_substitute(stylesheet, header_filename, filename)
145
154
  xml = Nokogiri::XML("<style/>")
146
155
  xml.children.first << Nokogiri::XML::Comment.new(xml, "\n#{stylesheet}\n")
147
156
  xml.root.to_s
@@ -152,6 +161,7 @@ module Html2Doc
152
161
  head = docxml.at("//*[local-name() = 'head']")
153
162
  css = stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])
154
163
  add_stylesheet(head, title, css)
164
+ filename_substitute(head, hash[:header_file])
155
165
  define_head1(docxml, hash[:dir1])
156
166
  rootnamespace(docxml.root)
157
167
  end
@@ -180,13 +190,13 @@ module Html2Doc
180
190
  end
181
191
 
182
192
  def self.bookmarks(docxml)
183
- docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]").each do |x|
184
- next if x["id"].empty?
185
- next if %w(shapetype v:shapetype shape v:shape).include? x.name
186
- if x.children.empty?
187
- x.add_child("<a name='#{x["id"]}'></a>")
188
- else
189
- x.children.first.previous = "<a name='#{x["id"]}'></a>"
193
+ docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
194
+ .each do |x|
195
+ next if x["id"].empty? ||
196
+ %w(shapetype v:shapetype shape v:shape).include?(x.name)
197
+
198
+ if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
199
+ else x.children.first.previous = "<a name='#{x['id']}'></a>"
190
200
  end
191
201
  x.delete("id")
192
202
  end
@@ -7,6 +7,7 @@ require "uuidtools"
7
7
  module Html2Doc
8
8
  def self.style_list(li, level, liststyle, listnumber)
9
9
  return unless liststyle
10
+
10
11
  if li["style"]
11
12
  li["style"] += ";"
12
13
  else
@@ -16,37 +17,39 @@ module Html2Doc
16
17
  end
17
18
 
18
19
  def self.list_add1(li, liststyles, listtype, level)
19
- if [:ul, :ol].include? listtype
20
- list_add(li.xpath(".//ul") - li.xpath(".//ul//ul | .//ol//ul"),
21
- liststyles, :ul, level + 1)
22
- list_add(li.xpath(".//ol") - li.xpath(".//ul//ol | .//ol//ol"),
23
- liststyles, :ol, level + 1)
24
- else
25
- list_add(li.xpath(".//ul") - li.xpath(".//ul//ul | .//ol//ul"),
26
- liststyles, listtype, level + 1)
27
- list_add(li.xpath(".//ol") - li.xpath(".//ul//ol | .//ol//ol"),
28
- liststyles, listtype, level + 1)
29
- end
20
+ if %i[ul ol].include? listtype
21
+ list_add(li.xpath(".//ul") - li.xpath(".//ul//ul | .//ol//ul"),
22
+ liststyles, :ul, level + 1)
23
+ list_add(li.xpath(".//ol") - li.xpath(".//ul//ol | .//ol//ol"),
24
+ liststyles, :ol, level + 1)
25
+ else
26
+ list_add(li.xpath(".//ul") - li.xpath(".//ul//ul | .//ol//ul"),
27
+ liststyles, listtype, level + 1)
28
+ list_add(li.xpath(".//ol") - li.xpath(".//ul//ol | .//ol//ol"),
29
+ liststyles, listtype, level + 1)
30
+ end
30
31
  end
31
32
 
32
33
  def self.list_add(xpath, liststyles, listtype, level)
33
- xpath.each_with_index do |list, i|
34
+ xpath.each_with_index do |l, _i|
34
35
  @listnumber += 1 if level == 1
35
- list["seen"] = true if level == 1
36
- list["id"] ||= UUIDTools::UUID.random_create
37
- (list.xpath(".//li") - list.xpath(".//ol//li | .//ul//li")).each do |li|
36
+ l["seen"] = true if level == 1
37
+ l["id"] ||= UUIDTools::UUID.random_create
38
+ (l.xpath(".//li") - l.xpath(".//ol//li | .//ul//li")).each do |li|
38
39
  style_list(li, level, liststyles[listtype], @listnumber)
39
40
  list_add1(li, liststyles, listtype, level)
40
41
  end
41
- list.xpath(".//ul[not(ancestor::li/ancestor::*/@id = '#{list['id']}')] | "\
42
- ".//ol[not(ancestor::li/ancestor::*/@id = '#{list['id']}')]").each do |li|
43
- list_add1(li.parent, liststyles, listtype, level-1)
42
+ l.xpath(".//ul[not(ancestor::li/ancestor::*/@id = '#{l['id']}')] | "\
43
+ ".//ol[not(ancestor::li/ancestor::*/@id = '#{l['id']}')]")
44
+ .each do |li|
45
+ list_add1(li.parent, liststyles, listtype, level - 1)
44
46
  end
45
47
  end
46
48
  end
47
49
 
48
50
  def self.list2para(u)
49
51
  return if u.xpath("./li").empty?
52
+
50
53
  u.xpath("./li").first["class"] ||= "MsoListParagraphCxSpFirst"
51
54
  u.xpath("./li").last["class"] ||= "MsoListParagraphCxSpLast"
52
55
  u.xpath("./li/p").each { |p| p["class"] ||= "MsoListParagraphCxSpMiddle" }
@@ -64,21 +67,25 @@ module Html2Doc
64
67
  def self.lists1(docxml, liststyles, k)
65
68
  case k
66
69
  when :ul then list_add(docxml.xpath("//ul[not(@class)]#{TOPLIST}"),
67
- liststyles, :ul, 1)
70
+ liststyles, :ul, 1)
68
71
  when :ol then list_add(docxml.xpath("//ol[not(@class)]#{TOPLIST}"),
69
72
  liststyles, :ol, 1)
70
73
  else
71
- list_add(docxml.xpath("//ol[@class = '#{k.to_s}']#{TOPLIST} | "\
72
- "//ul[@class = '#{k.to_s}']#{TOPLIST}"),
74
+ list_add(docxml.xpath("//ol[@class = '#{k}']#{TOPLIST} | "\
75
+ "//ul[@class = '#{k}']#{TOPLIST}"),
73
76
  liststyles, k, 1)
74
77
  end
75
78
  end
76
79
 
77
80
  def self.lists_unstyled(docxml, liststyles)
78
- list_add(docxml.xpath("//ul#{TOPLIST}[not(@seen)]"),
79
- liststyles, :ul, 1) if liststyles.has_key?(:ul)
80
- list_add(docxml.xpath("//ol#{TOPLIST}[not(@seen)]"),
81
- liststyles, :ul, 1) if liststyles.has_key?(:ol)
81
+ if liststyles.has_key?(:ul)
82
+ list_add(docxml.xpath("//ul#{TOPLIST}[not(@seen)]"),
83
+ liststyles, :ul, 1)
84
+ end
85
+ if liststyles.has_key?(:ol)
86
+ list_add(docxml.xpath("//ol#{TOPLIST}[not(@seen)]"),
87
+ liststyles, :ul, 1)
88
+ end
82
89
  docxml.xpath("//ul[@seen] | //ol[@seen]").each do |l|
83
90
  l.delete("seen")
84
91
  end
@@ -86,6 +93,7 @@ module Html2Doc
86
93
 
87
94
  def self.lists(docxml, liststyles)
88
95
  return if liststyles.nil?
96
+
89
97
  @listnumber = 0
90
98
  liststyles.each_key { |k| lists1(docxml, liststyles, k) }
91
99
  lists_unstyled(docxml, liststyles)
data/lib/html2doc/math.rb CHANGED
@@ -2,20 +2,27 @@ require "uuidtools"
2
2
  require "asciimath"
3
3
  require "htmlentities"
4
4
  require "nokogiri"
5
+ require "plane1converter"
5
6
 
6
7
  module Html2Doc
7
8
  @xsltemplate =
8
9
  Nokogiri::XSLT(File.read(File.join(File.dirname(__FILE__), "mml2omml.xsl"),
9
10
  encoding: "utf-8"))
10
11
 
11
- def self.asciimath_to_mathml1(x)
12
- AsciiMath::MathMLBuilder.new(:msword => true).append_expression(
13
- AsciiMath.parse(HTMLEntities.new.decode(x)).ast).to_s.
14
- gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
12
+ def self.asciimath_to_mathml1(expr)
13
+ AsciiMath::MathMLBuilder.new(msword: true).append_expression(
14
+ AsciiMath.parse(HTMLEntities.new.decode(expr)).ast,
15
+ ).to_s
16
+ .gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
17
+ rescue StandardError => e
18
+ puts "parsing: #{expr}"
19
+ puts e.message
20
+ raise e
15
21
  end
16
22
 
17
23
  def self.asciimath_to_mathml(doc, delims)
18
24
  return doc if delims.nil? || delims.size < 2
25
+
19
26
  m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
20
27
  m.each_slice(4).map.with_index do |(*a), i|
21
28
  i % 500 == 0 && m.size > 1000 && i > 0 and
@@ -25,43 +32,96 @@ module Html2Doc
25
32
  end.join
26
33
  end
27
34
 
35
+ def self.unwrap_accents(doc)
36
+ doc.xpath("//*[@accent = 'true']").each do |x|
37
+ x.elements.length > 1 or next
38
+ x.elements[1].name == "mrow" and
39
+ x.elements[1].replace(x.elements[1].children)
40
+ end
41
+ doc
42
+ end
43
+
28
44
  # random fixes to MathML input that OOXML needs to render properly
29
- def self.ooxml_cleanup(m, docnamespaces)
30
- m = mathml_preserve_space(mathml_insert_rows(m, docnamespaces), docnamespaces)
31
- m.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
32
- m
45
+ def self.ooxml_cleanup(math, docnamespaces)
46
+ math = unwrap_accents(
47
+ mathml_preserve_space(
48
+ mathml_insert_rows(math, docnamespaces), docnamespaces
49
+ ),
50
+ )
51
+ math.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
52
+ math
33
53
  end
34
54
 
35
- def self.mathml_insert_rows(m, docnamespaces)
36
- m.xpath(%w(msup msub msubsup munder mover munderover).
37
- map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
55
+ def self.mathml_insert_rows(math, docnamespaces)
56
+ math.xpath(%w(msup msub msubsup munder mover munderover)
57
+ .map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
38
58
  next unless x.next_element && x.next_element != "mrow"
59
+
39
60
  x.next_element.wrap("<mrow/>")
40
61
  end
41
- m
62
+ math
42
63
  end
43
64
 
44
- def self.mathml_preserve_space(m, docnamespaces)
45
- m.xpath(".//xmlns:mtext", docnamespaces).each do |x|
65
+ def self.mathml_preserve_space(math, docnamespaces)
66
+ math.xpath(".//xmlns:mtext", docnamespaces).each do |x|
46
67
  x.children = x.children.to_xml.gsub(/^\s/, "&#xA0;").gsub(/\s$/, "&#xA0;")
47
68
  end
48
- m
69
+ math
49
70
  end
50
71
 
51
- def self.unitalic(m)
52
- m.xpath(".//xmlns:r[xmlns:rPr/xmlns:sty[@m:val = 'p']]").each do |x|
72
+ def self.unitalic(math)
73
+ math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x|
53
74
  x.wrap("<span style='font-style:normal;'></span>")
54
75
  end
55
- m.xpath(".//xmlns:r[xmlns:rPr/xmlns:sty[@m:val = 'bi']]").each do |x|
56
- x.wrap("<span style='font-style:italic;font-weight:bold;'></span>")
76
+ math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'bi']]").each do |x|
77
+ x.wrap("<span class='nostem' style='font-weight:bold;'><em></em></span>")
57
78
  end
58
- m.xpath(".//xmlns:r[xmlns:rPr/xmlns:sty[@m:val = 'i']]").each do |x|
79
+ math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'i']]").each do |x|
59
80
  x.wrap("<span class='nostem'><em></em></span>")
60
81
  end
61
- m.xpath(".//xmlns:r[xmlns:rPr/xmlns:sty[@m:val = 'b']]").each do |x|
82
+ math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'b']]").each do |x|
62
83
  x.wrap("<span style='font-style:normal;font-weight:bold;'></span>")
63
84
  end
64
- m
85
+ math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'monospace']]").each do |x|
86
+ to_plane1(x, :monospace)
87
+ end
88
+ math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'double-struck']]").each do |x|
89
+ to_plane1(x, :doublestruck)
90
+ end
91
+ math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'script']]").each do |x|
92
+ to_plane1(x, :script)
93
+ end
94
+ math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'script']]").each do |x|
95
+ to_plane1(x, :scriptbold)
96
+ end
97
+ math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
98
+ to_plane1(x, :fraktur)
99
+ end
100
+ math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
101
+ to_plane1(x, :frakturbold)
102
+ end
103
+ math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
104
+ to_plane1(x, :sans)
105
+ end
106
+ math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
107
+ to_plane1(x, :sansbold)
108
+ end
109
+ math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'i']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
110
+ to_plane1(x, :sansitalic)
111
+ end
112
+ math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'bi']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
113
+ to_plane1(x, :sansbolditalic)
114
+ end
115
+ math
116
+ end
117
+
118
+ def self.to_plane1(xml, font)
119
+ xml.traverse do |n|
120
+ next unless n.text?
121
+
122
+ n.replace(Plane1Converter.conv(HTMLEntities.new.decode(n.text), font))
123
+ end
124
+ xml
65
125
  end
66
126
 
67
127
  def self.mathml_to_ooml(docxml)
@@ -71,22 +131,23 @@ module Html2Doc
71
131
  i % 100 == 0 && m.size > 500 && i > 0 and
72
132
  warn "Math OOXML #{i} of #{m.size}"
73
133
  element = ooxml_cleanup(x, docnamespaces)
74
- doc = Nokogiri::XML::Document::new()
134
+ doc = Nokogiri::XML::Document::new
75
135
  doc.root = element
76
- ooxml = (unitalic(esc_space(@xsltemplate.transform(doc)))).to_s.
77
- gsub(/<\?[^>]+>\s*/, "").
78
- gsub(/ xmlns(:[^=]+)?="[^"]+"/, "").
79
- gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
136
+ ooxml = unitalic(esc_space(@xsltemplate.transform(doc))).to_s
137
+ .gsub(/<\?[^>]+>\s*/, "")
138
+ .gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
139
+ .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
80
140
  ooxml = uncenter(x, ooxml)
81
141
  x.swap(ooxml)
82
142
  end
83
143
  end
84
144
 
85
- # escape space as &#x32;; we are removing any spaces generated by
145
+ # escape space as &#x32;; we are removing any spaces generated by
86
146
  # XML indentation
87
147
  def self.esc_space(xml)
88
148
  xml.traverse do |n|
89
149
  next unless n.text?
150
+
90
151
  n = n.text.gsub(/ /, "&#x32;")
91
152
  end
92
153
  xml
@@ -94,17 +155,15 @@ module Html2Doc
94
155
 
95
156
  # if oomml has no siblings, by default it is centered; override this with
96
157
  # left/right if parent is so tagged
97
- def self.uncenter(m, ooxml)
98
- if m.next == nil && m.previous == nil
99
- alignnode = m.at(".//ancestor::*[@style][local-name() = 'p' or "\
100
- "local-name() = 'div' or local-name() = 'td']/@style")
101
- return ooxml unless alignnode
102
- if alignnode.text.include? ("text-align:left")
103
- ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
104
- "m:val='left'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
105
- elsif alignnode.text.include? ("text-align:right")
158
+ def self.uncenter(math, ooxml)
159
+ alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
160
+ "local-name() = 'div' or local-name() = 'td']/@style")
161
+ return ooxml unless alignnode && (math.next == nil && math.previous == nil)
162
+
163
+ %w(left right).each do |dir|
164
+ if alignnode.text.include? ("text-align:#{dir}")
106
165
  ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
107
- "m:val='right'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
166
+ "m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
108
167
  end
109
168
  end
110
169
  ooxml