html2doc 1.0.5 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +42 -0
- data/.gitignore +2 -0
- data/.rubocop.yml +6 -4
- data/Gemfile +2 -2
- data/README.adoc +1 -3
- data/Rakefile +1 -1
- data/bin/html2doc +2 -3
- data/html2doc.gemspec +4 -3
- data/lib/html2doc/base.rb +55 -45
- data/lib/html2doc/lists.rb +33 -25
- data/lib/html2doc/math.rb +97 -38
- data/lib/html2doc/mime.rb +42 -25
- data/lib/html2doc/mml2omml.xsl +9 -1
- data/lib/html2doc/notes.rb +34 -31
- data/lib/html2doc/version.rb +1 -1
- data/spec/html2doc_spec.rb +511 -484
- metadata +26 -16
- data/.github/workflows/macos.yml +0 -38
- data/.github/workflows/ubuntu.yml +0 -56
- data/.github/workflows/windows.yml +0 -40
- data/.rubocop.ribose.yml +0 -65
- data/.rubocop.tb.yml +0 -650
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b92a72c9d0ee6005e38ab8de1f0cbc48455819955d34eb349fef1244504a9971
|
4
|
+
data.tar.gz: 52b07e9c1720bc2bf7a7196e0f650c62b81bf534535795bb2a136f2e79829416
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: af4b84183859fd83ac500c6c1ac28e76d49ec7fb48c8a4cf49bc3543909b913e78c8293918b34e21e9e68f2dc78a500824337677ae1aea54bcf7cbecebd9363d
|
7
|
+
data.tar.gz: 9e0bd48390458ec57dcc9650643a9332a549148a1297b37962e37399453d95132e39c69cb94ea13fcc70f9d6bc273fccf742da61e6f4a3025710182df1d65144
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# Auto-generated by Cimas: Do not edit it manually!
|
2
|
+
# See https://github.com/metanorma/cimas
|
3
|
+
name: rake
|
4
|
+
|
5
|
+
on:
|
6
|
+
push:
|
7
|
+
branches: [ master, main ]
|
8
|
+
tags: [ v* ]
|
9
|
+
pull_request:
|
10
|
+
|
11
|
+
jobs:
|
12
|
+
rake:
|
13
|
+
name: Test on Ruby ${{ matrix.ruby }} ${{ matrix.os }}
|
14
|
+
runs-on: ${{ matrix.os }}
|
15
|
+
continue-on-error: ${{ matrix.experimental }}
|
16
|
+
strategy:
|
17
|
+
fail-fast: false
|
18
|
+
matrix:
|
19
|
+
ruby: [ '2.7', '2.6', '2.5', '2.4' ]
|
20
|
+
os: [ ubuntu-latest, windows-latest, macos-latest ]
|
21
|
+
experimental: [ false ]
|
22
|
+
include:
|
23
|
+
- ruby: '3.0'
|
24
|
+
os: 'ubuntu-latest'
|
25
|
+
experimental: true
|
26
|
+
- ruby: '3.0'
|
27
|
+
os: 'windows-latest'
|
28
|
+
experimental: true
|
29
|
+
- ruby: '3.0'
|
30
|
+
os: 'macos-latest'
|
31
|
+
experimental: true
|
32
|
+
steps:
|
33
|
+
- uses: actions/checkout@v2
|
34
|
+
with:
|
35
|
+
submodules: true
|
36
|
+
|
37
|
+
- uses: ruby/setup-ruby@v1
|
38
|
+
with:
|
39
|
+
ruby-version: ${{ matrix.ruby }}
|
40
|
+
bundler-cache: true
|
41
|
+
|
42
|
+
- run: bundle exec rake
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
# This project follows the Ribose OSS style guide.
|
2
2
|
# https://github.com/riboseinc/oss-guides
|
3
3
|
# All project-specific additions and overrides should be specified in this file.
|
4
|
-
|
5
4
|
inherit_from:
|
6
5
|
- https://raw.githubusercontent.com/riboseinc/oss-guides/master/ci/rubocop.yml
|
6
|
+
|
7
|
+
# local repo-specific modifications
|
8
|
+
|
7
9
|
AllCops:
|
8
|
-
|
9
|
-
|
10
|
-
|
10
|
+
DisplayCopNames: false
|
11
|
+
StyleGuideCopsOnly: false
|
12
|
+
TargetRubyVersion: 2.4
|
data/Gemfile
CHANGED
@@ -10,6 +10,6 @@ end
|
|
10
10
|
|
11
11
|
gemspec
|
12
12
|
|
13
|
-
if File.exist?
|
14
|
-
eval File.read(
|
13
|
+
if File.exist? "Gemfile.devel"
|
14
|
+
eval File.read("Gemfile.devel"), nil, "Gemfile.devel" # rubocop:disable Security/Eval
|
15
15
|
end
|
data/README.adoc
CHANGED
@@ -3,9 +3,7 @@
|
|
3
3
|
https://github.com/metanorma/html2doc/workflows/main/badge.svg
|
4
4
|
|
5
5
|
image:https://img.shields.io/gem/v/html2doc.svg["Gem Version", link="https://rubygems.org/gems/html2doc"]
|
6
|
-
image:https://github.com/metanorma/html2doc/workflows/
|
7
|
-
image:https://github.com/metanorma/html2doc/workflows/macos/badge.svg["OSX Build Status", link="https://github.com/metanorma/html2doc/actions?query=workflow%3Amacos"]
|
8
|
-
image:https://github.com/metanorma/html2doc/workflows/windows/badge.svg["Windows Build Status", link="https://github.com/metanorma/html2doc/actions?query=workflow%3Awindows"]
|
6
|
+
image:https://github.com/metanorma/html2doc/workflows/rake/badge.svg["Build Status", link="https://github.com/metanorma/html2doc/actions?workflow=rake"]
|
9
7
|
image:https://codeclimate.com/github/metanorma/html2doc/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/html2doc"]
|
10
8
|
image:https://img.shields.io/github/issues-pr-raw/metanorma/html2doc.svg["Pull Requests", link="https://github.com/metanorma/html2doc/pulls"]
|
11
9
|
image:https://img.shields.io/github/commits-since/metanorma/html2doc/latest.svg["Commits since latest",link="https://github.com/metanorma/html2doc/releases"]
|
data/Rakefile
CHANGED
data/bin/html2doc
CHANGED
@@ -21,9 +21,8 @@ if ARGV.length < 1
|
|
21
21
|
end
|
22
22
|
|
23
23
|
Html2Doc.process(
|
24
|
-
File.read(ARGV[0], encoding: "utf-8"),
|
24
|
+
File.read(ARGV[0], encoding: "utf-8"),
|
25
25
|
filename: ARGV[0].gsub(/\.html?$/, ""),
|
26
26
|
stylesheet: options[:stylesheet],
|
27
|
-
header: options[:header]
|
27
|
+
header: options[:header]
|
28
28
|
)
|
29
|
-
|
data/html2doc.gemspec
CHANGED
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
|
|
28
28
|
spec.add_dependency "htmlentities", "~> 4.3.4"
|
29
29
|
spec.add_dependency "image_size"
|
30
30
|
spec.add_dependency "mime-types"
|
31
|
-
spec.add_dependency "nokogiri", "
|
31
|
+
spec.add_dependency "nokogiri", "~> 1.10.4"
|
32
32
|
spec.add_dependency "thread_safe"
|
33
33
|
spec.add_dependency "uuidtools"
|
34
|
-
spec.add_dependency "asciimath", "~> 2.0.
|
34
|
+
spec.add_dependency "asciimath", "~> 2.0.2"
|
35
|
+
spec.add_dependency "plane1converter", "~> 0.0.1"
|
35
36
|
|
36
37
|
spec.add_development_dependency "byebug", "~> 9.1"
|
37
38
|
spec.add_development_dependency "equivalent-xml", "~> 0.6"
|
@@ -39,7 +40,7 @@ Gem::Specification.new do |spec|
|
|
39
40
|
spec.add_development_dependency "guard-rspec", "~> 4.7"
|
40
41
|
spec.add_development_dependency "rake", "~> 12.0"
|
41
42
|
spec.add_development_dependency "rspec", "~> 3.6"
|
42
|
-
spec.add_development_dependency "rubocop", "
|
43
|
+
spec.add_development_dependency "rubocop", "~> 1.5.2"
|
43
44
|
spec.add_development_dependency "simplecov", "~> 0.15"
|
44
45
|
spec.add_development_dependency "timecop", "~> 0.9"
|
45
46
|
spec.add_development_dependency "rspec-match_fuzzy", "~> 0.1.3"
|
data/lib/html2doc/base.rb
CHANGED
@@ -2,8 +2,6 @@ require "uuidtools"
|
|
2
2
|
require "asciimath"
|
3
3
|
require "htmlentities"
|
4
4
|
require "nokogiri"
|
5
|
-
#require "xml/xslt"
|
6
|
-
require "pp"
|
7
5
|
require "fileutils"
|
8
6
|
|
9
7
|
module Html2Doc
|
@@ -19,16 +17,26 @@ module Html2Doc
|
|
19
17
|
|
20
18
|
def self.process_header(headerfile, hash)
|
21
19
|
return if headerfile.nil?
|
20
|
+
|
22
21
|
doc = File.read(headerfile, encoding: "utf-8")
|
23
|
-
doc = header_image_cleanup(doc, hash[:dir1], hash[:filename],
|
22
|
+
doc = header_image_cleanup(doc, hash[:dir1], hash[:filename],
|
23
|
+
File.dirname(hash[:filename]))
|
24
24
|
File.open("#{hash[:dir1]}/header.html", "w:UTF-8") { |f| f.write(doc) }
|
25
25
|
end
|
26
26
|
|
27
|
+
def self.clear_dir(dir)
|
28
|
+
Dir.foreach(dir) do |f|
|
29
|
+
fn = File.join(dir, f)
|
30
|
+
File.delete(fn) if f != "." && f != ".."
|
31
|
+
end
|
32
|
+
dir
|
33
|
+
end
|
34
|
+
|
27
35
|
def self.create_dir(filename, dir)
|
28
|
-
|
36
|
+
dir and return clear_dir(dir)
|
29
37
|
dir = "#{filename}_files"
|
30
38
|
Dir.mkdir(dir) unless File.exists?(dir)
|
31
|
-
dir
|
39
|
+
clear_dir(dir)
|
32
40
|
end
|
33
41
|
|
34
42
|
def self.process_html(result, hash)
|
@@ -64,7 +72,7 @@ module Html2Doc
|
|
64
72
|
|
65
73
|
def self.to_xhtml(xml)
|
66
74
|
xml.gsub!(/<\?xml[^>]*>/, "")
|
67
|
-
unless /<!DOCTYPE /.match xml
|
75
|
+
unless /<!DOCTYPE /.match? xml
|
68
76
|
xml = '<!DOCTYPE html SYSTEM
|
69
77
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
|
70
78
|
end
|
@@ -76,34 +84,34 @@ module Html2Doc
|
|
76
84
|
DOCTYPE
|
77
85
|
|
78
86
|
def self.from_xhtml(xml)
|
79
|
-
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
|
80
|
-
sub(DOCTYPE, "")
|
81
|
-
gsub(%{ />}, "/>")
|
87
|
+
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
|
88
|
+
.sub(DOCTYPE, "")
|
89
|
+
.gsub(%{ />}, "/>")
|
82
90
|
end
|
83
91
|
|
84
|
-
def self.msword_fix(
|
92
|
+
def self.msword_fix(doc)
|
85
93
|
# brain damage in MSWord parser
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
94
|
+
doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
|
95
|
+
'<span style="mso-special-character:footnote"></span>')
|
96
|
+
doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
|
97
|
+
'<div style="mso-element:footnote-list"/>')
|
98
|
+
doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
|
99
|
+
doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
|
100
|
+
doc.gsub!(%r{<meta http-equiv="Content-Type"},
|
101
|
+
"<meta http-equiv=Content-Type")
|
102
|
+
doc.gsub!(%r{></m:jc>}, "/>")
|
103
|
+
doc.gsub!(%r{></v:stroke>}, "/>")
|
104
|
+
doc.gsub!(%r{></v:f>}, "/>")
|
105
|
+
doc.gsub!(%r{></v:path>}, "/>")
|
106
|
+
doc.gsub!(%r{></o:lock>}, "/>")
|
107
|
+
doc.gsub!(%r{></v:imagedata>}, "/>")
|
108
|
+
doc.gsub!(%r{></w:wrap>}, "/>")
|
109
|
+
doc.gsub!(%r{&tab;|&tab;},
|
110
|
+
'<span style="mso-tab-count:1">  </span>')
|
111
|
+
doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
|
103
112
|
a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
|
104
113
|
a
|
105
114
|
end.join
|
106
|
-
r
|
107
115
|
end
|
108
116
|
|
109
117
|
PRINT_VIEW = <<~XML.freeze
|
@@ -122,26 +130,27 @@ module Html2Doc
|
|
122
130
|
def self.define_head1(docxml, dir)
|
123
131
|
docxml.xpath("//*[local-name() = 'head']").each do |h|
|
124
132
|
h.children.first.add_previous_sibling <<~XML
|
125
|
-
|
126
|
-
|
133
|
+
#{PRINT_VIEW}
|
134
|
+
<link rel="File-List" href="cid:filelist.xml"/>
|
127
135
|
XML
|
128
136
|
end
|
129
137
|
end
|
130
138
|
|
131
|
-
def self.filename_substitute(
|
132
|
-
if header_filename.nil?
|
133
|
-
|
134
|
-
|
135
|
-
|
139
|
+
def self.filename_substitute(head, header_filename)
|
140
|
+
return if header_filename.nil?
|
141
|
+
|
142
|
+
head.xpath(".//*[local-name() = 'style']").each do |s|
|
143
|
+
s1 = s.to_xml.gsub(/url\("[^"]+"\)/) do |m|
|
144
|
+
/FILENAME/.match?(m) ? "url(cid:header.html)" : m
|
145
|
+
end
|
146
|
+
s.replace(s1)
|
136
147
|
end
|
137
|
-
stylesheet
|
138
148
|
end
|
139
149
|
|
140
150
|
def self.stylesheet(filename, header_filename, fn)
|
141
|
-
(fn.nil? || fn.empty?)
|
151
|
+
(fn.nil? || fn.empty?) and
|
142
152
|
fn = File.join(File.dirname(__FILE__), "wordstyle.css")
|
143
153
|
stylesheet = File.read(fn, encoding: "UTF-8")
|
144
|
-
stylesheet = filename_substitute(stylesheet, header_filename, filename)
|
145
154
|
xml = Nokogiri::XML("<style/>")
|
146
155
|
xml.children.first << Nokogiri::XML::Comment.new(xml, "\n#{stylesheet}\n")
|
147
156
|
xml.root.to_s
|
@@ -152,6 +161,7 @@ module Html2Doc
|
|
152
161
|
head = docxml.at("//*[local-name() = 'head']")
|
153
162
|
css = stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])
|
154
163
|
add_stylesheet(head, title, css)
|
164
|
+
filename_substitute(head, hash[:header_file])
|
155
165
|
define_head1(docxml, hash[:dir1])
|
156
166
|
rootnamespace(docxml.root)
|
157
167
|
end
|
@@ -180,13 +190,13 @@ module Html2Doc
|
|
180
190
|
end
|
181
191
|
|
182
192
|
def self.bookmarks(docxml)
|
183
|
-
docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
|
184
|
-
|
185
|
-
next if
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
193
|
+
docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
|
194
|
+
.each do |x|
|
195
|
+
next if x["id"].empty? ||
|
196
|
+
%w(shapetype v:shapetype shape v:shape).include?(x.name)
|
197
|
+
|
198
|
+
if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
|
199
|
+
else x.children.first.previous = "<a name='#{x['id']}'></a>"
|
190
200
|
end
|
191
201
|
x.delete("id")
|
192
202
|
end
|
data/lib/html2doc/lists.rb
CHANGED
@@ -7,6 +7,7 @@ require "uuidtools"
|
|
7
7
|
module Html2Doc
|
8
8
|
def self.style_list(li, level, liststyle, listnumber)
|
9
9
|
return unless liststyle
|
10
|
+
|
10
11
|
if li["style"]
|
11
12
|
li["style"] += ";"
|
12
13
|
else
|
@@ -16,37 +17,39 @@ module Html2Doc
|
|
16
17
|
end
|
17
18
|
|
18
19
|
def self.list_add1(li, liststyles, listtype, level)
|
19
|
-
if [
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
20
|
+
if %i[ul ol].include? listtype
|
21
|
+
list_add(li.xpath(".//ul") - li.xpath(".//ul//ul | .//ol//ul"),
|
22
|
+
liststyles, :ul, level + 1)
|
23
|
+
list_add(li.xpath(".//ol") - li.xpath(".//ul//ol | .//ol//ol"),
|
24
|
+
liststyles, :ol, level + 1)
|
25
|
+
else
|
26
|
+
list_add(li.xpath(".//ul") - li.xpath(".//ul//ul | .//ol//ul"),
|
27
|
+
liststyles, listtype, level + 1)
|
28
|
+
list_add(li.xpath(".//ol") - li.xpath(".//ul//ol | .//ol//ol"),
|
29
|
+
liststyles, listtype, level + 1)
|
30
|
+
end
|
30
31
|
end
|
31
32
|
|
32
33
|
def self.list_add(xpath, liststyles, listtype, level)
|
33
|
-
xpath.each_with_index do |
|
34
|
+
xpath.each_with_index do |l, _i|
|
34
35
|
@listnumber += 1 if level == 1
|
35
|
-
|
36
|
-
|
37
|
-
(
|
36
|
+
l["seen"] = true if level == 1
|
37
|
+
l["id"] ||= UUIDTools::UUID.random_create
|
38
|
+
(l.xpath(".//li") - l.xpath(".//ol//li | .//ul//li")).each do |li|
|
38
39
|
style_list(li, level, liststyles[listtype], @listnumber)
|
39
40
|
list_add1(li, liststyles, listtype, level)
|
40
41
|
end
|
41
|
-
|
42
|
-
|
43
|
-
|
42
|
+
l.xpath(".//ul[not(ancestor::li/ancestor::*/@id = '#{l['id']}')] | "\
|
43
|
+
".//ol[not(ancestor::li/ancestor::*/@id = '#{l['id']}')]")
|
44
|
+
.each do |li|
|
45
|
+
list_add1(li.parent, liststyles, listtype, level - 1)
|
44
46
|
end
|
45
47
|
end
|
46
48
|
end
|
47
49
|
|
48
50
|
def self.list2para(u)
|
49
51
|
return if u.xpath("./li").empty?
|
52
|
+
|
50
53
|
u.xpath("./li").first["class"] ||= "MsoListParagraphCxSpFirst"
|
51
54
|
u.xpath("./li").last["class"] ||= "MsoListParagraphCxSpLast"
|
52
55
|
u.xpath("./li/p").each { |p| p["class"] ||= "MsoListParagraphCxSpMiddle" }
|
@@ -64,21 +67,25 @@ module Html2Doc
|
|
64
67
|
def self.lists1(docxml, liststyles, k)
|
65
68
|
case k
|
66
69
|
when :ul then list_add(docxml.xpath("//ul[not(@class)]#{TOPLIST}"),
|
67
|
-
|
70
|
+
liststyles, :ul, 1)
|
68
71
|
when :ol then list_add(docxml.xpath("//ol[not(@class)]#{TOPLIST}"),
|
69
72
|
liststyles, :ol, 1)
|
70
73
|
else
|
71
|
-
list_add(docxml.xpath("//ol[@class = '#{k
|
72
|
-
"//ul[@class = '#{k
|
74
|
+
list_add(docxml.xpath("//ol[@class = '#{k}']#{TOPLIST} | "\
|
75
|
+
"//ul[@class = '#{k}']#{TOPLIST}"),
|
73
76
|
liststyles, k, 1)
|
74
77
|
end
|
75
78
|
end
|
76
79
|
|
77
80
|
def self.lists_unstyled(docxml, liststyles)
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
81
|
+
if liststyles.has_key?(:ul)
|
82
|
+
list_add(docxml.xpath("//ul#{TOPLIST}[not(@seen)]"),
|
83
|
+
liststyles, :ul, 1)
|
84
|
+
end
|
85
|
+
if liststyles.has_key?(:ol)
|
86
|
+
list_add(docxml.xpath("//ol#{TOPLIST}[not(@seen)]"),
|
87
|
+
liststyles, :ul, 1)
|
88
|
+
end
|
82
89
|
docxml.xpath("//ul[@seen] | //ol[@seen]").each do |l|
|
83
90
|
l.delete("seen")
|
84
91
|
end
|
@@ -86,6 +93,7 @@ module Html2Doc
|
|
86
93
|
|
87
94
|
def self.lists(docxml, liststyles)
|
88
95
|
return if liststyles.nil?
|
96
|
+
|
89
97
|
@listnumber = 0
|
90
98
|
liststyles.each_key { |k| lists1(docxml, liststyles, k) }
|
91
99
|
lists_unstyled(docxml, liststyles)
|
data/lib/html2doc/math.rb
CHANGED
@@ -2,20 +2,27 @@ require "uuidtools"
|
|
2
2
|
require "asciimath"
|
3
3
|
require "htmlentities"
|
4
4
|
require "nokogiri"
|
5
|
+
require "plane1converter"
|
5
6
|
|
6
7
|
module Html2Doc
|
7
8
|
@xsltemplate =
|
8
9
|
Nokogiri::XSLT(File.read(File.join(File.dirname(__FILE__), "mml2omml.xsl"),
|
9
10
|
encoding: "utf-8"))
|
10
11
|
|
11
|
-
def self.asciimath_to_mathml1(
|
12
|
-
AsciiMath::MathMLBuilder.new(:
|
13
|
-
AsciiMath.parse(HTMLEntities.new.decode(
|
14
|
-
|
12
|
+
def self.asciimath_to_mathml1(expr)
|
13
|
+
AsciiMath::MathMLBuilder.new(msword: true).append_expression(
|
14
|
+
AsciiMath.parse(HTMLEntities.new.decode(expr)).ast,
|
15
|
+
).to_s
|
16
|
+
.gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
|
17
|
+
rescue StandardError => e
|
18
|
+
puts "parsing: #{expr}"
|
19
|
+
puts e.message
|
20
|
+
raise e
|
15
21
|
end
|
16
22
|
|
17
23
|
def self.asciimath_to_mathml(doc, delims)
|
18
24
|
return doc if delims.nil? || delims.size < 2
|
25
|
+
|
19
26
|
m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
|
20
27
|
m.each_slice(4).map.with_index do |(*a), i|
|
21
28
|
i % 500 == 0 && m.size > 1000 && i > 0 and
|
@@ -25,43 +32,96 @@ module Html2Doc
|
|
25
32
|
end.join
|
26
33
|
end
|
27
34
|
|
35
|
+
def self.unwrap_accents(doc)
|
36
|
+
doc.xpath("//*[@accent = 'true']").each do |x|
|
37
|
+
x.elements.length > 1 or next
|
38
|
+
x.elements[1].name == "mrow" and
|
39
|
+
x.elements[1].replace(x.elements[1].children)
|
40
|
+
end
|
41
|
+
doc
|
42
|
+
end
|
43
|
+
|
28
44
|
# random fixes to MathML input that OOXML needs to render properly
|
29
|
-
def self.ooxml_cleanup(
|
30
|
-
|
31
|
-
|
32
|
-
|
45
|
+
def self.ooxml_cleanup(math, docnamespaces)
|
46
|
+
math = unwrap_accents(
|
47
|
+
mathml_preserve_space(
|
48
|
+
mathml_insert_rows(math, docnamespaces), docnamespaces
|
49
|
+
),
|
50
|
+
)
|
51
|
+
math.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
|
52
|
+
math
|
33
53
|
end
|
34
54
|
|
35
|
-
def self.mathml_insert_rows(
|
36
|
-
|
37
|
-
map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
|
55
|
+
def self.mathml_insert_rows(math, docnamespaces)
|
56
|
+
math.xpath(%w(msup msub msubsup munder mover munderover)
|
57
|
+
.map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
|
38
58
|
next unless x.next_element && x.next_element != "mrow"
|
59
|
+
|
39
60
|
x.next_element.wrap("<mrow/>")
|
40
61
|
end
|
41
|
-
|
62
|
+
math
|
42
63
|
end
|
43
64
|
|
44
|
-
def self.mathml_preserve_space(
|
45
|
-
|
65
|
+
def self.mathml_preserve_space(math, docnamespaces)
|
66
|
+
math.xpath(".//xmlns:mtext", docnamespaces).each do |x|
|
46
67
|
x.children = x.children.to_xml.gsub(/^\s/, " ").gsub(/\s$/, " ")
|
47
68
|
end
|
48
|
-
|
69
|
+
math
|
49
70
|
end
|
50
71
|
|
51
|
-
def self.unitalic(
|
52
|
-
|
72
|
+
def self.unitalic(math)
|
73
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x|
|
53
74
|
x.wrap("<span style='font-style:normal;'></span>")
|
54
75
|
end
|
55
|
-
|
56
|
-
x.wrap("<span style='font-
|
76
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'bi']]").each do |x|
|
77
|
+
x.wrap("<span class='nostem' style='font-weight:bold;'><em></em></span>")
|
57
78
|
end
|
58
|
-
|
79
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'i']]").each do |x|
|
59
80
|
x.wrap("<span class='nostem'><em></em></span>")
|
60
81
|
end
|
61
|
-
|
82
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'b']]").each do |x|
|
62
83
|
x.wrap("<span style='font-style:normal;font-weight:bold;'></span>")
|
63
84
|
end
|
64
|
-
m
|
85
|
+
math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'monospace']]").each do |x|
|
86
|
+
to_plane1(x, :monospace)
|
87
|
+
end
|
88
|
+
math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'double-struck']]").each do |x|
|
89
|
+
to_plane1(x, :doublestruck)
|
90
|
+
end
|
91
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'script']]").each do |x|
|
92
|
+
to_plane1(x, :script)
|
93
|
+
end
|
94
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'script']]").each do |x|
|
95
|
+
to_plane1(x, :scriptbold)
|
96
|
+
end
|
97
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
|
98
|
+
to_plane1(x, :fraktur)
|
99
|
+
end
|
100
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
|
101
|
+
to_plane1(x, :frakturbold)
|
102
|
+
end
|
103
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
104
|
+
to_plane1(x, :sans)
|
105
|
+
end
|
106
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
107
|
+
to_plane1(x, :sansbold)
|
108
|
+
end
|
109
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'i']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
110
|
+
to_plane1(x, :sansitalic)
|
111
|
+
end
|
112
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'bi']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
113
|
+
to_plane1(x, :sansbolditalic)
|
114
|
+
end
|
115
|
+
math
|
116
|
+
end
|
117
|
+
|
118
|
+
def self.to_plane1(xml, font)
|
119
|
+
xml.traverse do |n|
|
120
|
+
next unless n.text?
|
121
|
+
|
122
|
+
n.replace(Plane1Converter.conv(HTMLEntities.new.decode(n.text), font))
|
123
|
+
end
|
124
|
+
xml
|
65
125
|
end
|
66
126
|
|
67
127
|
def self.mathml_to_ooml(docxml)
|
@@ -71,22 +131,23 @@ module Html2Doc
|
|
71
131
|
i % 100 == 0 && m.size > 500 && i > 0 and
|
72
132
|
warn "Math OOXML #{i} of #{m.size}"
|
73
133
|
element = ooxml_cleanup(x, docnamespaces)
|
74
|
-
doc = Nokogiri::XML::Document::new
|
134
|
+
doc = Nokogiri::XML::Document::new
|
75
135
|
doc.root = element
|
76
|
-
ooxml =
|
77
|
-
gsub(/<\?[^>]+>\s*/, "")
|
78
|
-
gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
|
79
|
-
gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
|
136
|
+
ooxml = unitalic(esc_space(@xsltemplate.transform(doc))).to_s
|
137
|
+
.gsub(/<\?[^>]+>\s*/, "")
|
138
|
+
.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
|
139
|
+
.gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
|
80
140
|
ooxml = uncenter(x, ooxml)
|
81
141
|
x.swap(ooxml)
|
82
142
|
end
|
83
143
|
end
|
84
144
|
|
85
|
-
# escape space as 2; we are removing any spaces generated by
|
145
|
+
# escape space as 2; we are removing any spaces generated by
|
86
146
|
# XML indentation
|
87
147
|
def self.esc_space(xml)
|
88
148
|
xml.traverse do |n|
|
89
149
|
next unless n.text?
|
150
|
+
|
90
151
|
n = n.text.gsub(/ /, "2")
|
91
152
|
end
|
92
153
|
xml
|
@@ -94,17 +155,15 @@ module Html2Doc
|
|
94
155
|
|
95
156
|
# if oomml has no siblings, by default it is centered; override this with
|
96
157
|
# left/right if parent is so tagged
|
97
|
-
def self.uncenter(
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
"m:val='left'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
|
105
|
-
elsif alignnode.text.include? ("text-align:right")
|
158
|
+
def self.uncenter(math, ooxml)
|
159
|
+
alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
|
160
|
+
"local-name() = 'div' or local-name() = 'td']/@style")
|
161
|
+
return ooxml unless alignnode && (math.next == nil && math.previous == nil)
|
162
|
+
|
163
|
+
%w(left right).each do |dir|
|
164
|
+
if alignnode.text.include? ("text-align:#{dir}")
|
106
165
|
ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
|
107
|
-
"m:val='
|
166
|
+
"m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
|
108
167
|
end
|
109
168
|
end
|
110
169
|
ooxml
|