html2doc 1.2.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +3 -20
- data/html2doc.gemspec +1 -1
- data/lib/html2doc/base.rb +19 -10
- data/lib/html2doc/math.rb +20 -3
- data/lib/html2doc/mime.rb +5 -4
- data/lib/html2doc/version.rb +1 -1
- data/spec/html2doc_spec.rb +20 -7
- metadata +8 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 56d8c42bd609845f35a5a994fed43d12ebc9fb0d8d303fd60f9a064f4da26a7b
|
4
|
+
data.tar.gz: e9310883dbc5991640e66a1c085d6bcb2ca87155449326b7076489e78d64d187
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d7076b196634dc81a3942a59155c7c80da21b9eb68721dab437170c54876f970b80448fa31f520648145eca9ace1fea0c7751be04021f9c1f95fe0bf3fa64ce
|
7
|
+
data.tar.gz: 532b022bda9cc4fb88eafeb467c7d6d26ba8dc5ea21f5553ba251e8b92469de6e906f5947a3fdb3bbee241cbaea8805f3477978bbe78af359a6bb7140399a971
|
data/.github/workflows/rake.yml
CHANGED
@@ -10,23 +10,6 @@ on:
|
|
10
10
|
|
11
11
|
jobs:
|
12
12
|
rake:
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
strategy:
|
17
|
-
fail-fast: false
|
18
|
-
matrix:
|
19
|
-
ruby: [ '3.0', '2.7', '2.6', '2.5' ]
|
20
|
-
os: [ ubuntu-latest, windows-latest, macos-latest ]
|
21
|
-
experimental: [ false ]
|
22
|
-
steps:
|
23
|
-
- uses: actions/checkout@v2
|
24
|
-
with:
|
25
|
-
submodules: true
|
26
|
-
|
27
|
-
- uses: ruby/setup-ruby@v1
|
28
|
-
with:
|
29
|
-
ruby-version: ${{ matrix.ruby }}
|
30
|
-
bundler-cache: true
|
31
|
-
|
32
|
-
- run: bundle exec rake
|
13
|
+
uses: metanorma/metanorma-build-scripts/.github/workflows/generic-rake.yml@main
|
14
|
+
secrets:
|
15
|
+
pat_token: ${{ secrets.METANORMA_CI_PAT_TOKEN }}
|
data/html2doc.gemspec
CHANGED
@@ -33,7 +33,7 @@ Gem::Specification.new do |spec|
|
|
33
33
|
spec.add_dependency "thread_safe"
|
34
34
|
spec.add_dependency "uuidtools"
|
35
35
|
|
36
|
-
spec.add_development_dependency "
|
36
|
+
spec.add_development_dependency "debug"
|
37
37
|
spec.add_development_dependency "equivalent-xml", "~> 0.6"
|
38
38
|
spec.add_development_dependency "guard", "~> 2.14"
|
39
39
|
spec.add_development_dependency "guard-rspec", "~> 4.7"
|
data/lib/html2doc/base.rb
CHANGED
@@ -76,6 +76,8 @@ module Html2Doc
|
|
76
76
|
xml = '<!DOCTYPE html SYSTEM
|
77
77
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
|
78
78
|
end
|
79
|
+
xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
|
80
|
+
.gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
|
79
81
|
Nokogiri::XML.parse(xml)
|
80
82
|
end
|
81
83
|
|
@@ -85,12 +87,16 @@ module Html2Doc
|
|
85
87
|
|
86
88
|
def self.from_xhtml(xml)
|
87
89
|
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
|
88
|
-
.sub(DOCTYPE, "")
|
89
|
-
.gsub(
|
90
|
+
.sub(DOCTYPE, "").gsub(%{ />}, "/>")
|
91
|
+
.gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
|
92
|
+
.gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
|
93
|
+
.gsub("\n--&gt;\n", "\n-->\n")
|
90
94
|
end
|
91
95
|
|
92
96
|
def self.msword_fix(doc)
|
93
97
|
# brain damage in MSWord parser
|
98
|
+
doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
|
99
|
+
"<w:DoNotOptimizeForBrowser/>")
|
94
100
|
doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
|
95
101
|
'<span style="mso-special-character:footnote"></span>')
|
96
102
|
doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
|
@@ -116,7 +122,7 @@ module Html2Doc
|
|
116
122
|
end
|
117
123
|
|
118
124
|
PRINT_VIEW = <<~XML.freeze
|
119
|
-
|
125
|
+
|
120
126
|
<xml>
|
121
127
|
<w:WordDocument>
|
122
128
|
<w:View>Print</w:View>
|
@@ -124,8 +130,7 @@ module Html2Doc
|
|
124
130
|
<w:DoNotOptimizeForBrowser/>
|
125
131
|
</w:WordDocument>
|
126
132
|
</xml>
|
127
|
-
|
128
|
-
<meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
|
133
|
+
<meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
|
129
134
|
XML
|
130
135
|
|
131
136
|
def self.define_head1(docxml, _dir)
|
@@ -148,12 +153,16 @@ module Html2Doc
|
|
148
153
|
end
|
149
154
|
end
|
150
155
|
|
151
|
-
def self.stylesheet(_filename, _header_filename,
|
152
|
-
(
|
153
|
-
|
154
|
-
stylesheet = File.read(
|
156
|
+
def self.stylesheet(_filename, _header_filename, cssname)
|
157
|
+
(cssname.nil? || cssname.empty?) and
|
158
|
+
cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
|
159
|
+
stylesheet = File.read(cssname, encoding: "UTF-8")
|
155
160
|
xml = Nokogiri::XML("<style/>")
|
156
|
-
|
161
|
+
#s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
|
162
|
+
#xml.children.first << Nokogiri::XML::Comment.new(xml, s)
|
163
|
+
xml.children.first << Nokogiri::XML::CDATA
|
164
|
+
.new(xml, "\n<!--\n#{stylesheet}\n-->\n")
|
165
|
+
|
157
166
|
xml.root.to_s
|
158
167
|
end
|
159
168
|
|
data/lib/html2doc/math.rb
CHANGED
@@ -140,7 +140,7 @@ module Html2Doc
|
|
140
140
|
end
|
141
141
|
end
|
142
142
|
|
143
|
-
# We need span and em not to be namespaced. Word can't deal with explicit
|
143
|
+
# We need span and em not to be namespaced. Word can't deal with explicit
|
144
144
|
# namespaces.
|
145
145
|
# We will end up stripping them out again under Nokogiri 1.11, which correctly
|
146
146
|
# insists on inheriting namespace from parent.
|
@@ -154,11 +154,28 @@ module Html2Doc
|
|
154
154
|
def self.mathml_to_ooml1(xml, docnamespaces)
|
155
155
|
doc = Nokogiri::XML::Document::new
|
156
156
|
doc.root = ooxml_cleanup(xml, docnamespaces)
|
157
|
-
|
157
|
+
ooxml = ooml_clean(unitalic(esc_space(accent_tr(@xsltemplate.transform(doc)))))
|
158
158
|
ooxml = uncenter(xml, ooxml)
|
159
159
|
xml.swap(ooxml)
|
160
160
|
end
|
161
161
|
|
162
|
+
def self.accent_tr(xml)
|
163
|
+
xml.xpath(".//*[local-name()='accPr']/*[local-name()='chr']").each do |x|
|
164
|
+
x["m:val"] &&= accent_tr1(x["m:val"])
|
165
|
+
x["val"] &&= accent_tr1(x["val"])
|
166
|
+
end
|
167
|
+
xml
|
168
|
+
end
|
169
|
+
|
170
|
+
def self.accent_tr1(accent)
|
171
|
+
case accent
|
172
|
+
when "\u2192" then "\u20D7"
|
173
|
+
when "^" then "\u0302"
|
174
|
+
when "~" then "\u0303"
|
175
|
+
else accent
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
162
179
|
# escape space as 2; we are removing any spaces generated by
|
163
180
|
# XML indentation
|
164
181
|
def self.esc_space(xml)
|
@@ -180,7 +197,7 @@ module Html2Doc
|
|
180
197
|
%w(left right).each do |dir|
|
181
198
|
if alignnode.text.include? ("text-align:#{dir}")
|
182
199
|
ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
|
183
|
-
|
200
|
+
"m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
|
184
201
|
end
|
185
202
|
end
|
186
203
|
ooxml
|
data/lib/html2doc/mime.rb
CHANGED
@@ -107,12 +107,13 @@ module Html2Doc
|
|
107
107
|
# only processes locally stored images
|
108
108
|
def self.image_cleanup(docxml, dir, localdir)
|
109
109
|
docxml.traverse do |i|
|
110
|
+
src = i["src"]
|
110
111
|
next unless i.element? && %w(img v:imagedata).include?(i.name)
|
111
|
-
next if /^http/.match?
|
112
|
-
next if %r{^data:(image|application)/[^;]+;base64}.match?
|
112
|
+
next if src.nil? || src.empty? || /^http/.match?(src)
|
113
|
+
next if %r{^data:(image|application)/[^;]+;base64}.match? src
|
113
114
|
|
114
|
-
local_filename = localname(
|
115
|
-
new_filename = "#{mkuuid}#{File.extname(
|
115
|
+
local_filename = localname(src, localdir)
|
116
|
+
new_filename = "#{mkuuid}#{File.extname(src)}"
|
116
117
|
FileUtils.cp local_filename, File.join(dir, new_filename)
|
117
118
|
i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
|
118
119
|
i["src"] = File.join(File.basename(dir), new_filename)
|
data/lib/html2doc/version.rb
CHANGED
data/spec/html2doc_spec.rb
CHANGED
@@ -41,7 +41,7 @@ WORD_HDR = <<~HDR.freeze
|
|
41
41
|
Content-Type: text/html; charset="utf-8"
|
42
42
|
|
43
43
|
<?xml version="1.0"?>
|
44
|
-
<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head
|
44
|
+
<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head>
|
45
45
|
<xml>
|
46
46
|
<w:WordDocument>
|
47
47
|
<w:View>Print</w:View>
|
@@ -49,7 +49,6 @@ WORD_HDR = <<~HDR.freeze
|
|
49
49
|
<w:DoNotOptimizeForBrowser/>
|
50
50
|
</w:WordDocument>
|
51
51
|
</xml>
|
52
|
-
<![endif]-->
|
53
52
|
<meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
|
54
53
|
|
55
54
|
<link rel=File-List href="cid:filelist.xml"/>
|
@@ -278,6 +277,17 @@ RSpec.describe Html2Doc do
|
|
278
277
|
expect(Html2Doc::VERSION).not_to be nil
|
279
278
|
end
|
280
279
|
|
280
|
+
it "preserves Word HTML directives" do
|
281
|
+
Html2Doc.process(html_input(%[A<!--[if gte mso 9]>X<![endif]-->B]), filename: "test")
|
282
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
283
|
+
.to match_fuzzy(<<~OUTPUT)
|
284
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
285
|
+
#{word_body(%{A<!--[if gte mso 9]>X<![endif]-->B},
|
286
|
+
'<div style="mso-element:footnote-list"/>')}
|
287
|
+
#{WORD_FTR1}
|
288
|
+
OUTPUT
|
289
|
+
end
|
290
|
+
|
281
291
|
it "processes a blank document" do
|
282
292
|
Html2Doc.process(html_input(""), filename: "test")
|
283
293
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
@@ -367,7 +377,8 @@ RSpec.describe Html2Doc do
|
|
367
377
|
File.open("spec/header_img1.html", "w:UTF-8") do |f|
|
368
378
|
f.write(
|
369
379
|
doc.sub(%r{spec/19160-6.png},
|
370
|
-
File.expand_path(File.join(File.dirname(__FILE__),
|
380
|
+
File.expand_path(File.join(File.dirname(__FILE__),
|
381
|
+
"19160-6.png"))),
|
371
382
|
)
|
372
383
|
end
|
373
384
|
Html2Doc.process(html_input(""),
|
@@ -450,7 +461,7 @@ RSpec.describe Html2Doc do
|
|
450
461
|
OUTPUT
|
451
462
|
end
|
452
463
|
|
453
|
-
it "unwraps accent in MathML" do
|
464
|
+
it "unwraps and converts accent in MathML" do
|
454
465
|
Html2Doc.process(html_input("<div><math xmlns='http://www.w3.org/1998/Math/MathML'>
|
455
466
|
<mover accent='true'><mrow><mi>p</mi></mrow><mrow><mo>^</mo></mrow></mover>
|
456
467
|
</math></div>"), filename: "test", asciimathdelims: ["{{", "}}"])
|
@@ -458,7 +469,7 @@ RSpec.describe Html2Doc do
|
|
458
469
|
.to match_fuzzy(<<~OUTPUT)
|
459
470
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
460
471
|
#{word_body('<div><m:oMath>
|
461
|
-
<m:acc><m:accPr><m:chr m:val="
|
472
|
+
<m:acc><m:accPr><m:chr m:val="̂"></m:chr></m:accPr><m:e><m:r><m:t>p</m:t></m:r></m:e></m:acc>
|
462
473
|
</m:oMath>
|
463
474
|
</div>', '<div style="mso-element:footnote-list"/>')}
|
464
475
|
#{WORD_FTR1}
|
@@ -565,7 +576,8 @@ RSpec.describe Html2Doc do
|
|
565
576
|
|
566
577
|
it "resizes images for height, in a file in a subdirectory" do
|
567
578
|
simple_body = '<img src="19160-6.png">'
|
568
|
-
Html2Doc.process(html_input(simple_body), filename: "spec/test",
|
579
|
+
Html2Doc.process(html_input(simple_body), filename: "spec/test",
|
580
|
+
imagedir: "spec")
|
569
581
|
testdoc = File.read("spec/test.doc", encoding: "utf-8")
|
570
582
|
expect(testdoc).to match(%r{Content-Type: image/png})
|
571
583
|
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
@@ -653,7 +665,8 @@ RSpec.describe Html2Doc do
|
|
653
665
|
|
654
666
|
it "deals with absolute image locations" do
|
655
667
|
simple_body = %{<img src="#{__dir__}/19160-6.png">}
|
656
|
-
Html2Doc.process(html_input(simple_body), filename: "spec/test",
|
668
|
+
Html2Doc.process(html_input(simple_body), filename: "spec/test",
|
669
|
+
imagedir: ".")
|
657
670
|
testdoc = File.read("spec/test.doc", encoding: "utf-8")
|
658
671
|
expect(testdoc).to match(%r{Content-Type: image/png})
|
659
672
|
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2doc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.0
|
4
|
+
version: 1.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-02-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: asciimath
|
@@ -123,19 +123,19 @@ dependencies:
|
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
126
|
+
name: debug
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
128
128
|
requirements:
|
129
|
-
- - "
|
129
|
+
- - ">="
|
130
130
|
- !ruby/object:Gem::Version
|
131
|
-
version: '
|
131
|
+
version: '0'
|
132
132
|
type: :development
|
133
133
|
prerelease: false
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
|
-
- - "
|
136
|
+
- - ">="
|
137
137
|
- !ruby/object:Gem::Version
|
138
|
-
version: '
|
138
|
+
version: '0'
|
139
139
|
- !ruby/object:Gem::Dependency
|
140
140
|
name: equivalent-xml
|
141
141
|
requirement: !ruby/object:Gem::Requirement
|
@@ -334,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
334
334
|
- !ruby/object:Gem::Version
|
335
335
|
version: '0'
|
336
336
|
requirements: []
|
337
|
-
rubygems_version: 3.
|
337
|
+
rubygems_version: 3.2.32
|
338
338
|
signing_key:
|
339
339
|
specification_version: 4
|
340
340
|
summary: Convert HTML document to Microsoft Word document
|