html2doc 1.2.0 → 1.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +3 -20
- data/html2doc.gemspec +1 -1
- data/lib/html2doc/base.rb +19 -10
- data/lib/html2doc/math.rb +20 -3
- data/lib/html2doc/mime.rb +5 -4
- data/lib/html2doc/version.rb +1 -1
- data/spec/html2doc_spec.rb +20 -7
- metadata +8 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 56d8c42bd609845f35a5a994fed43d12ebc9fb0d8d303fd60f9a064f4da26a7b
|
4
|
+
data.tar.gz: e9310883dbc5991640e66a1c085d6bcb2ca87155449326b7076489e78d64d187
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d7076b196634dc81a3942a59155c7c80da21b9eb68721dab437170c54876f970b80448fa31f520648145eca9ace1fea0c7751be04021f9c1f95fe0bf3fa64ce
|
7
|
+
data.tar.gz: 532b022bda9cc4fb88eafeb467c7d6d26ba8dc5ea21f5553ba251e8b92469de6e906f5947a3fdb3bbee241cbaea8805f3477978bbe78af359a6bb7140399a971
|
data/.github/workflows/rake.yml
CHANGED
@@ -10,23 +10,6 @@ on:
|
|
10
10
|
|
11
11
|
jobs:
|
12
12
|
rake:
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
strategy:
|
17
|
-
fail-fast: false
|
18
|
-
matrix:
|
19
|
-
ruby: [ '3.0', '2.7', '2.6', '2.5' ]
|
20
|
-
os: [ ubuntu-latest, windows-latest, macos-latest ]
|
21
|
-
experimental: [ false ]
|
22
|
-
steps:
|
23
|
-
- uses: actions/checkout@v2
|
24
|
-
with:
|
25
|
-
submodules: true
|
26
|
-
|
27
|
-
- uses: ruby/setup-ruby@v1
|
28
|
-
with:
|
29
|
-
ruby-version: ${{ matrix.ruby }}
|
30
|
-
bundler-cache: true
|
31
|
-
|
32
|
-
- run: bundle exec rake
|
13
|
+
uses: metanorma/metanorma-build-scripts/.github/workflows/generic-rake.yml@main
|
14
|
+
secrets:
|
15
|
+
pat_token: ${{ secrets.METANORMA_CI_PAT_TOKEN }}
|
data/html2doc.gemspec
CHANGED
@@ -33,7 +33,7 @@ Gem::Specification.new do |spec|
|
|
33
33
|
spec.add_dependency "thread_safe"
|
34
34
|
spec.add_dependency "uuidtools"
|
35
35
|
|
36
|
-
spec.add_development_dependency "
|
36
|
+
spec.add_development_dependency "debug"
|
37
37
|
spec.add_development_dependency "equivalent-xml", "~> 0.6"
|
38
38
|
spec.add_development_dependency "guard", "~> 2.14"
|
39
39
|
spec.add_development_dependency "guard-rspec", "~> 4.7"
|
data/lib/html2doc/base.rb
CHANGED
@@ -76,6 +76,8 @@ module Html2Doc
|
|
76
76
|
xml = '<!DOCTYPE html SYSTEM
|
77
77
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
|
78
78
|
end
|
79
|
+
xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
|
80
|
+
.gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
|
79
81
|
Nokogiri::XML.parse(xml)
|
80
82
|
end
|
81
83
|
|
@@ -85,12 +87,16 @@ module Html2Doc
|
|
85
87
|
|
86
88
|
def self.from_xhtml(xml)
|
87
89
|
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
|
88
|
-
.sub(DOCTYPE, "")
|
89
|
-
.gsub(
|
90
|
+
.sub(DOCTYPE, "").gsub(%{ />}, "/>")
|
91
|
+
.gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
|
92
|
+
.gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
|
93
|
+
.gsub("\n-->\n", "\n-->\n")
|
90
94
|
end
|
91
95
|
|
92
96
|
def self.msword_fix(doc)
|
93
97
|
# brain damage in MSWord parser
|
98
|
+
doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
|
99
|
+
"<w:DoNotOptimizeForBrowser/>")
|
94
100
|
doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
|
95
101
|
'<span style="mso-special-character:footnote"></span>')
|
96
102
|
doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
|
@@ -116,7 +122,7 @@ module Html2Doc
|
|
116
122
|
end
|
117
123
|
|
118
124
|
PRINT_VIEW = <<~XML.freeze
|
119
|
-
|
125
|
+
|
120
126
|
<xml>
|
121
127
|
<w:WordDocument>
|
122
128
|
<w:View>Print</w:View>
|
@@ -124,8 +130,7 @@ module Html2Doc
|
|
124
130
|
<w:DoNotOptimizeForBrowser/>
|
125
131
|
</w:WordDocument>
|
126
132
|
</xml>
|
127
|
-
|
128
|
-
<meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
|
133
|
+
<meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
|
129
134
|
XML
|
130
135
|
|
131
136
|
def self.define_head1(docxml, _dir)
|
@@ -148,12 +153,16 @@ module Html2Doc
|
|
148
153
|
end
|
149
154
|
end
|
150
155
|
|
151
|
-
def self.stylesheet(_filename, _header_filename,
|
152
|
-
(
|
153
|
-
|
154
|
-
stylesheet = File.read(
|
156
|
+
def self.stylesheet(_filename, _header_filename, cssname)
|
157
|
+
(cssname.nil? || cssname.empty?) and
|
158
|
+
cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
|
159
|
+
stylesheet = File.read(cssname, encoding: "UTF-8")
|
155
160
|
xml = Nokogiri::XML("<style/>")
|
156
|
-
|
161
|
+
#s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
|
162
|
+
#xml.children.first << Nokogiri::XML::Comment.new(xml, s)
|
163
|
+
xml.children.first << Nokogiri::XML::CDATA
|
164
|
+
.new(xml, "\n<!--\n#{stylesheet}\n-->\n")
|
165
|
+
|
157
166
|
xml.root.to_s
|
158
167
|
end
|
159
168
|
|
data/lib/html2doc/math.rb
CHANGED
@@ -140,7 +140,7 @@ module Html2Doc
|
|
140
140
|
end
|
141
141
|
end
|
142
142
|
|
143
|
-
# We need span and em not to be namespaced. Word can't deal with explicit
|
143
|
+
# We need span and em not to be namespaced. Word can't deal with explicit
|
144
144
|
# namespaces.
|
145
145
|
# We will end up stripping them out again under Nokogiri 1.11, which correctly
|
146
146
|
# insists on inheriting namespace from parent.
|
@@ -154,11 +154,28 @@ module Html2Doc
|
|
154
154
|
def self.mathml_to_ooml1(xml, docnamespaces)
|
155
155
|
doc = Nokogiri::XML::Document::new
|
156
156
|
doc.root = ooxml_cleanup(xml, docnamespaces)
|
157
|
-
|
157
|
+
ooxml = ooml_clean(unitalic(esc_space(accent_tr(@xsltemplate.transform(doc)))))
|
158
158
|
ooxml = uncenter(xml, ooxml)
|
159
159
|
xml.swap(ooxml)
|
160
160
|
end
|
161
161
|
|
162
|
+
def self.accent_tr(xml)
|
163
|
+
xml.xpath(".//*[local-name()='accPr']/*[local-name()='chr']").each do |x|
|
164
|
+
x["m:val"] &&= accent_tr1(x["m:val"])
|
165
|
+
x["val"] &&= accent_tr1(x["val"])
|
166
|
+
end
|
167
|
+
xml
|
168
|
+
end
|
169
|
+
|
170
|
+
def self.accent_tr1(accent)
|
171
|
+
case accent
|
172
|
+
when "\u2192" then "\u20D7"
|
173
|
+
when "^" then "\u0302"
|
174
|
+
when "~" then "\u0303"
|
175
|
+
else accent
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
162
179
|
# escape space as 2; we are removing any spaces generated by
|
163
180
|
# XML indentation
|
164
181
|
def self.esc_space(xml)
|
@@ -180,7 +197,7 @@ module Html2Doc
|
|
180
197
|
%w(left right).each do |dir|
|
181
198
|
if alignnode.text.include? ("text-align:#{dir}")
|
182
199
|
ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
|
183
|
-
|
200
|
+
"m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
|
184
201
|
end
|
185
202
|
end
|
186
203
|
ooxml
|
data/lib/html2doc/mime.rb
CHANGED
@@ -107,12 +107,13 @@ module Html2Doc
|
|
107
107
|
# only processes locally stored images
|
108
108
|
def self.image_cleanup(docxml, dir, localdir)
|
109
109
|
docxml.traverse do |i|
|
110
|
+
src = i["src"]
|
110
111
|
next unless i.element? && %w(img v:imagedata).include?(i.name)
|
111
|
-
next if /^http/.match?
|
112
|
-
next if %r{^data:(image|application)/[^;]+;base64}.match?
|
112
|
+
next if src.nil? || src.empty? || /^http/.match?(src)
|
113
|
+
next if %r{^data:(image|application)/[^;]+;base64}.match? src
|
113
114
|
|
114
|
-
local_filename = localname(
|
115
|
-
new_filename = "#{mkuuid}#{File.extname(
|
115
|
+
local_filename = localname(src, localdir)
|
116
|
+
new_filename = "#{mkuuid}#{File.extname(src)}"
|
116
117
|
FileUtils.cp local_filename, File.join(dir, new_filename)
|
117
118
|
i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
|
118
119
|
i["src"] = File.join(File.basename(dir), new_filename)
|
data/lib/html2doc/version.rb
CHANGED
data/spec/html2doc_spec.rb
CHANGED
@@ -41,7 +41,7 @@ WORD_HDR = <<~HDR.freeze
|
|
41
41
|
Content-Type: text/html; charset="utf-8"
|
42
42
|
|
43
43
|
<?xml version="1.0"?>
|
44
|
-
<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head
|
44
|
+
<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head>
|
45
45
|
<xml>
|
46
46
|
<w:WordDocument>
|
47
47
|
<w:View>Print</w:View>
|
@@ -49,7 +49,6 @@ WORD_HDR = <<~HDR.freeze
|
|
49
49
|
<w:DoNotOptimizeForBrowser/>
|
50
50
|
</w:WordDocument>
|
51
51
|
</xml>
|
52
|
-
<![endif]-->
|
53
52
|
<meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
|
54
53
|
|
55
54
|
<link rel=File-List href="cid:filelist.xml"/>
|
@@ -278,6 +277,17 @@ RSpec.describe Html2Doc do
|
|
278
277
|
expect(Html2Doc::VERSION).not_to be nil
|
279
278
|
end
|
280
279
|
|
280
|
+
it "preserves Word HTML directives" do
|
281
|
+
Html2Doc.process(html_input(%[A<!--[if gte mso 9]>X<![endif]-->B]), filename: "test")
|
282
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
283
|
+
.to match_fuzzy(<<~OUTPUT)
|
284
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
285
|
+
#{word_body(%{A<!--[if gte mso 9]>X<![endif]-->B},
|
286
|
+
'<div style="mso-element:footnote-list"/>')}
|
287
|
+
#{WORD_FTR1}
|
288
|
+
OUTPUT
|
289
|
+
end
|
290
|
+
|
281
291
|
it "processes a blank document" do
|
282
292
|
Html2Doc.process(html_input(""), filename: "test")
|
283
293
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
@@ -367,7 +377,8 @@ RSpec.describe Html2Doc do
|
|
367
377
|
File.open("spec/header_img1.html", "w:UTF-8") do |f|
|
368
378
|
f.write(
|
369
379
|
doc.sub(%r{spec/19160-6.png},
|
370
|
-
File.expand_path(File.join(File.dirname(__FILE__),
|
380
|
+
File.expand_path(File.join(File.dirname(__FILE__),
|
381
|
+
"19160-6.png"))),
|
371
382
|
)
|
372
383
|
end
|
373
384
|
Html2Doc.process(html_input(""),
|
@@ -450,7 +461,7 @@ RSpec.describe Html2Doc do
|
|
450
461
|
OUTPUT
|
451
462
|
end
|
452
463
|
|
453
|
-
it "unwraps accent in MathML" do
|
464
|
+
it "unwraps and converts accent in MathML" do
|
454
465
|
Html2Doc.process(html_input("<div><math xmlns='http://www.w3.org/1998/Math/MathML'>
|
455
466
|
<mover accent='true'><mrow><mi>p</mi></mrow><mrow><mo>^</mo></mrow></mover>
|
456
467
|
</math></div>"), filename: "test", asciimathdelims: ["{{", "}}"])
|
@@ -458,7 +469,7 @@ RSpec.describe Html2Doc do
|
|
458
469
|
.to match_fuzzy(<<~OUTPUT)
|
459
470
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
460
471
|
#{word_body('<div><m:oMath>
|
461
|
-
<m:acc><m:accPr><m:chr m:val="
|
472
|
+
<m:acc><m:accPr><m:chr m:val="̂"></m:chr></m:accPr><m:e><m:r><m:t>p</m:t></m:r></m:e></m:acc>
|
462
473
|
</m:oMath>
|
463
474
|
</div>', '<div style="mso-element:footnote-list"/>')}
|
464
475
|
#{WORD_FTR1}
|
@@ -565,7 +576,8 @@ RSpec.describe Html2Doc do
|
|
565
576
|
|
566
577
|
it "resizes images for height, in a file in a subdirectory" do
|
567
578
|
simple_body = '<img src="19160-6.png">'
|
568
|
-
Html2Doc.process(html_input(simple_body), filename: "spec/test",
|
579
|
+
Html2Doc.process(html_input(simple_body), filename: "spec/test",
|
580
|
+
imagedir: "spec")
|
569
581
|
testdoc = File.read("spec/test.doc", encoding: "utf-8")
|
570
582
|
expect(testdoc).to match(%r{Content-Type: image/png})
|
571
583
|
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
@@ -653,7 +665,8 @@ RSpec.describe Html2Doc do
|
|
653
665
|
|
654
666
|
it "deals with absolute image locations" do
|
655
667
|
simple_body = %{<img src="#{__dir__}/19160-6.png">}
|
656
|
-
Html2Doc.process(html_input(simple_body), filename: "spec/test",
|
668
|
+
Html2Doc.process(html_input(simple_body), filename: "spec/test",
|
669
|
+
imagedir: ".")
|
657
670
|
testdoc = File.read("spec/test.doc", encoding: "utf-8")
|
658
671
|
expect(testdoc).to match(%r{Content-Type: image/png})
|
659
672
|
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2doc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-02-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: asciimath
|
@@ -123,19 +123,19 @@ dependencies:
|
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
126
|
+
name: debug
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
128
128
|
requirements:
|
129
|
-
- - "
|
129
|
+
- - ">="
|
130
130
|
- !ruby/object:Gem::Version
|
131
|
-
version: '
|
131
|
+
version: '0'
|
132
132
|
type: :development
|
133
133
|
prerelease: false
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
|
-
- - "
|
136
|
+
- - ">="
|
137
137
|
- !ruby/object:Gem::Version
|
138
|
-
version: '
|
138
|
+
version: '0'
|
139
139
|
- !ruby/object:Gem::Dependency
|
140
140
|
name: equivalent-xml
|
141
141
|
requirement: !ruby/object:Gem::Requirement
|
@@ -334,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
334
334
|
- !ruby/object:Gem::Version
|
335
335
|
version: '0'
|
336
336
|
requirements: []
|
337
|
-
rubygems_version: 3.
|
337
|
+
rubygems_version: 3.2.32
|
338
338
|
signing_key:
|
339
339
|
specification_version: 4
|
340
340
|
summary: Convert HTML document to Microsoft Word document
|