html2doc 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.adoc +7 -2
- data/lib/html2doc.rb +1 -0
- data/lib/html2doc/base.rb +12 -10
- data/lib/html2doc/notes.rb +97 -0
- data/lib/html2doc/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4043ea8c7347a5a9b15846ef7c4a79a801fd8675
|
4
|
+
data.tar.gz: 7b3f967fe5380b2f20c0426d8c5d9abe5d5414d9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b830fdb69fae58dd1e64c1c35e0119ba697eff978eeb257ed6017547709b9d1d43da51e6f68bbe3dda82c51af5b71d1312c7971c1ebf8dac535b3ca79d54eff7
|
7
|
+
data.tar.gz: ebdcc4112c857d8edc934389b2925671dfc7a16a51bf75646b606479570c9d0b88aafd16f6fcdb3853a9bba9c8d99d042d2dffbbc705026f65e555207afecbb7
|
data/README.adoc
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
= Html2Doc
|
2
2
|
|
3
|
+
|
4
|
+
image:https://img.shields.io/gem/v/html2doc.svg["Gem Version", link="https://rubygems.org/gems/html2doc"]
|
5
|
+
image:https://img.shields.io/travis/riboseinc/html2doc/master.svg["Build Status", link="https://travis-ci.org/riboseinc/html2doc"]
|
6
|
+
image:https://codeclimate.com/github/riboseinc/html2doc/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/riboseinc/html2doc"]
|
7
|
+
|
3
8
|
Gem to convert an HTML document into a Word document (.doc) format. This is intended for automated generation of Microsoft Word documents, given HTML documents, which are much more readily crafted.
|
4
9
|
|
5
10
|
This gem originated out of https://github.com/riboseinc/asciidoctor-iso, which creates a Word document from a Microsoft HTML document (created in turn by processing Asciidoc). The Microsoft HTML document is already quite close to Microsoft Word requirements, but future iterations of this gem will become more generic.
|
@@ -9,6 +14,7 @@ This work is driven by the Word document generation procedure documented in http
|
|
9
14
|
The gem currently does the following:
|
10
15
|
|
11
16
|
* Convert any AsciiMath and MathML to Word's native mathematical formatting language.
|
17
|
+
* Identify any footnotes in the document (through hyperlinks with `class = "Footnote"` or `epub:type = "footnote"`), and render them as Microsoft Word footnotes.
|
12
18
|
* Resize any images in the HTML file to fit within the maximum page size. (Word will otherwise crash on reading the document.)
|
13
19
|
* Generate a filelist.xml listing of all files to be bundled into the Word document.
|
14
20
|
* Assign the class `MsoNormal` to any paragraphs that do not have a class, so that they can be treated as Normal Style when editing the Word document.
|
@@ -17,9 +23,8 @@ The gem currently does the following:
|
|
17
23
|
|
18
24
|
Future iterations will convert generic HTML to Microsoft-specific HTML. For a representative generator of Microsoft HTML, see https://github.com/riboseinc/asciidoctor-iso
|
19
25
|
|
20
|
-
Work
|
26
|
+
Work to be done:
|
21
27
|
|
22
|
-
* Render footnotes
|
23
28
|
* Render (editorial) comments
|
24
29
|
|
25
30
|
== Constraints
|
data/lib/html2doc.rb
CHANGED
data/lib/html2doc/base.rb
CHANGED
@@ -33,6 +33,7 @@ module Html2Doc
|
|
33
33
|
# Runs every clean-up pass over the document, in order: image clean-up,
# MathML conversion, footnote rendering, then MsoNormal classing.
#
# @param docxml the parsed document; handed to each pass in turn
# @param dir directory passed on to image_cleanup
# @return the same docxml object that was passed in
def self.cleanup(docxml, dir)
  docxml.tap do |doc|
    image_cleanup(doc, dir)
    mathml_to_ooml(doc)
    footnotes(doc)
    msonormal(doc)
  end
end
|
@@ -79,18 +80,19 @@ module Html2Doc
|
|
79
80
|
r
|
80
81
|
end
|
81
82
|
|
82
|
-
def self.image_resize(
|
83
|
-
|
83
|
+
# Works out the size an image should be given so that it fits on the page,
# scaling it down proportionally when it exceeds either limit.
#
# @param i [#[]] image element; its "width", "height" and "src" attributes
#   are read
# @param max_width [Integer] widest size allowed (defaults to 400)
# @param max_height [Integer] tallest size allowed (defaults to 680)
# @return [Array<Integer>] two-element array, [width, height]
def self.image_resize(i, max_width = 400, max_height = 680)
  size = [i["width"].to_i, i["height"].to_i]
  # A missing attribute becomes 0 via to_i, and 0 is truthy in Ruby, so the
  # dimensions have to be tested numerically before falling back to the
  # size recorded in the image file itself.
  size = ImageSize.path(i["src"]).size unless size.all?(&:positive?)
  # max width for Word document is 400, max height is 680
  if size[0] > max_width
    # fdiv keeps the fraction so that ceil really rounds up
    size = [max_width, (size[1] * max_width).fdiv(size[0]).ceil]
  end
  if size[1] > max_height
    size = [(size[0] * max_height).fdiv(size[1]).ceil, max_height]
  end
  size
end
|
95
97
|
|
96
98
|
def self.image_cleanup(docxml, dir)
|
@@ -100,7 +102,7 @@ module Html2Doc
|
|
100
102
|
new_full_filename = File.join(dir, "#{uuid}.#{matched[:suffix]}")
|
101
103
|
# presupposes that the image source is local
|
102
104
|
system "cp #{i['src']} #{new_full_filename}"
|
103
|
-
i["width"], i["height"] = image_resize(i
|
105
|
+
i["width"], i["height"] = image_resize(i)
|
104
106
|
i["src"] = new_full_filename
|
105
107
|
end
|
106
108
|
docxml
|
@@ -0,0 +1,97 @@
|
|
1
|
+
require "uuidtools"
|
2
|
+
require "nokogiri"
|
3
|
+
|
4
|
+
module Html2Doc
|
5
|
+
|
6
|
+
# Walks every <a> element in the document, converting those that
# process_footnote_link accepts, then hands the collected footnote bodies
# to process_footnote_texts.
#
# @param docxml the parsed document (must respond to xpath)
# @return whatever process_footnote_texts returns
def self.footnotes(docxml)
  counter = 1
  collected = []
  docxml.xpath("//a").each do |anchor|
    # the counter only advances for links that were actually converted
    counter += 1 if process_footnote_link(docxml, anchor, counter, collected)
  end
  process_footnote_texts(docxml, collected)
end
|
14
|
+
|
15
|
+
# Appends a footnote-list <div> to the document body and moves each
# collected footnote into its own numbered container inside that list.
#
# @param docxml the parsed document
# @param footnotes [Array] footnote nodes, in the order they were referenced
# @return whatever footnote_cleanup returns
def self.process_footnote_texts(docxml, footnotes)
  holder = docxml.at("//body").
    add_child("<div style='mso-element:footnote-list'/>").first
  footnotes.each.with_index(1) do |note, number|
    wrapper = holder.add_child(footnote_container(number)).first
    note.parent = wrapper
    footnote_div_to_p(note)
  end
  footnote_cleanup(docxml)
end
|
25
|
+
|
26
|
+
# Normalises the outermost footnote element. A <div>/<aside> that already
# contains a <p> is replaced by its children; one without a <p> is itself
# renamed to <p> and given the MsoFootnoteText class. Other elements are
# left alone.
#
# @param f the footnote node
# @return nil when f is neither div nor aside, otherwise the value of the
#   last operation performed
def self.footnote_div_to_p(f)
  return unless %w[div aside].include?(f.name)
  return f.replace(f.children) if f.at(".//p")

  f.name = "p"
  f["class"] = "MsoFootnoteText"
end
|
36
|
+
|
37
|
+
# Builds the markup for the container of footnote number i: a footnote
# <div> holding the anchor that carries the footnote reference mark.
#
# Every attribute value is quoted and the <a> element is explicitly closed,
# so the fragment is well-formed.
#
# @param i [Integer] 1-based footnote number
# @return [String] markup for the footnote container
def self.footnote_container(i)
  <<~DIV
    <div style='mso-element:footnote' id='ftn#{i}'>
      <a style='mso-footnote-id:ftn#{i}' href='#_ftn#{i}'
         name='_ftnref#{i}' title='' id='_ftnref#{i}'><span
         class='MsoFootnoteReference'><span
           style='mso-special-character:footnote'></span></span></a></div>
  DIV
end
|
46
|
+
|
47
|
+
# Converts one hyperlink into a footnote reference, provided it is marked
# as a footnote and its target can be found in the document.
#
# @param docxml the parsed document, searched for the footnote body
# @param a the <a> element under consideration
# @param i [Integer] number to give this footnote
# @param fn [Array] accumulator; the detached footnote body is appended
# @return false when the link is skipped, otherwise fn (truthy)
def self.process_footnote_link(docxml, a, i, fn)
  return false unless is_footnote(a)
  # A footnote-class anchor may lack an href altogether; to_s avoids
  # calling a String method on nil.
  href = a["href"].to_s.sub(/\A#/, "")
  # An empty fragment would make the XPath below test @id = '', which can
  # match unrelated elements, so such links are skipped.
  return false if href.empty?
  # NOTE(review): href is interpolated into the XPath unescaped; a target
  # name containing an apostrophe would break the expression.
  note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
  return false if note.nil?
  set_footnote_link_attrs(a, i)
  a.children = "<span class='MsoFootnoteReference'>"\
    "<span style='mso-special-character:footnote'/></span>"
  fn << transform_footnote_text(note)
end
|
57
|
+
|
58
|
+
# Prepares a footnote body for Word: blanks its id, replaces every nested
# <div> with that div's children, turns nested <aside>/<p> elements into
# <p class="MsoFootnoteText">, and finally detaches the note.
#
# @param note the footnote body node
# @return the result of note.remove
def self.transform_footnote_text(note)
  note.tap do |body|
    body["id"] = ""
    body.xpath(".//div").each { |wrapper| wrapper.replace(wrapper.children) }
    body.xpath(".//aside | .//p").each do |para|
      para.name = "p"
      para["class"] = "MsoFootnoteText"
    end
  end.remove
end
|
67
|
+
|
68
|
+
# Tells whether a hyperlink is marked as a footnote reference, through its
# epub:type or class attribute.
#
# Both attributes are treated as whitespace-separated token lists, so
# class="footnote ref" matches as well as class="Footnote". The comparison
# ignores ASCII case.
#
# @param a [#[]] the <a> element
# @return [Boolean]
def self.is_footnote(a)
  %w[epub:type class].any? do |attr|
    a[attr].to_s.split.any? { |token| token.casecmp("footnote").zero? }
  end
end
|
72
|
+
|
73
|
+
# Rewrites the attributes of a footnote hyperlink so that it refers to
# footnote number i: style, href and name are derived from i, and title is
# blanked.
#
# @param a [#[]=] the <a> element to update
# @param i [Integer] footnote number
# @return [String] the empty string assigned to title
def self.set_footnote_link_attrs(a, i)
  {
    "style" => "mso-footnote-id:ftn#{i}",
    "href" => "#_ftn#{i}",
    "name" => "_ftnref#{i}",
  }.each { |attr, value| a[attr] = value }
  a["title"] = ""
end
|
79
|
+
|
80
|
+
# We expect that the content of the footnote text received is one or
|
81
|
+
# more text containers, p or aside or div (which we have already converted
|
82
|
+
# to p). We do not expect any <a name> or links back to text; if they
|
83
|
+
# are present in the HTML, they need to have been cleaned out before
|
84
|
+
# passing to this gem
|
85
|
+
# Moves the anchor of each footnote container into the element that
# follows it, placing it in front of that element's first child. Anchors
# with no following element, or whose following element has no children,
# stay where they are.
#
# @param docxml the parsed document
# @return docxml
def self.footnote_cleanup(docxml)
  anchors = docxml.xpath('//div[@style="mso-element:footnote"]/a')
  anchors.each do |anchor|
    target = anchor.next_element
    next if target.nil?

    kids = target.children
    next if kids.nil?

    lead = kids.first
    # the anchor is detached only when there is somewhere to put it
    lead.add_previous_sibling(anchor.remove) unless lead.nil?
  end
  docxml
end
|
93
|
+
|
94
|
+
|
95
|
+
|
96
|
+
|
97
|
+
end
|
data/lib/html2doc/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2doc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-02-
|
11
|
+
date: 2018-02-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: htmlentities
|
@@ -291,6 +291,7 @@ files:
|
|
291
291
|
- lib/html2doc/base.rb
|
292
292
|
- lib/html2doc/mathml2omml.xsl
|
293
293
|
- lib/html2doc/mime.rb
|
294
|
+
- lib/html2doc/notes.rb
|
294
295
|
- lib/html2doc/version.rb
|
295
296
|
- lib/html2doc/wordstyle.css
|
296
297
|
- spec/html2doc_spec.rb
|