html2doc 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 878408b54af45f8693aade94aee06047dcd450a3
4
- data.tar.gz: 66775cf77b38dc25490da74ac84a5dd5ade68650
3
+ metadata.gz: 4043ea8c7347a5a9b15846ef7c4a79a801fd8675
4
+ data.tar.gz: 7b3f967fe5380b2f20c0426d8c5d9abe5d5414d9
5
5
  SHA512:
6
- metadata.gz: 04baa6214e38eb83f7bd687d42b4bc4db9a28ac80e9c78944dc0ec1150e8c3d9f3fb0ccbaf4df00040b17362405c7dc28765335c01a6d2e5a1e47314232c5b01
7
- data.tar.gz: 0ad65befcc98b15e89bd6c94a8b18a54f709fbf01185b786dd1675a287473a4bb8ad05149647e383c130fcefaafc66048711a438eb557fe80fbd3fc08fded629
6
+ metadata.gz: b830fdb69fae58dd1e64c1c35e0119ba697eff978eeb257ed6017547709b9d1d43da51e6f68bbe3dda82c51af5b71d1312c7971c1ebf8dac535b3ca79d54eff7
7
+ data.tar.gz: ebdcc4112c857d8edc934389b2925671dfc7a16a51bf75646b606479570c9d0b88aafd16f6fcdb3853a9bba9c8d99d042d2dffbbc705026f65e555207afecbb7
data/README.adoc CHANGED
@@ -1,5 +1,10 @@
1
1
  = Html2Doc
2
2
 
3
+
4
+ image:https://img.shields.io/gem/v/html2doc.svg["Gem Version", link="https://rubygems.org/gems/html2doc"]
5
+ image:https://img.shields.io/travis/riboseinc/html2doc/master.svg["Build Status", link="https://travis-ci.org/riboseinc/html2doc"]
6
+ image:https://codeclimate.com/github/riboseinc/html2doc/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/riboseinc/html2doc"]
7
+
3
8
  Gem to convert an HTML document into a Word document (.doc) format. This is intended for automated generation of Microsoft Word documents, given HTML documents, which are much more readily crafted.
4
9
 
5
10
  This gem originated out of https://github.com/riboseinc/asciidoctor-iso, which creates a Word document from a Microsoft HTML document (created in turn by processing Asciidoc). The Microsoft HTML document is already quite close to Microsoft Word requirements, but future iterations of this gem will become more generic.
@@ -9,6 +14,7 @@ This work is driven by the Word document generation procedure documented in http
9
14
  The gem currently does the following:
10
15
 
11
16
  * Convert any AsciiMath and MathML to Word's native mathematical formatting language.
17
+ * Identify any footnotes in the document (through hyperlinks with `class = "Footnote"` or `epub:type = "footnote"`), and render them as Microsoft Word footnotes.
12
18
  * Resize any images in the HTML file to fit within the maximum page size. (Word will otherwise crash on reading the document.)
13
19
  * Generate a filelist.xml listing of all files to be bundled into the Word document.
14
20
  * Assign the class `MsoNormal` to any paragraphs that do not have a class, so that they can be treated as Normal Style when editing the Word document.
@@ -17,9 +23,8 @@ The gem currently does the following:
17
23
 
18
24
  Future iterations will convert generic HTML to Microsoft-specific HTML. For a representative generator of Microsoft HTML, see https://github.com/riboseinc/asciidoctor-iso
19
25
 
20
- Work being tracked at https://github.com/riboseinc/asciidoctor-iso/issues/47:
26
+ Work to be done:
21
27
 
22
- * Render footnotes
23
28
  * Render (editorial) comments
24
29
 
25
30
  == Constraints
data/lib/html2doc.rb CHANGED
@@ -1,3 +1,4 @@
1
1
  require_relative "html2doc/version"
2
2
  require_relative "html2doc/base"
3
3
  require_relative "html2doc/mime"
4
+ require_relative "html2doc/notes"
data/lib/html2doc/base.rb CHANGED
@@ -33,6 +33,7 @@ module Html2Doc
33
33
  def self.cleanup(docxml, dir)
34
34
  image_cleanup(docxml, dir)
35
35
  mathml_to_ooml(docxml)
36
+ footnotes(docxml)
36
37
  msonormal(docxml)
37
38
  docxml
38
39
  end
@@ -79,18 +80,19 @@ module Html2Doc
79
80
  r
80
81
  end
81
82
 
82
- def self.image_resize(orig_filename)
83
- image_size = ImageSize.path(orig_filename).size
83
+ def self.image_resize(i)
84
+ size = [i["width"].to_i, i["height"].to_i]
85
+ size = ImageSize.path(i["src"]).size unless size[0] && size[1]
84
86
  # max width for Word document is 400, max height is 680
85
- if image_size[0] > 400
86
- image_size[1] = (image_size[1] * 400 / image_size[0]).ceil
87
- image_size[0] = 400
87
+ if size[0] > 400
88
+ size[1] = (size[1] * 400 / size[0]).ceil
89
+ size[0] = 400
88
90
  end
89
- if image_size[1] > 680
90
- image_size[0] = (image_size[0] * 680 / image_size[1]).ceil
91
- image_size[1] = 680
91
+ if size[1] > 680
92
+ size[0] = (size[0] * 680 / size[1]).ceil
93
+ size[1] = 680
92
94
  end
93
- image_size
95
+ size
94
96
  end
95
97
 
96
98
  def self.image_cleanup(docxml, dir)
@@ -100,7 +102,7 @@ module Html2Doc
100
102
  new_full_filename = File.join(dir, "#{uuid}.#{matched[:suffix]}")
101
103
  # presupposes that the image source is local
102
104
  system "cp #{i['src']} #{new_full_filename}"
103
- i["width"], i["height"] = image_resize(i["src"])
105
+ i["width"], i["height"] = image_resize(i)
104
106
  i["src"] = new_full_filename
105
107
  end
106
108
  docxml
@@ -0,0 +1,97 @@
1
+ require "uuidtools"
2
+ require "nokogiri"
3
+
4
+ module Html2Doc
5
+
6
+ def self.footnotes(docxml)
7
+ i, fn = 1, []
8
+ docxml.xpath("//a").each do |a|
9
+ next unless process_footnote_link(docxml, a, i, fn)
10
+ i += 1
11
+ end
12
+ process_footnote_texts(docxml, fn)
13
+ end
14
+
15
+ def self.process_footnote_texts(docxml, footnotes)
16
+ body = docxml.at("//body")
17
+ list = body.add_child("<div style='mso-element:footnote-list'/>")
18
+ footnotes.each_with_index do |f, i|
19
+ fn = list.first.add_child(footnote_container(i+1))
20
+ f.parent = fn.first
21
+ footnote_div_to_p(f)
22
+ end
23
+ footnote_cleanup(docxml)
24
+ end
25
+
26
+ def self.footnote_div_to_p(f)
27
+ if %w{div aside}.include? f.name
28
+ if f.at(".//p")
29
+ f = f.replace(f.children)
30
+ else
31
+ f.name = "p"
32
+ f["class"] = "MsoFootnoteText"
33
+ end
34
+ end
35
+ end
36
+
37
+ def self.footnote_container(i)
38
+ <<~DIV
39
+ <div style='mso-element:footnote' id='ftn#{i}'>
40
+ <a style='mso-footnote-id:ftn#{i}' href=#_ftn#{i}'
41
+ name='_ftnref#{i}' title='' id='_ftnref#{i}'><span
42
+ class='MsoFootnoteReference'><span
43
+ style='mso-special-character:footnote'></span></span></div>
44
+ DIV
45
+ end
46
+
47
+ def self.process_footnote_link(docxml, a, i, fn)
48
+ return false unless is_footnote(a)
49
+ href = a["href"].gsub(/^#/, "")
50
+ note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
51
+ return false if note.nil?
52
+ set_footnote_link_attrs(a, i)
53
+ a.children = "<span class='MsoFootnoteReference'>"\
54
+ "<span style='mso-special-character:footnote'/></span>"
55
+ fn << transform_footnote_text(note)
56
+ end
57
+
58
+ def self.transform_footnote_text(note)
59
+ note["id"] = ""
60
+ note.xpath(".//div").each { |div| div = div.replace(div.children) }
61
+ note.xpath(".//aside | .//p").each do |p|
62
+ p.name = "p"
63
+ p["class"] = "MsoFootnoteText"
64
+ end
65
+ note.remove
66
+ end
67
+
68
+ def self.is_footnote(a)
69
+ a["epub:type"]&.casecmp("footnote") == 0 ||
70
+ a["class"]&.casecmp("footnote") == 0
71
+ end
72
+
73
+ def self.set_footnote_link_attrs(a, i)
74
+ a["style"] = "mso-footnote-id:ftn#{i}"
75
+ a["href"] = "#_ftn#{i}"
76
+ a["name"] = "_ftnref#{i}"
77
+ a["title"] = ""
78
+ end
79
+
80
+ # We expect that the content of the footnote text received is one or
81
+ # more text containers, p or aside or div (which we have already converted
82
+ # to p). We do not expect any <a name> or links back to text; if they
83
+ # are present in the HTML, they need to have been cleaned out before
84
+ # passing to this gem
85
+ def self.footnote_cleanup(docxml)
86
+ docxml.xpath('//div[@style="mso-element:footnote"]/a').
87
+ each do |x|
88
+ n = x.next_element
89
+ n&.children&.first&.add_previous_sibling(x.remove)
90
+ end
91
+ docxml
92
+ end
93
+
94
+
95
+
96
+
97
+ end
@@ -1,3 +1,3 @@
1
1
  module Html2Doc
2
- VERSION = "0.5.0".freeze
2
+ VERSION = "0.6.0".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-02-06 00:00:00.000000000 Z
11
+ date: 2018-02-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -291,6 +291,7 @@ files:
291
291
  - lib/html2doc/base.rb
292
292
  - lib/html2doc/mathml2omml.xsl
293
293
  - lib/html2doc/mime.rb
294
+ - lib/html2doc/notes.rb
294
295
  - lib/html2doc/version.rb
295
296
  - lib/html2doc/wordstyle.css
296
297
  - spec/html2doc_spec.rb