html2doc 1.9.2 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +5 -2
- data/lib/html2doc/base.rb +4 -7
- data/lib/html2doc/lists.rb +101 -12
- data/lib/html2doc/mime.rb +1 -2
- data/lib/html2doc/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a9ef55b2a805994f5ca26253285cdc062ff7d9a16c12ce009935956513d3eb3c
|
4
|
+
data.tar.gz: 8229037b442e8c9fd93790b14ec25daff760b4f98697b787da2ae8daef344699
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 293dcfb6a88743f5a3a5e8bc7042d3a5f7c1e7f93683b22d21153591f2e82fdae5d3bb2c446817b2a06372469db09554078e2c5a804ff54cf2ba4ee54706e0bf
|
7
|
+
data.tar.gz: 4bfac10267769fecca5b85fe68319bf17fce9ed021e6c1534cf7d76cee34193468919f3d4e1f9114257884d09661d8502a7a0f1de06ab05586842c360a4a7228
|
data/README.adoc
CHANGED
@@ -31,6 +31,7 @@ The gem currently does the following:
|
|
31
31
|
* Resize any local images in the HTML file to fit within the maximum page size. (Word will otherwise crash on reading the document.)
|
32
32
|
* Optionally apply list styles with predefined bullet and numbering from a Word CSS to the unordered and ordered lists in the document, restarting numbering for each ordered list.
|
33
33
|
* Convert all lists to native Word HTML rendering (using paragraphs with `MsoListParagraphCxSpFirst, MsoListParagraphCxSpMiddle, MsoListParagraphCxSpLast` styles)
|
34
|
+
* Generate additional list styles in CSS for any ordered lists with a new start number.
|
34
35
|
* Convert any internal `@id` anchors to `a@name` anchors; Word only hyperlinks to the latter.
|
35
36
|
* Generate a filelist.xml listing of all files to be bundled into the Word document.
|
36
37
|
* Assign the class `MsoNormal` to any paragraphs that do not have a class, so that they can be treated as Normal Style when editing the Word document.
|
@@ -43,7 +44,7 @@ For a representative generator of HTML that uses this gem in postprocessing, see
|
|
43
44
|
|
44
45
|
This gem generates `.doc` documents. Future versions may upgrade the output to `docx`.
|
45
46
|
|
46
|
-
Because `.doc` is the format of an older version of Microsoft Word, the output of this gem do *not* support SVG graphics.
|
47
|
+
Because `.doc` is the format of an older version of Microsoft Word, the output of this gem does *not* support SVG graphics. Word itself converts SVG into PNG when it saves documents as Word HTML, which is the input to this gem. External consumers of this gem in Metanorma convert SVG to EMF.
|
47
48
|
|
48
49
|
There are two other Microsoft Word vendors in the Ruby ecosystem.
|
49
50
|
|
@@ -150,7 +151,9 @@ left-aligned or right-aligned, add `style="text-align:left"` or
|
|
150
151
|
|
151
152
|
|
152
153
|
=== Lists
|
153
|
-
Natively, Word does not use `<ol>`, `<ul>`, or `<dl>` lists in its HTML exports at all: it uses paragraphs styled with list styles. If you save a Word document as HTML in order to use its CSS for Word documents generated by HTML, those styles will still work (with the caveat that you will need to extract the `@list` style specific to ordered and unordered lists, and pass it as a `liststyles` parameter to the conversion).
|
154
|
+
Natively, Word does not use `<ol>`, `<ul>`, or `<dl>` lists in its HTML exports at all: it uses paragraphs styled with list styles. If you save a Word document as HTML in order to use its CSS for Word documents generated by HTML, those styles will still work (with the caveat that you will need to extract the `@list` style specific to ordered and unordered lists, and pass it as a `liststyles` parameter to the conversion). The gem will duplicate the ordered list style definition to provide new styles, in order to deal with custom numbering.
|
155
|
+
|
156
|
+
Word HTML understands `<ol>, <ul>, <li>`, but its rendering is fragile: in particular, any instance of `<p>` within a `<li>` is treated as a new list item (so Word HTML will not let you have multi-paragraph list items if you use native HTML). This gem now exports lists as Word HTML prefers to see them, with `MsoListParagraphCxSpFirst, MsoListParagraphCxSpMiddle, MsoListParagraphCxSpLast` styles. You will need to include these in the CSS stylesheet you supply, in order to get the right indentation for lists.
|
154
157
|
|
155
158
|
== Example
|
156
159
|
|
data/lib/html2doc/base.rb
CHANGED
@@ -13,7 +13,7 @@ class Html2Doc
|
|
13
13
|
@imagedir = hash[:imagedir]
|
14
14
|
@debug = hash[:debug]
|
15
15
|
@liststyles = hash[:liststyles]
|
16
|
-
@stylesheet = hash[:stylesheet]
|
16
|
+
@stylesheet = read_stylesheet(hash[:stylesheet])
|
17
17
|
@c = HTMLEntities.new
|
18
18
|
end
|
19
19
|
|
@@ -74,8 +74,7 @@ class Html2Doc
|
|
74
74
|
end
|
75
75
|
|
76
76
|
def locate_landscape(_docxml)
|
77
|
-
|
78
|
-
@landscape = css.scan(/div\.\S+\s+\{\s*page:\s*[^;]+L;\s*\}/m)
|
77
|
+
@landscape = @stylesheet.scan(/div\.\S+\s+\{\s*page:\s*[^;]+L;\s*\}/m)
|
79
78
|
.map { |e| e.sub(/^div\.(\S+).*$/m, "\\1") }
|
80
79
|
end
|
81
80
|
|
@@ -99,11 +98,9 @@ class Html2Doc
|
|
99
98
|
end
|
100
99
|
end
|
101
100
|
|
102
|
-
def stylesheet(_filename, _header_filename,
|
103
|
-
stylesheet =
|
101
|
+
def stylesheet(_filename, _header_filename, _cssname)
|
102
|
+
stylesheet = "#{@stylesheet}\n#{@newliststyledefs}"
|
104
103
|
xml = Nokogiri::XML("<style/>")
|
105
|
-
# s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
|
106
|
-
# xml.children.first << Nokogiri::XML::Comment.new(xml, s)
|
107
104
|
xml.children.first << Nokogiri::XML::CDATA
|
108
105
|
.new(xml, "\n<!--\n#{stylesheet}\n-->\n")
|
109
106
|
xml.root.to_s
|
data/lib/html2doc/lists.rb
CHANGED
@@ -4,8 +4,7 @@ require "nokogiri"
|
|
4
4
|
|
5
5
|
class Html2Doc
|
6
6
|
def style_list(elem, level, liststyle, listnumber)
|
7
|
-
|
8
|
-
|
7
|
+
liststyle or return
|
9
8
|
if elem["style"]
|
10
9
|
elem["style"] += ";"
|
11
10
|
else
|
@@ -30,16 +29,37 @@ class Html2Doc
|
|
30
29
|
|
31
30
|
def list_add(xpath, liststyles, listtype, level)
|
32
31
|
xpath.each do |l|
|
33
|
-
level == 1
|
32
|
+
level == 1 && l["seen"] = true and @listnumber += 1
|
34
33
|
l["id"] ||= UUIDTools::UUID.random_create
|
34
|
+
liststyle = derive_liststyle(l, liststyles[listtype], level)
|
35
35
|
(l.xpath(".//li") - l.xpath(".//ol//li | .//ul//li")).each do |li|
|
36
|
-
style_list(li, level,
|
36
|
+
style_list(li, level, liststyle, @listnumber)
|
37
37
|
list_add1(li, liststyles, listtype, level)
|
38
38
|
end
|
39
39
|
list_add_tail(l, liststyles, listtype, level)
|
40
40
|
end
|
41
41
|
end
|
42
42
|
|
43
|
+
def derive_liststyle(list, liststyle, level)
|
44
|
+
list["start"] && list["start"] != "1" or return liststyle
|
45
|
+
@liststyledefsidx += 1
|
46
|
+
ret = "l#{@liststyledefsidx}"
|
47
|
+
@newliststyledefs += newliststyle(list["start"], liststyle, ret, level)
|
48
|
+
ret
|
49
|
+
end
|
50
|
+
|
51
|
+
def newliststyle(start, liststyle, newstylename, level)
|
52
|
+
s = @liststyledefs[liststyle]
|
53
|
+
.gsub(/@list\s+#{liststyle}/, "@list #{newstylename}")
|
54
|
+
.sub(/@list\s+#{newstylename}\s+\{[^}]*\}/m, <<~LISTSTYLE)
|
55
|
+
@list #{newstylename}\n{mso-list-id:#{rand(100_000_000..999_999_999)};
|
56
|
+
mso-list-template-ids:#{rand(100_000_000..999_999_999)};}
|
57
|
+
LISTSTYLE
|
58
|
+
.sub(/@list\s+#{newstylename}:level#{level}\s+\{/m,
|
59
|
+
"\\0mso-level-start-at:#{start};\n")
|
60
|
+
"#{s}\n"
|
61
|
+
end
|
62
|
+
|
43
63
|
def list_add_tail(list, liststyles, listtype, level)
|
44
64
|
list.xpath(".//ul[not(ancestor::li/ancestor::*/@id = '#{list['id']}')] | "\
|
45
65
|
".//ol[not(ancestor::li/ancestor::*/@id = '#{list['id']}')]")
|
@@ -49,16 +69,15 @@ class Html2Doc
|
|
49
69
|
end
|
50
70
|
|
51
71
|
def list2para(list)
|
52
|
-
|
53
|
-
|
72
|
+
list.xpath("./li").empty? and return
|
54
73
|
list2para_position(list)
|
55
74
|
list.xpath("./li").each do |l|
|
56
75
|
l.name = "p"
|
57
76
|
l["class"] ||= "MsoListParagraphCxSpMiddle"
|
58
|
-
|
59
|
-
|
77
|
+
l.first_element_child&.name == "p" or next
|
60
78
|
l["style"] ||= ""
|
61
|
-
l["style"] +=
|
79
|
+
l["style"] += l.first_element_child["style"]
|
80
|
+
&.sub(/mso-list[^;]+;/, "") || ""
|
62
81
|
l.first_element_child.replace(l.first_element_child.children)
|
63
82
|
end
|
64
83
|
list.replace(list.children)
|
@@ -100,12 +119,82 @@ class Html2Doc
|
|
100
119
|
end
|
101
120
|
|
102
121
|
def lists(docxml, liststyles)
|
103
|
-
|
104
|
-
|
105
|
-
@listnumber = 0
|
122
|
+
liststyles.nil? and return
|
123
|
+
parse_stylesheet_line_styles
|
106
124
|
liststyles.each_key { |k| lists1(docxml, liststyles, k) }
|
107
125
|
lists_unstyled(docxml, liststyles)
|
108
126
|
liststyles.has_key?(:ul) and docxml.xpath("//ul").each { |u| list2para(u) }
|
109
127
|
liststyles.has_key?(:ol) and docxml.xpath("//ol").each { |u| list2para(u) }
|
110
128
|
end
|
129
|
+
|
130
|
+
def parse_stylesheet_line_styles
|
131
|
+
@listnumber = 0
|
132
|
+
result = process_stylesheet_lines(@stylesheet.split("\n"))
|
133
|
+
@liststyledefs = clean_result_content(result)
|
134
|
+
@newliststyledefs = ""
|
135
|
+
@liststyledefsidx = @liststyledefs.keys.map do |k|
|
136
|
+
k.sub(/^.*(\d+)$/, "\\1").to_i
|
137
|
+
end.max
|
138
|
+
end
|
139
|
+
|
140
|
+
private
|
141
|
+
|
142
|
+
def extract_list_name(line)
|
143
|
+
match = line.match(/^\s*@list\s+([^:\s]+)(?::.*)?/)
|
144
|
+
match ? match[1] : nil
|
145
|
+
end
|
146
|
+
|
147
|
+
def list_declaration?(line)
|
148
|
+
!extract_list_name(line).nil?
|
149
|
+
end
|
150
|
+
|
151
|
+
def save_current_list(result, current_base, current_content)
|
152
|
+
current_base.nil? || current_content.empty? and return result
|
153
|
+
if result[current_base]
|
154
|
+
result[current_base] += current_content
|
155
|
+
else
|
156
|
+
result[current_base] = current_content
|
157
|
+
end
|
158
|
+
result
|
159
|
+
end
|
160
|
+
|
161
|
+
def process_stylesheet_lines(lines)
|
162
|
+
result = {}
|
163
|
+
current_base = nil
|
164
|
+
current_content = ""
|
165
|
+
parsing_active = false
|
166
|
+
|
167
|
+
lines.each do |line|
|
168
|
+
if list_declaration?(line)
|
169
|
+
base_name = extract_list_name(line)
|
170
|
+
if current_base == base_name
|
171
|
+
current_content += "#{line}\n"
|
172
|
+
else
|
173
|
+
# save accumulated list style definition, new list style
|
174
|
+
save_current_list(result, current_base, current_content)
|
175
|
+
current_base = base_name
|
176
|
+
current_content = "#{line}\n"
|
177
|
+
end
|
178
|
+
parsing_active = true
|
179
|
+
|
180
|
+
elsif parsing_active && line.include?("}")
|
181
|
+
# End of current block - add this line and stop parsing
|
182
|
+
current_content += "#{line}\n"
|
183
|
+
parsing_active = false
|
184
|
+
|
185
|
+
elsif parsing_active
|
186
|
+
# Continue adding content while parsing is active
|
187
|
+
current_content += "#{line}\n"
|
188
|
+
end
|
189
|
+
# If parsing_active is false and no @list declaration, skip the line
|
190
|
+
end
|
191
|
+
# Save the last list if we were still parsing
|
192
|
+
save_current_list(result, current_base, current_content)
|
193
|
+
result
|
194
|
+
end
|
195
|
+
|
196
|
+
def clean_result_content(result)
|
197
|
+
result.each { |k, v| result[k] = v.rstrip }
|
198
|
+
result
|
199
|
+
end
|
111
200
|
end
|
data/lib/html2doc/mime.rb
CHANGED
@@ -135,8 +135,7 @@ class Html2Doc
|
|
135
135
|
# Scan both @stylesheet and docxml.to_xml (where @standardstylesheet has ended up)
|
136
136
|
# Allow 0.9 * height to fit caption
|
137
137
|
def page_dimensions(docxml)
|
138
|
-
|
139
|
-
page_size = find_page_size_in_doc(stylesheet, docxml.to_xml) or
|
138
|
+
page_size = find_page_size_in_doc(@stylesheet, docxml.to_xml) or
|
140
139
|
return [680, 400]
|
141
140
|
m_size = /size:\s*(\S+)\s+(\S+)\s*;/.match(page_size) or return [680, 400]
|
142
141
|
m_marg = /margin:\s*(\S+)\s+(\S+)\s*(\S+)\s*(\S+)\s*;/.match(page_size) or
|
data/lib/html2doc/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2doc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.9.2
|
4
|
+
version: 1.10.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-07-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: base64
|