html2doc 1.7.1 → 1.7.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aa6616fc156cf3cfe7fa3c47551f5630c359688b584fa639f8d6d93d08d1170d
4
- data.tar.gz: 86bba61be244042f3f56094b053b6a9bd2fc1afb74a8e31f481e91aff063a3b0
3
+ metadata.gz: e9ccbcf331056dfa6edf4c3a028a1327751695589a3881ba34631fcfe0840ece
4
+ data.tar.gz: 50146c87c3c2b49942449917e2e549fa2d998c0d3713ce6ae4594046beb25971
5
5
  SHA512:
6
- metadata.gz: dcb0a834ef0168e4e595216a7efcd3ec4a2aa3801a82bf96298fe0c2d6e37ade361c91369e17d27dc9f19f94062eda258cc47b80aeb64f96a8bdeb7a0f494c73
7
- data.tar.gz: 155dfd1fb4b594ccc9dd3c4442bd62977a9c0b2e48d503efa674a12d0797f1ef4a77d9a55cba25a547b49476ceed814433084ffff41d41a43888573fa2dd7d1d
6
+ metadata.gz: b6d9e333ea11bf0fbfb0021a115c2359a949af78254bec227a11c144e1a9316353163ac15302be8769f744c62b0273602001013dcbf07e6f85ffde3e45cf6abc
7
+ data.tar.gz: 4f51999d644f50dc08ab70980418d660d8c78baba8ce58b2ed957ddb8fac41cefc1310d1078dea5150270f32c0d76f90f51f7467f6a2c5006efcbe245ec5622f
data/Gemfile CHANGED
@@ -4,12 +4,6 @@ Encoding.default_internal = Encoding::UTF_8
4
4
  source "https://rubygems.org"
5
5
  git_source(:github) { |repo| "https://github.com/#{repo}" }
6
6
 
7
- group :development, :test do
8
- gem "rspec"
9
- end
10
-
11
- if File.exist? "Gemfile.devel"
12
- eval File.read("Gemfile.devel"), nil, "Gemfile.devel" # rubocop:disable Security/Eval
13
- end
14
-
15
7
  gemspec
8
+
9
+ eval_gemfile("Gemfile.devel") rescue nil
data/README.adoc CHANGED
@@ -24,7 +24,7 @@ The gem currently does the following:
24
24
 
25
25
  * Convert any AsciiMath and MathML to Word's native mathematical formatting language, OOXML. Word supports copy-pasting MathML into Word and converting it into OOXML; however the conversion is not infallible (we have in the past found problems with `\sum`: Word claimed parameters were missing, and inserted dotted squares to indicate as much), and you may need to post-edit the OOXML.
26
26
  ** The gem does attempt to repair the MathML input, to bring it in line with Word's OOXML's expectations. If you find any issues with AsciiMath or MathML input, please raise an issue.
27
- * Identify any footnotes in the document (defined as hyperlinks with attributes `class = "Footnote"` or `epub:type = "footnote"`), and render them as Microsoft Word footnotes.
27
+ * Identify any footnotes in the document (defined as hyperlinks with attributes `class = "Footnote"` or `epub:type = "footnote"`), and render them as Microsoft Word footnotes.
28
28
  ** The corresponding footnote content is any `div` or `aside` element with the same `@id` attribute as the footnote points to; e.g. `<a href="#ftn1" epub:type="footnote"><sup>3</sup></a></span>`, pointing to `<aside id="ftn3">`.
29
29
  ** By default, the footnote hyperlink contents are overwritten with the autonumbering element: `<a href="#ftn1" epub:type="footnote"><sup>1</sup></a>` is replaced with `<a style='mso-footnote-id:ftn1' href='#_ftn1' name='_ftnref1' title='' id='_ftnref1'><span class='MsoFootnoteReference'><span style='mso-special-character:footnote'/></span>`
30
30
  ** If the footnote hyperlink already contains (as a child) an element marked up as `<span class='MsoFootnoteReference'>`, only that span is replaced by the Microsoft autonumber element; any text surrounding it is preserved in both the footnote reference and the footnote target. For example, `<a href="#ftn1" epub:type="footnote"><span class='MsoFootnoteReference'>1</span>)</a>` will render as the footnote _1)_, both in the link and the target.
@@ -116,22 +116,42 @@ The bad news is that Word's understanding of HTML is HTML 4. In order for bookma
116
116
 
117
117
  The good news with generating a Word document via HTML is that Word understands CSS, and you can determine much of what the Word document looks like by manipulating that CSS. That extends to features that are not part of HTML CSS: if you want to work out how to get Word to do something in CSS, save a Word document that already does what you want as HTML, and inspect the HTML and CSS you get.
118
118
 
119
- The bad news is that Word's implementation of CSS is poorly documented -- even if Office HTML is documented in a 1300 page document (online at https://stigmortenmyre.no/mso/, https://www.rodriguezcommaj.com/assets/resources/microsoft-office-html-and-xml-reference.pdf), and the CSS selectors are only partially and selectively implemented. For list styles, for example, `mso-level-text` governs how the list label is displayed; but it is only recognised in a `@list` style: it is ignored in a CSS rule like `ol li`, or in a `style` attribute on a node. CSS selectors only support classes, in ancestor relations: `p.class1 ol.class2` is supported, but `#id1` is not, and neither is `p > ol`. Working out the right CSS for what you want will take some trial and error, and you are better placed to try to do things Word's way than the right way.
119
+ The bad news is that Word's implementation of CSS is poorly documented -- even
120
+ if Office HTML is documented in a 1300 page document (online
121
+ https://stigmortenmyre.no/mso/[here] and
122
+ https://www.rodriguezcommaj.com/assets/resources/microsoft-office-html-and-xml-reference.pdf[here]),
123
+ and the CSS selectors are only partially and selectively implemented. For list
124
+ styles, for example, `mso-level-text` governs how the list label is displayed;
125
+ but it is only recognised in a `@list` style: it is ignored in a CSS rule like
126
+ `ol li`, or in a `style` attribute on a node. CSS selectors only support
127
+ classes, in ancestor relations: `p.class1 ol.class2` is supported, but `#id1` is
128
+ not, and neither is `p > ol`. Working out the right CSS for what you want will
129
+ take some trial and error, and you are better placed to try to do things Word's
130
+ way than the right way.
120
131
 
121
- === XSLT
132
+ === Math
122
133
 
123
- This gem is published with an early draft of the XSLT stylesheet transforming MathML into OOXML, `mml2omml.xsl`, that has published for several years now as part of the https://github.com/TEIC/Stylesheets[TEI stylesheet set]. (We have made some further minor edits to the stylesheet.) The stylesheets have been published under a dual Creative Commons Sharealike/BSD licence.
134
+ Word uses OMML instead of W3C's MathML, which is now the de facto standard for XML
135
+ math representation.
124
136
 
125
- The good news is that the stylesheet is not identical to the stylesheet `mathml2omml.xsl` that is published with Microsoft Word, so it can and has been redistributed.
137
+ The https://github.com/plurimath/plurimath[Plurimath gem] is used to convert
138
+ Metanorma's MathML into OMML.
139
+
140
+ NOTE: Previously `html2doc` used a modified, early draft of the XSLT stylesheet
141
+ `mml2omml.xsl`, published by the
142
+ https://github.com/TEIC/Stylesheets[TEI stylesheet set] (CC/BSD licensed).
143
+
144
+ === Math Positioning
145
+
146
+ By default, mathematical formulas that are the only content of their paragraph
147
+ are rendered as centered in Word. If you want your AsciiMath or MathML to be
148
+ left-aligned or right-aligned, add `style="text-align:left"` or
149
+ `style="text-align:right"` to its ancestor `div`, `p` or `td` node in HTML.
126
150
 
127
- The bad news is that the stylesheet is not identical to the stylesheet `mathml2omml.xsl` that is published with Microsoft Word, so it isn't guaranteed to have identical output. If you want to make sure that your MathML import is identical to what Word currently uses, replace `mml2omml.xsl` with `mathml2omml.xsl`, and edit the gem accordingly for your local installation. On Windows, you will find the stylesheet in the same directory as the `winword.exe` executable. On Mac, right-click on the Word application, and select "Show Package Contents"; you will find the stylesheet under `Contents/Resources`.
128
151
 
129
152
  === Lists
130
153
  Natively, Word does not use `<ol>`, `<ul>`, or `<dl>` lists in its HTML exports at all: it uses paragraphs styled with list styles. If you save a Word document as HTML in order to use its CSS for Word documents generated by HTML, those styles will still work (with the caveat that you will need to extract the `@list` style specific to ordered and unordered lists, and pass it as a `liststyles` parameter to the conversion). Word HTML understands `<ol>, <ul>, <li>`, but its rendering is fragile: in particular, any instance of `<p>` within a `<li>` is treated as a new list item (so Word HTML will not let you have multi-paragraph list items if you use native HTML.) This gem now exports lists as Word HTML prefers to see them, with `MsoListParagraphCxSpFirst, MsoListParagraphCxSpMiddle, MsoListParagraphCxSpLast` styles. You will need to include these in the CSS stylesheet you supply, in order to get the right indentation for lists.
131
154
 
132
- === Math Positioning
133
- By default, mathematical formulas that are the only content of their paragraph are rendered as centered in Word. If you want your AsciiMath or MathML to be left-aligned or right-aligned, add `style="text-align:left"` or `style="text-align:right"` to its ancestor `div`, `p` or `td` node in HTML.
134
-
135
155
  == Example
136
156
 
137
157
  The `spec/examples` directory includes `rice.doc` and its source files: this Word document has been generated from `rice.html` through a call to html2doc from https://github.com/metanorma/metanorma-iso. (The source document `rice.html` was itself generated from Asciidoc, rather than being hand-crafted.)
data/html2doc.gemspec CHANGED
@@ -32,9 +32,10 @@ Gem::Specification.new do |spec|
32
32
  spec.add_dependency "mime-types"
33
33
  spec.add_dependency "nokogiri", "~> 1.15"
34
34
  spec.add_dependency "plane1converter", "~> 0.0.1"
35
- spec.add_dependency "plurimath", "~> 0.5.0"
35
+ spec.add_dependency "plurimath", "~> 0.7.0"
36
36
  spec.add_dependency "thread_safe"
37
37
  spec.add_dependency "uuidtools"
38
+ spec.add_dependency "unitsml"
38
39
 
39
40
  spec.add_development_dependency "debug"
40
41
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
data/lib/html2doc/base.rb CHANGED
@@ -15,9 +15,6 @@ class Html2Doc
15
15
  @liststyles = hash[:liststyles]
16
16
  @stylesheet = hash[:stylesheet]
17
17
  @c = HTMLEntities.new
18
- @xsltemplate =
19
- Nokogiri::XSLT(File.read(File.join(File.dirname(__FILE__), "mml2omml.xsl"),
20
- encoding: "utf-8"))
21
18
  end
22
19
 
23
20
  def process(result)
data/lib/html2doc/math.rb CHANGED
@@ -38,13 +38,13 @@ class Html2Doc
38
38
 
39
39
  # random fixes to MathML input that OOXML needs to render properly
40
40
  def ooxml_cleanup(math, docnamespaces)
41
- #encode_math(
42
- unwrap_accents(
43
- mathml_preserve_space(
44
- mathml_insert_rows(math, docnamespaces), docnamespaces
45
- ),
46
- )
47
- #)
41
+ # encode_math(
42
+ unwrap_accents(
43
+ mathml_preserve_space(
44
+ mathml_insert_rows(math, docnamespaces), docnamespaces
45
+ ),
46
+ )
47
+ # )
48
48
  math.add_namespace(nil, MATHML_NS)
49
49
  math
50
50
  end
@@ -165,10 +165,9 @@ class Html2Doc
165
165
  def mathml_to_ooml1(xml, docnamespaces)
166
166
  doc = Nokogiri::XML::Document::new
167
167
  doc.root = ooxml_cleanup(xml, docnamespaces)
168
- # ooxml = @xsltemplate.transform(doc)
169
- d = xml.parent["block"] != "false" # display_style
170
- ooxml = Nokogiri::XML(Plurimath::Math.parse(doc.to_xml(indent: 0),
171
- :mathml).to_omml)
168
+ # d = xml.parent["block"] != "false" # display_style
169
+ ooxml = Nokogiri::XML(Plurimath::Math
170
+ .parse(doc.to_xml(indent: 0), :mathml).to_omml(split_on_linebreak: true))
172
171
  ooxml = unitalic(accent_tr(ooxml))
173
172
  ooxml = ooml_clean(uncenter(xml, ooxml))
174
173
  xml.swap(ooxml)
@@ -202,9 +201,10 @@ class Html2Doc
202
201
  x.text.strip.empty?
203
202
  end
204
203
 
205
- def math_block?(_ooxml, mathml)
204
+ def math_block?(ooxml, mathml)
206
205
  # ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
207
- mathml["displaystyle"] == "true"
206
+ mathml["displaystyle"] == "true" &&
207
+ ooxml.xpath("./m:oMath", "m" => OOXML_NS).size <= 1
208
208
  end
209
209
 
210
210
  STYLE_BEARING_NODE =
@@ -228,11 +228,9 @@ class Html2Doc
228
228
 
229
229
  def uncenter_unneeded(math, ooxml, alignnode)
230
230
  (math_block?(ooxml, math) || !alignnode) and return ooxml
231
- if !math_only_para?(alignnode)
232
- ooxml.name == "oMathPara" and
233
- ooxml = ooxml.elements.detect { |x| x.name == "oMath" }
234
- return ooxml
235
- end
236
- nil
231
+ math_only_para?(alignnode) and return nil
232
+ ooxml.name == "oMathPara" and
233
+ ooxml = ooxml.elements.select { |x| %w(oMath r).include?(x.name) }
234
+ ooxml.size > 1 ? nil : Nokogiri::XML::NodeSet.new(math.document, ooxml)
237
235
  end
238
236
  end
@@ -1,3 +1,3 @@
1
1
  class Html2Doc
2
- VERSION = "1.7.1".freeze
2
+ VERSION = "1.7.2".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.7.1
4
+ version: 1.7.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-11-27 00:00:00.000000000 Z
11
+ date: 2023-12-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -100,14 +100,14 @@ dependencies:
100
100
  requirements:
101
101
  - - "~>"
102
102
  - !ruby/object:Gem::Version
103
- version: 0.5.0
103
+ version: 0.7.0
104
104
  type: :runtime
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
- version: 0.5.0
110
+ version: 0.7.0
111
111
  - !ruby/object:Gem::Dependency
112
112
  name: thread_safe
113
113
  requirement: !ruby/object:Gem::Requirement
@@ -136,6 +136,20 @@ dependencies:
136
136
  - - ">="
137
137
  - !ruby/object:Gem::Version
138
138
  version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: unitsml
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
139
153
  - !ruby/object:Gem::Dependency
140
154
  name: debug
141
155
  requirement: !ruby/object:Gem::Requirement
@@ -301,9 +315,7 @@ files:
301
315
  - lib/html2doc/base.rb
302
316
  - lib/html2doc/lists.rb
303
317
  - lib/html2doc/math.rb
304
- - lib/html2doc/mathml2omml.xsl
305
318
  - lib/html2doc/mime.rb
306
- - lib/html2doc/mml2omml.xsl
307
319
  - lib/html2doc/notes.rb
308
320
  - lib/html2doc/version.rb
309
321
  - lib/html2doc/wordstyle.css