coradoc 0.2.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.docker/Dockerfile +1 -1
- data/.docker/docker-compose.yml +2 -2
- data/.editorconfig +15 -0
- data/CHANGELOG.md +4 -0
- data/README.md +4 -0
- data/Rakefile +10 -0
- data/coradoc.gemspec +11 -2
- data/exe/reverse_adoc +91 -0
- data/exe/w2a +72 -0
- data/lib/coradoc/document.rb +6 -6
- data/lib/coradoc/element/admonition.rb +8 -6
- data/lib/coradoc/element/attribute.rb +2 -2
- data/lib/coradoc/element/attribute_list.rb +94 -15
- data/lib/coradoc/element/audio.rb +14 -3
- data/lib/coradoc/element/author.rb +18 -14
- data/lib/coradoc/element/base.rb +69 -8
- data/lib/coradoc/element/block/core.rb +10 -6
- data/lib/coradoc/element/block/literal.rb +1 -1
- data/lib/coradoc/element/block/quote.rb +1 -1
- data/lib/coradoc/element/block/sourcecode.rb +2 -2
- data/lib/coradoc/element/break.rb +1 -1
- data/lib/coradoc/element/document_attributes.rb +6 -6
- data/lib/coradoc/element/header.rb +4 -2
- data/lib/coradoc/element/image/block_image.rb +13 -2
- data/lib/coradoc/element/image/core.rb +35 -5
- data/lib/coradoc/element/image/inline_image.rb +2 -2
- data/lib/coradoc/element/image.rb +0 -1
- data/lib/coradoc/element/inline/anchor.rb +4 -2
- data/lib/coradoc/element/inline/bold.rb +10 -4
- data/lib/coradoc/element/inline/cross_reference.rb +4 -2
- data/lib/coradoc/element/inline/hard_line_break.rb +1 -1
- data/lib/coradoc/element/inline/highlight.rb +12 -6
- data/lib/coradoc/element/inline/italic.rb +10 -4
- data/lib/coradoc/element/inline/link.rb +26 -10
- data/lib/coradoc/element/inline/monospace.rb +10 -4
- data/lib/coradoc/element/inline/quotation.rb +4 -1
- data/lib/coradoc/element/inline/subscript.rb +5 -2
- data/lib/coradoc/element/inline/superscript.rb +5 -2
- data/lib/coradoc/element/inline.rb +0 -1
- data/lib/coradoc/element/list/core.rb +10 -8
- data/lib/coradoc/element/list/definition.rb +19 -0
- data/lib/coradoc/element/list/ordered.rb +1 -1
- data/lib/coradoc/element/list/unordered.rb +1 -1
- data/lib/coradoc/element/list.rb +1 -1
- data/lib/coradoc/element/list_item.rb +9 -4
- data/lib/coradoc/element/list_item_definition.rb +32 -0
- data/lib/coradoc/element/paragraph.rb +5 -3
- data/lib/coradoc/element/revision.rb +20 -16
- data/lib/coradoc/element/section.rb +21 -4
- data/lib/coradoc/element/table.rb +36 -19
- data/lib/coradoc/element/text_element.rb +63 -17
- data/lib/coradoc/element/title.rb +27 -7
- data/lib/coradoc/element/video.rb +33 -6
- data/lib/coradoc/generator.rb +2 -2
- data/lib/coradoc/legacy_parser.rb +41 -41
- data/lib/coradoc/oscal.rb +2 -4
- data/lib/coradoc/parser/asciidoc/content.rb +15 -15
- data/lib/coradoc/parser/asciidoc/document_attributes.rb +1 -1
- data/lib/coradoc/parser/asciidoc/header.rb +6 -6
- data/lib/coradoc/parser/asciidoc/section.rb +1 -1
- data/lib/coradoc/reverse_adoc/LICENSE.txt +25 -0
- data/lib/coradoc/reverse_adoc/README.adoc +308 -0
- data/lib/coradoc/reverse_adoc/cleaner.rb +125 -0
- data/lib/coradoc/reverse_adoc/config.rb +73 -0
- data/lib/coradoc/reverse_adoc/converters/a.rb +47 -0
- data/lib/coradoc/reverse_adoc/converters/aside.rb +12 -0
- data/lib/coradoc/reverse_adoc/converters/audio.rb +25 -0
- data/lib/coradoc/reverse_adoc/converters/base.rb +104 -0
- data/lib/coradoc/reverse_adoc/converters/blockquote.rb +18 -0
- data/lib/coradoc/reverse_adoc/converters/br.rb +11 -0
- data/lib/coradoc/reverse_adoc/converters/bypass.rb +77 -0
- data/lib/coradoc/reverse_adoc/converters/code.rb +19 -0
- data/lib/coradoc/reverse_adoc/converters/div.rb +14 -0
- data/lib/coradoc/reverse_adoc/converters/dl.rb +55 -0
- data/lib/coradoc/reverse_adoc/converters/drop.rb +22 -0
- data/lib/coradoc/reverse_adoc/converters/em.rb +17 -0
- data/lib/coradoc/reverse_adoc/converters/figure.rb +21 -0
- data/lib/coradoc/reverse_adoc/converters/h.rb +38 -0
- data/lib/coradoc/reverse_adoc/converters/head.rb +19 -0
- data/lib/coradoc/reverse_adoc/converters/hr.rb +11 -0
- data/lib/coradoc/reverse_adoc/converters/ignore.rb +16 -0
- data/lib/coradoc/reverse_adoc/converters/img.rb +98 -0
- data/lib/coradoc/reverse_adoc/converters/li.rb +13 -0
- data/lib/coradoc/reverse_adoc/converters/mark.rb +15 -0
- data/lib/coradoc/reverse_adoc/converters/markup.rb +27 -0
- data/lib/coradoc/reverse_adoc/converters/math.rb +31 -0
- data/lib/coradoc/reverse_adoc/converters/ol.rb +60 -0
- data/lib/coradoc/reverse_adoc/converters/p.rb +19 -0
- data/lib/coradoc/reverse_adoc/converters/pass_through.rb +13 -0
- data/lib/coradoc/reverse_adoc/converters/pre.rb +51 -0
- data/lib/coradoc/reverse_adoc/converters/q.rb +12 -0
- data/lib/coradoc/reverse_adoc/converters/strong.rb +16 -0
- data/lib/coradoc/reverse_adoc/converters/sub.rb +18 -0
- data/lib/coradoc/reverse_adoc/converters/sup.rb +18 -0
- data/lib/coradoc/reverse_adoc/converters/table.rb +280 -0
- data/lib/coradoc/reverse_adoc/converters/td.rb +77 -0
- data/lib/coradoc/reverse_adoc/converters/text.rb +28 -0
- data/lib/coradoc/reverse_adoc/converters/th.rb +14 -0
- data/lib/coradoc/reverse_adoc/converters/tr.rb +18 -0
- data/lib/coradoc/reverse_adoc/converters/video.rb +25 -0
- data/lib/coradoc/reverse_adoc/converters.rb +53 -0
- data/lib/coradoc/reverse_adoc/errors.rb +10 -0
- data/lib/coradoc/reverse_adoc/html_converter.rb +150 -0
- data/lib/coradoc/reverse_adoc/plugin.rb +131 -0
- data/lib/coradoc/reverse_adoc/plugins/plateau.rb +174 -0
- data/lib/coradoc/reverse_adoc/postprocessor.rb +148 -0
- data/lib/coradoc/reverse_adoc.rb +30 -0
- data/lib/coradoc/transformer.rb +24 -14
- data/lib/coradoc/version.rb +1 -1
- data/lib/reverse_adoc.rb +20 -0
- metadata +184 -5
- data/lib/coradoc/element/inline/image.rb +0 -25
@@ -50,13 +50,13 @@ module Coradoc
|
|
50
50
|
block_style("=")
|
51
51
|
end
|
52
52
|
|
53
|
-
def block_style(delimiter="*", repeater = 4)
|
53
|
+
def block_style(delimiter = "*", repeater = 4)
|
54
54
|
block_title.maybe >>
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
55
|
+
newline.maybe >>
|
56
|
+
block_type.maybe >>
|
57
|
+
str(delimiter).repeat(repeater).as(:delimiter) >> newline >>
|
58
|
+
text_line.repeat(1).as(:lines) >>
|
59
|
+
str(delimiter).repeat(repeater) >> newline
|
60
60
|
end
|
61
61
|
|
62
62
|
def block_type
|
@@ -65,7 +65,7 @@ module Coradoc
|
|
65
65
|
|
66
66
|
def highlight
|
67
67
|
text_id >> newline >>
|
68
|
-
|
68
|
+
underline >> highlight_text >> newline
|
69
69
|
end
|
70
70
|
|
71
71
|
def underline
|
@@ -79,14 +79,14 @@ module Coradoc
|
|
79
79
|
# Table
|
80
80
|
def table
|
81
81
|
block_title >>
|
82
|
-
|
83
|
-
|
84
|
-
|
82
|
+
str("|===") >> line_ending >>
|
83
|
+
table_row.repeat(1).as(:rows) >>
|
84
|
+
str("|===") >> line_ending
|
85
85
|
end
|
86
86
|
|
87
87
|
def table_row
|
88
|
-
(literal_space? >> str("|") >> (cell_content | empty_cell_content))
|
89
|
-
repeat(1).as(:cols) >> line_ending
|
88
|
+
(literal_space? >> str("|") >> (cell_content | empty_cell_content))
|
89
|
+
.repeat(1).as(:cols) >> line_ending
|
90
90
|
end
|
91
91
|
|
92
92
|
def empty_cell_content
|
@@ -98,7 +98,7 @@ module Coradoc
|
|
98
98
|
end
|
99
99
|
|
100
100
|
def literal_space
|
101
|
-
(match[
|
101
|
+
(match[" "] | match[' \t']).repeat(1)
|
102
102
|
end
|
103
103
|
|
104
104
|
# Override
|
@@ -113,7 +113,7 @@ module Coradoc
|
|
113
113
|
# Text
|
114
114
|
def text_line
|
115
115
|
(asciidoc_char_with_id.absent? | text_id) >> literal_space? >>
|
116
|
-
|
116
|
+
text.as(:text) >> line_ending.as(:break)
|
117
117
|
end
|
118
118
|
|
119
119
|
def asciidoc_char
|
@@ -137,7 +137,7 @@ module Coradoc
|
|
137
137
|
|
138
138
|
def glossary
|
139
139
|
keyword.as(:key) >> str("::") >> space? >>
|
140
|
-
|
140
|
+
text.as(:value) >> line_ending.as(:break)
|
141
141
|
end
|
142
142
|
|
143
143
|
def ordered_list
|
@@ -8,8 +8,8 @@ module Coradoc
|
|
8
8
|
|
9
9
|
def header
|
10
10
|
header_title >>
|
11
|
-
|
12
|
-
|
11
|
+
author.maybe.as(:author) >>
|
12
|
+
revision.maybe.as(:revision) >> newline.maybe
|
13
13
|
end
|
14
14
|
|
15
15
|
def header_title
|
@@ -18,14 +18,14 @@ module Coradoc
|
|
18
18
|
|
19
19
|
def author
|
20
20
|
words.as(:first_name) >> str(",") >>
|
21
|
-
|
22
|
-
|
21
|
+
space? >> words.as(:last_name) >>
|
22
|
+
space? >> str("<") >> email.as(:email) >> str(">") >> newline
|
23
23
|
end
|
24
24
|
|
25
25
|
def revision
|
26
26
|
(word >> (str(".") >> word).maybe).as(:number) >>
|
27
|
-
|
28
|
-
|
27
|
+
str(",") >> space? >> date.as(:date) >> str(":") >>
|
28
|
+
space? >> words.as(:remark) >> newline
|
29
29
|
end
|
30
30
|
end
|
31
31
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
BSD 2-Clause License
|
2
|
+
|
3
|
+
Copyright (c) 2018, Ribose
|
4
|
+
All rights reserved.
|
5
|
+
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
8
|
+
|
9
|
+
* Redistributions of source code must retain the above copyright notice, this
|
10
|
+
list of conditions and the following disclaimer.
|
11
|
+
|
12
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
13
|
+
this list of conditions and the following disclaimer in the documentation
|
14
|
+
and/or other materials provided with the distribution.
|
15
|
+
|
16
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
17
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
18
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
19
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
20
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
21
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
22
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
23
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
24
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
@@ -0,0 +1,308 @@
|
|
1
|
+
= AsciiDoc from HTML and Microsoft Word: reverse_adoc
|
2
|
+
|
3
|
+
== Purpose
|
4
|
+
|
5
|
+
Transforms HTML and Microsoft Word into AsciiDoc.
|
6
|
+
|
7
|
+
Based on https://github.com/xijo/reverse_markdown
|
8
|
+
|
9
|
+
reverse_adoc used to be a separate Gem, but now it's part of Coradoc.
|
10
|
+
|
11
|
+
|
12
|
+
== Installation
|
13
|
+
|
14
|
+
Install the gem:
|
15
|
+
|
16
|
+
[source,console]
|
17
|
+
----
|
18
|
+
[sudo] gem install coradoc
|
19
|
+
----
|
20
|
+
|
21
|
+
or add it to your `Gemfile`:
|
22
|
+
|
23
|
+
[source,ruby]
|
24
|
+
----
|
25
|
+
gem 'coradoc'
|
26
|
+
----
|
27
|
+
|
28
|
+
|
29
|
+
== Command-line usage
|
30
|
+
|
31
|
+
=== HTML to AsciiDoc: `reverse_adoc`
|
32
|
+
|
33
|
+
Convert HTML files to AsciiDoc:
|
34
|
+
|
35
|
+
[source,console]
|
36
|
+
----
|
37
|
+
$ reverse_adoc file.html > file.adoc
|
38
|
+
$ cat file.html | reverse_adoc > file.adoc
|
39
|
+
----
|
40
|
+
|
41
|
+
|
42
|
+
=== Microsoft Word to AsciiDoc: `w2a`
|
43
|
+
|
44
|
+
Convert Word `.doc` or `.docx` files to AsciiDoc:
|
45
|
+
|
46
|
+
[source,console]
|
47
|
+
----
|
48
|
+
$ w2a file.docx > file.adoc
|
49
|
+
----
|
50
|
+
|
51
|
+
[source,console]
|
52
|
+
----
|
53
|
+
$ w2a input.docx -o output.adoc
|
54
|
+
----
|
55
|
+
|
56
|
+
Help:
|
57
|
+
|
58
|
+
[source,console]
|
59
|
+
----
|
60
|
+
$ w2a -h
|
61
|
+
Usage: w2a [options] <file>
|
62
|
+
-a, --mathml2asciimath Convert MathML to AsciiMath
|
63
|
+
-o, --output=FILENAME Output file to write to
|
64
|
+
-e, --external-images Export images if data URI
|
65
|
+
-v, --version Version information
|
66
|
+
-h, --help Prints this help
|
67
|
+
----
|
68
|
+
|
69
|
+
|
70
|
+
NOTE: `w2a` requires LibreOffice to be installed. It uses LibreOffice's
|
71
|
+
export to XHTML. LibreOffice's export of XHTML is superior to the native Microsoft Word export
|
72
|
+
to HTML: it exports lists (which Word keeps as paragraphs), and it exports OOMML into MathML.
|
73
|
+
On the other hand, the LibreOffice export relies on default styling being used in the
|
74
|
+
document, and it may not cope with ordered lists or headings with customised appearance.
|
75
|
+
For best results, reset the styles in the document you're converting to those in
|
76
|
+
the default `Normal.dot` template.
|
77
|
+
|
78
|
+
NOTE: `w2a` requires the command-line version of LibreOffice, `soffice`. As it turns out,
|
79
|
+
LibreOffice v6 appears to render formulae in HTML as images instead of MathML expressions;
|
80
|
+
use LibreOffice v5. If you have both LibreOffice v5 and LibreOffice v6 installed, make sure
|
81
|
+
that your OS path searches for the LibreOffice v5 version of `soffice` first; e.g. on Mac,
|
82
|
+
include something like `/Applications/LibreOffice5.4.7.2.app/Contents/MacOS` in your PATH
|
83
|
+
environment.
|
84
|
+
|
85
|
+
NOTE: Some information in OOMML is not preserved in the export to MathML from LibreOffice;
|
86
|
+
in particular, font shifts such as double-struck fonts.
|
87
|
+
The LibreOffice exporter does seem to drop some text (possibly associated with
|
88
|
+
MathML); use with caution.
|
89
|
+
|
90
|
+
NOTE: Adapted from `w2m` of
|
91
|
+
https://github.com/benbalter/word-to-markdown[Ben Balter's word-to-markdown]
|
92
|
+
|
93
|
+
|
94
|
+
=== Common options
|
95
|
+
|
96
|
+
|
97
|
+
==== MathML to AsciiMath conversion
|
98
|
+
|
99
|
+
If you wish to convert the MathML in the document to AsciiMath, run the script with the
|
100
|
+
`--mathml2asciimath` option:
|
101
|
+
|
102
|
+
[source,console]
|
103
|
+
----
|
104
|
+
$ w2a --mathml2asciimath document.docx > document.adoc
|
105
|
+
----
|
106
|
+
|
107
|
+
|
108
|
+
==== Extracting images
|
109
|
+
|
110
|
+
Images referenced by the HTML can be extracted into the destination output folder by using:
|
111
|
+
|
112
|
+
[source,console]
|
113
|
+
----
|
114
|
+
$ reverse_adoc input.html -o output/file.adoc -e
|
115
|
+
$ reverse_adoc input.html --output output/file.adoc --external-images
|
116
|
+
----
|
117
|
+
|
118
|
+
|
119
|
+
Word embedded images can be extracted into the destination output folder by using:
|
120
|
+
|
121
|
+
[source,console]
|
122
|
+
----
|
123
|
+
$ w2a input.docx -o output/file.adoc -e
|
124
|
+
$ w2a input.docx --output output/file.adoc --external-images
|
125
|
+
----
|
126
|
+
|
127
|
+
|
128
|
+
==== Handling unknown HTML tags
|
129
|
+
|
130
|
+
The `--unknown_tags` option allows you to specify how to handle unknown tags
|
131
|
+
(default `pass_through`).
|
132
|
+
|
133
|
+
Valid options are:
|
134
|
+
|
135
|
+
* `pass_through` - Include the unknown tag completely into the result
|
136
|
+
* `drop` - Drop the unknown tag and its content
|
137
|
+
* `bypass` - Ignore the unknown tag but try to convert its content
|
138
|
+
* `raise` - Raise an error to let you know
|
139
|
+
|
140
|
+
|
141
|
+
==== Tagging of borders
|
142
|
+
|
143
|
+
Specify how to handle tag borders with the option `--tag_border` (default `' '`).
|
144
|
+
|
145
|
+
Valid options are:
|
146
|
+
|
147
|
+
* `' '` - Add whitespace if there is none at tag borders.
|
148
|
+
* `''` - Do not add whitespace.
|
149
|
+
|
150
|
+
|
151
|
+
== Features
|
152
|
+
|
153
|
+
=== General
|
154
|
+
|
155
|
+
`reverse_adoc` shares features as a port of `reverse_markdown`:
|
156
|
+
|
157
|
+
* Module based -- if you miss a tag, just add it
|
158
|
+
* Can deal with nested lists
|
159
|
+
* Inline and block code is supported
|
160
|
+
* Supports blockquote
|
161
|
+
|
162
|
+
It supports the following HTML tags (these are supported by `reverse_markdown`):
|
163
|
+
|
164
|
+
* `a`
|
165
|
+
* `blockquote`
|
166
|
+
* `br`
|
167
|
+
* `code`, `tt` (added: `kbd`, `samp`, `var`)
|
168
|
+
* `div`, `article`
|
169
|
+
* `em`, `i` (added: `cite`)
|
170
|
+
* `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, `hr`
|
171
|
+
* `img`
|
172
|
+
* `li`, `ol`, `ul` (added: `dir`)
|
173
|
+
* `p`, `pre`
|
174
|
+
* `strong`, `b`
|
175
|
+
* `table`, `td`, `th`, `tr`
|
176
|
+
|
177
|
+
[NOTE]
|
178
|
+
====
|
179
|
+
* reverse_adoc does *not* support `del` or `strike`, because Asciidoctor does not out of the box.
|
180
|
+
* As with reverse_markdown, `pre` is only treated as sourcecode if it is contained in a `div@class = highlight-` element, or has a `@brush` attribute naming the language (Confluence).
|
181
|
+
* The gem does not support `p@align`, because Asciidoctor doesn't
|
182
|
+
====
|
183
|
+
|
184
|
+
In addition, it supports:
|
185
|
+
|
186
|
+
* `aside`
|
187
|
+
* `audio`, `video` (with `@src` attributes)
|
188
|
+
* `figure`, `figcaption`
|
189
|
+
* `mark`
|
190
|
+
* `q`
|
191
|
+
* `sub`, `sup`
|
192
|
+
* `@id` anchors
|
193
|
+
* `blockquote@cite`
|
194
|
+
* `img/@width`, `img/@height`
|
195
|
+
* `ol/@style`, `ol/@start`, `ol/@reversed`, `ul/@type`
|
196
|
+
* `td/@colspan`, `td/@rowspan`, `td/@align`, `td/@valign`
|
197
|
+
* `table/caption`, `table/@width`, `table/@frame` (partial), `table/@rules` (partial)
|
198
|
+
* Lists and paragraphs within cells
|
199
|
+
** Not tables within cells: Asciidoctor cannot deal with nested tables
|
200
|
+
|
201
|
+
The gem does not support:
|
202
|
+
|
203
|
+
* `col`, `colgroup`
|
204
|
+
* `source`, `picture`
|
205
|
+
* `bdi`, `bdo`, `ruby`, `rt`, `rp`, `wbr`
|
206
|
+
* `frame`, `frameset`, `iframe`, `noframes`, `noscript`, `script`, `input`, `output`, `progress`
|
207
|
+
* `map`, `canvas`, `dialog`, `embed`, `object`, `param`, `svg`, `track`
|
208
|
+
* `fieldset`, `button`, `datalist`, `form`, `label`, `legend`, `menu`, `menulist`, `optgroup`, `option`, `select`, `textarea`
|
209
|
+
* `big`, `dfn`, `font`, `s`, `small`, `span`, `strike`, `u`
|
210
|
+
* `center`
|
211
|
+
* `data`, `meter`
|
212
|
+
* `del`, `ins`
|
213
|
+
* `footer`, `header`, `main`, `nav`, `details`, `section`, `summary`, `template`
|
214
|
+
|
215
|
+
|
216
|
+
=== MathML support
|
217
|
+
|
218
|
+
If you are using this gem in the context of https://www.metanorma.com[Metanorma],
|
219
|
+
Metanorma AsciiDoc accepts MathML as a native mathematical format. So you do not need
|
220
|
+
to convert the MathML to AsciiMath.
|
221
|
+
|
222
|
+
The gem will optionally invoke the https://github.com/metanorma/mathml2asciimath
|
223
|
+
gem, to convert MathML to AsciiMath. The conversion is not perfect, and will need to be
|
224
|
+
post-edited; but it's a lot better than nothing.
|
225
|
+
|
226
|
+
NOTE: Asciidoctor does not support MathML input. HTML uses MathML.
|
227
|
+
The gem will recognize MathML expressions in HTML, and will wrap them in Asciidoctor
|
228
|
+
`stem:[ ]` macros. The result of this gem is not actually legal Asciidoctor for `stem`:
|
229
|
+
Asciidoctor will presumably
|
230
|
+
think this is AsciiMath in the `stem:[ ]` macro, try to pass it into MathJax as
|
231
|
+
AsciiMath, and fail. But of course, MathJax has no problem with MathML, and some postprocessing
|
232
|
+
on the Asciidoctor output can ensure that the MathML is treated by MathJax (or whatever else
|
233
|
+
uses the output) as such; so this is still much better than nothing for stem processing.
|
234
|
+
|
235
|
+
=== Word cleanup
|
236
|
+
|
237
|
+
This gem is routinely used in the Metanorma project to export Word documents to AsciiDoc.
|
238
|
+
The HTML export from Word that the gem uses, from LibreOffice, is much cleaner than the
|
239
|
+
native HTML 4 export from Word; but it has some infelicities which this gem cleans up:
|
240
|
+
|
241
|
+
* The HTML export has trouble with subscripts, and routinely exports them as headings; the `w2a`
|
242
|
+
script tries to clean them up.
|
243
|
+
* The `w2a` cleans up spaces, but it does not strip them.
|
244
|
+
* Spaces are removed from anchors and cross-references.
|
245
|
+
* Double underscores are removed from anchors and cross-references.
|
246
|
+
* Cross-references to `_GoBack` and to `_Toc` followed by numbers (used to construct tables of contents) are ignored.
|
247
|
+
|
248
|
+
== Ruby library usage
|
249
|
+
|
250
|
+
=== General
|
251
|
+
|
252
|
+
Simple to use.
|
253
|
+
|
254
|
+
[source,ruby]
|
255
|
+
----
|
256
|
+
require 'coradoc/reverse_adoc'
|
257
|
+
|
258
|
+
result = Coradoc::ReverseAdoc.convert input
|
259
|
+
result.inspect # " *feelings* "
|
260
|
+
----
|
261
|
+
|
262
|
+
=== Configure with options
|
263
|
+
|
264
|
+
Just pass your chosen configuration options in after the input. The given options will last for this operation only.
|
265
|
+
|
266
|
+
[source,ruby]
|
267
|
+
----
|
268
|
+
require 'coradoc/reverse_adoc'
|
269
|
+
|
270
|
+
Coradoc::ReverseAdoc.convert(input, unknown_tags: :raise, mathml2asciimath: true)
|
271
|
+
----
|
272
|
+
|
273
|
+
|
274
|
+
=== Preconfigure using an initializer
|
275
|
+
|
276
|
+
Or configure it block style on an initializer level. These configurations will last for all conversions until they are set to something different.
|
277
|
+
|
278
|
+
[source,ruby]
|
279
|
+
----
|
280
|
+
require 'coradoc/reverse_adoc'
|
281
|
+
|
282
|
+
Coradoc::ReverseAdoc.config do |config|
|
283
|
+
config.unknown_tags = :bypass
|
284
|
+
config.mathml2asciimath = true
|
285
|
+
config.tag_border = ''
|
286
|
+
end
|
287
|
+
----
|
288
|
+
|
289
|
+
=== Convert HTML to a Coradoc AST
|
290
|
+
|
291
|
+
[source,ruby]
|
292
|
+
----
|
293
|
+
require 'coradoc/reverse_adoc'
|
294
|
+
|
295
|
+
# Options can be supplied as keyword arguments
|
296
|
+
Coradoc::ReverseAdoc::HtmlConverter.to_coradoc("<b><i>Some input</i></b>")
|
297
|
+
----
|
298
|
+
|
299
|
+
|
300
|
+
== Related stuff
|
301
|
+
|
302
|
+
* https://github.com/xijo/reverse_markdown[Xijo's original reverse_markdown gem]
|
303
|
+
* https://github.com/xijo/reverse_markdown/wiki/Write-your-own-converter[Write custom converters] - Wiki entry about how to write your own converter
|
304
|
+
* https://github.com/harlantwood/html_massage[html_massage] - A gem by Harlan T. Wood to convert regular sites into markdown using reverse_markdown
|
305
|
+
* https://github.com/benbalter/word-to-markdown[word-to-markdown] - Convert word docs into markdown while using reverse_markdown, by Ben Balter
|
306
|
+
* https://github.com/asciidocfx/HtmlToAsciidoc[HtmlToAsciidoc] - Javascript regexp-based converter of HTML to Asciidoctor
|
307
|
+
* https://asciidoctor.org/docs/user-manual/[The Asciidoctor User Manual]
|
308
|
+
|
@@ -0,0 +1,125 @@
|
|
1
|
+
module Coradoc
  module ReverseAdoc
    # String clean-up helpers used around the HTML -> AsciiDoc conversion.
    #
    # Two groups of methods live here:
    # * +tidy+ and the passes it chains (newline/whitespace/tag-border
    #   normalisation), which operate on generated text;
    # * +preprocess_word_html+ and its helpers, which operate on the HTML
    #   input before it is converted.
    class Cleaner
      # Runs every clean-up pass over +string+, in a fixed order, and returns
      # the cleaned text. Each pass is wrapped in HtmlConverter.track_time with
      # a descriptive label (presumably for optional timing output -- that
      # helper is defined outside this file).
      #
      # @param string [String] text to clean; it is copied first, so the
      #   caller's object is not mutated
      # @return [String]
      def tidy(string)
        result = HtmlConverter.track_time "Removing inner whitespace" do
          remove_inner_whitespaces(String.new(string))
        end
        result = HtmlConverter.track_time "Removing newlines" do
          remove_newlines(result)
        end
        result = HtmlConverter.track_time "Removing leading newlines" do
          remove_leading_newlines(result)
        end
        result = HtmlConverter.track_time "Cleaning tag borders" do
          clean_tag_borders(result)
        end
        HtmlConverter.track_time "Cleaning punctuation characters" do
          clean_punctuation_characters(result)
        end
      end

      # Collapses runs of three or more newlines into exactly two
      # (i.e. at most one blank line between paragraphs).
      def remove_newlines(string)
        string.gsub(/\n{3,}/, "\n\n")
      end

      # Strips every newline at the very start of the document.
      def remove_leading_newlines(string)
        string.gsub(/\A\n+/, "")
      end

      # Squeezes runs of spaces/tabs inside each line down to one space while
      # keeping each line's leading and trailing whitespace, and tightens the
      # whitespace around +stem:[...]+ macros.
      #
      # NOTE: the +stem+ fix-ups mutate +string+ in place.
      #
      # @param string [String, nil]
      # @return [String] a new string; empty when +string+ is nil
      def remove_inner_whitespaces(string)
        # The original only guarded the gsub! calls against nil and then
        # crashed on each_line; treat nil as "nothing to clean".
        return +"" if string.nil?

        string.gsub!(/\n stem:\[/, "\nstem:[")
        string.gsub!(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
        string.gsub!(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

        string.each_line.with_object(+"") do |line, result|
          squeezed = preserve_border_whitespaces(line) do
            line.strip.gsub(/[ \t]{2,}/, " ")
          end
          result << squeezed
        end
      end

      # Find non-tilde content that is enclosed by two or more tildes and
      # ensure that only one whitespace occurs in the border area.
      # Same for square brackets.
      #
      # The equivalent handling for asterisks and underscores is currently
      # disabled; it is kept below for reference.
      def clean_tag_borders(string)
        # result = string.gsub(/\s?\*{2,}.*?\*{2,}\s?/) do |match|
        #   preserve_border_whitespaces(match, default_border: Coradoc::ReverseAdoc.config.tag_border) do
        #     match.strip.sub("** ", "**").sub(" **", "**")
        #   end
        # end

        # result = string.gsub(/\s?_{2,}.*?_{2,}\s?/) do |match|
        #   preserve_border_whitespaces(match, default_border: Coradoc::ReverseAdoc.config.tag_border) do
        #     match.strip.sub("__ ", "__").sub(" __", "__")
        #   end
        # end

        result = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |match|
          preserve_border_whitespaces(match,
                                      default_border: Coradoc::ReverseAdoc.config.tag_border) do
            match.strip.sub("~~ ", "~~").sub(" ~~", "~~")
          end
        end

        result.gsub(/\s?\[.*?\]\s?/) do |match|
          preserve_border_whitespaces(match) do
            match.strip.sub("[ ", "[").sub(" ]", "]")
          end
        end
      end

      # Removes the single whitespace character between a closing
      # +**+ / +~~+ / +__+ marker and a following punctuation character.
      def clean_punctuation_characters(string)
        string.gsub(/(\*\*|~~|__)\s([.!?'"])/, "\\1\\2")
      end

      # Preprocesses HTML (rather than postprocessing the converted output).
      # Works on a copy, so +string+ itself is left untouched.
      #
      # @param string [String] HTML exported from a Word document
      # @return [String]
      def preprocess_word_html(string)
        clean_headings(scrub_whitespace(string.dup))
      end

      # Normalises whitespace in an HTML document. Mutates and returns +string+.
      def scrub_whitespace(string)
        # HTML encoded spaces: match the entity spellings as well as the raw
        # U+00A0 character, as the "HTML encoded spaces" intent requires.
        string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, " ")
        string.sub!(/^\A[[:space:]]+/m, "") # document leading whitespace
        string.sub!(/[[:space:]]+\z$/m, "") # document trailing whitespace
        string.gsub!(/( +)$/, " ") # line trailing whitespace
        string.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks
        # string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs
        string
      end

      # Repairs heading markup found in LibreOffice's HTML export.
      # Mutates and returns +string+.
      def clean_headings(string)
        # Empty headings: unclear why LibreOffice inserts them, but they
        # need to go.
        string.gsub!(%r{<h([1-9])[^>]*></h\1>}, " ")
        # Superscripts get rendered as headings styled
        # "vertical-align: super"; turn them back into <sup>.
        string.gsub!(%r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>},
                     "<sup>\\2</sup>")
        string
      end

      private

      # Yields to obtain the cleaned core of +string+ and re-attaches the
      # leading/trailing whitespace +string+ originally had. When a side has
      # no whitespace, +options[:default_border]+ (default "") is used
      # instead. Whitespace-only input is returned unchanged without yielding.
      def preserve_border_whitespaces(string, options = {})
        return string if /\A\s*\Z/.match?(string)

        default_border = options.fetch(:default_border, "")
        # If the string contains part of a link so the characters [,],(,)
        # then don't add any extra spaces
        default_border = "" if /[\[(\])]/.match?(string)
        string_start = present_or_default(string[/\A\s*/], default_border)
        string_end = present_or_default(string[/\s*\Z/], default_border)
        result = yield
        string_start + result + string_end
      end

      # Returns +default+ when +string+ is nil or empty, +string+ otherwise.
      def present_or_default(string, default)
        return default if string.nil? || string.empty?

        string
      end
    end
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require "tmpdir"
|
2
|
+
|
3
|
+
module Coradoc
  module ReverseAdoc
    # Holds the converter's settings. Every setting declared with
    # +declare_option+ has a stored value (set in +initialize+ or through its
    # writer) that can be overridden temporarily for the duration of a block
    # via +with+.
    class Config
      def initialize
        @unknown_tags = :pass_through
        @input_format = :html
        @mathml2asciimath = false
        @external_images = false

        # Destination to save file and images
        @destination = nil

        # Source of HTML
        # @sourcedir = nil

        # Image counter, assuming there are max 999 images
        @image_counter = 1
        # pad with 0s
        @image_counter_pattern = "%03d"

        @em_delimiter = "_".freeze
        @strong_delimiter = "*".freeze
        @inline_options = {}
        @tag_border = " ".freeze

        @split_sections = nil

        # Document width - used to compute table sizes.
        # This is an assumption for screen size in input document.
        # If column widths are specified in absolute values, then we
        # have to convert them to relative values, as AsciiDoc only
        # supports those.
        @doc_width = 1000

        # Plugin system
        @plugins = []

        # Debugging options
        @track_time = false
      end

      # Runs the given block with +options+ acting as temporary overrides for
      # the declared settings, then reinstates the previous overrides.
      #
      # The previous overrides are restored in an +ensure+ clause, so an
      # exception raised inside the block cannot leak per-call options into
      # later conversions.
      #
      # @param options [Hash{Symbol => Object}] option name => temporary value
      # @return [Object] whatever the block returns
      def with(options = {})
        old_options = @inline_options
        @inline_options = options
        yield
      ensure
        @inline_options = old_options
      end

      # Defines a reader and a writer for +option+.
      #
      # The reader returns the temporary value supplied through +with+ when
      # one is present, and the stored instance variable otherwise. Only a
      # missing/nil override falls through to the stored value, so an explicit
      # +false+ override is honoured (with a plain `||` it was silently
      # ignored whenever the stored value was truthy).
      #
      # @param option [Symbol] setting name
      def self.declare_option(option)
        define_method(option) do
          inline_value = @inline_options[option]
          inline_value.nil? ? instance_variable_get(:"@#{option}") : inline_value
        end

        attr_writer option
      end

      declare_option :unknown_tags
      declare_option :tag_border
      declare_option :mathml2asciimath
      declare_option :external_images
      declare_option :destination
      declare_option :sourcedir
      declare_option :image_counter
      declare_option :image_counter_pattern
      declare_option :input_format
      declare_option :split_sections
      declare_option :doc_width
      declare_option :plugins
      declare_option :track_time
    end
  end
end
|