coradoc 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.docker/Dockerfile +1 -1
- data/.docker/docker-compose.yml +2 -2
- data/.editorconfig +15 -0
- data/CHANGELOG.md +4 -0
- data/Rakefile +10 -0
- data/coradoc.gemspec +11 -2
- data/exe/reverse_adoc +70 -0
- data/exe/w2a +72 -0
- data/lib/coradoc/document.rb +5 -6
- data/lib/coradoc/element/admonition.rb +8 -6
- data/lib/coradoc/element/attribute_list.rb +2 -2
- data/lib/coradoc/element/audio.rb +1 -1
- data/lib/coradoc/element/author.rb +16 -14
- data/lib/coradoc/element/base.rb +0 -2
- data/lib/coradoc/element/block/core.rb +2 -2
- data/lib/coradoc/element/block/literal.rb +1 -1
- data/lib/coradoc/element/block/sourcecode.rb +2 -2
- data/lib/coradoc/element/image/core.rb +1 -0
- data/lib/coradoc/element/image.rb +0 -1
- data/lib/coradoc/element/inline/bold.rb +1 -0
- data/lib/coradoc/element/inline/highlight.rb +1 -0
- data/lib/coradoc/element/inline/image.rb +1 -0
- data/lib/coradoc/element/inline/italic.rb +1 -0
- data/lib/coradoc/element/inline/link.rb +9 -9
- data/lib/coradoc/element/inline/monospace.rb +1 -0
- data/lib/coradoc/element/inline/quotation.rb +1 -0
- data/lib/coradoc/element/inline/subscript.rb +1 -0
- data/lib/coradoc/element/inline/superscript.rb +1 -0
- data/lib/coradoc/element/inline.rb +0 -1
- data/lib/coradoc/element/list/core.rb +3 -4
- data/lib/coradoc/element/list.rb +0 -1
- data/lib/coradoc/element/list_item.rb +1 -1
- data/lib/coradoc/element/paragraph.rb +1 -1
- data/lib/coradoc/element/revision.rb +18 -16
- data/lib/coradoc/element/table.rb +10 -10
- data/lib/coradoc/element/text_element.rb +21 -15
- data/lib/coradoc/element/title.rb +2 -2
- data/lib/coradoc/element/video.rb +1 -1
- data/lib/coradoc/generator.rb +2 -2
- data/lib/coradoc/legacy_parser.rb +41 -41
- data/lib/coradoc/oscal.rb +2 -4
- data/lib/coradoc/parser/asciidoc/content.rb +15 -15
- data/lib/coradoc/parser/asciidoc/document_attributes.rb +1 -1
- data/lib/coradoc/parser/asciidoc/header.rb +6 -6
- data/lib/coradoc/parser/asciidoc/section.rb +1 -1
- data/lib/coradoc/reverse_adoc/LICENSE.txt +25 -0
- data/lib/coradoc/reverse_adoc/README.adoc +302 -0
- data/lib/coradoc/reverse_adoc/cleaner.rb +113 -0
- data/lib/coradoc/reverse_adoc/config.rb +54 -0
- data/lib/coradoc/reverse_adoc/converters/a.rb +42 -0
- data/lib/coradoc/reverse_adoc/converters/aside.rb +16 -0
- data/lib/coradoc/reverse_adoc/converters/audio.rb +29 -0
- data/lib/coradoc/reverse_adoc/converters/base.rb +100 -0
- data/lib/coradoc/reverse_adoc/converters/blockquote.rb +27 -0
- data/lib/coradoc/reverse_adoc/converters/br.rb +15 -0
- data/lib/coradoc/reverse_adoc/converters/bypass.rb +81 -0
- data/lib/coradoc/reverse_adoc/converters/code.rb +56 -0
- data/lib/coradoc/reverse_adoc/converters/div.rb +18 -0
- data/lib/coradoc/reverse_adoc/converters/drop.rb +22 -0
- data/lib/coradoc/reverse_adoc/converters/em.rb +55 -0
- data/lib/coradoc/reverse_adoc/converters/figure.rb +25 -0
- data/lib/coradoc/reverse_adoc/converters/h.rb +42 -0
- data/lib/coradoc/reverse_adoc/converters/head.rb +23 -0
- data/lib/coradoc/reverse_adoc/converters/hr.rb +15 -0
- data/lib/coradoc/reverse_adoc/converters/ignore.rb +16 -0
- data/lib/coradoc/reverse_adoc/converters/img.rb +93 -0
- data/lib/coradoc/reverse_adoc/converters/li.rb +17 -0
- data/lib/coradoc/reverse_adoc/converters/mark.rb +21 -0
- data/lib/coradoc/reverse_adoc/converters/math.rb +31 -0
- data/lib/coradoc/reverse_adoc/converters/ol.rb +64 -0
- data/lib/coradoc/reverse_adoc/converters/p.rb +23 -0
- data/lib/coradoc/reverse_adoc/converters/pass_through.rb +13 -0
- data/lib/coradoc/reverse_adoc/converters/pre.rb +55 -0
- data/lib/coradoc/reverse_adoc/converters/q.rb +16 -0
- data/lib/coradoc/reverse_adoc/converters/strong.rb +52 -0
- data/lib/coradoc/reverse_adoc/converters/sub.rb +16 -0
- data/lib/coradoc/reverse_adoc/converters/sup.rb +16 -0
- data/lib/coradoc/reverse_adoc/converters/table.rb +69 -0
- data/lib/coradoc/reverse_adoc/converters/td.rb +83 -0
- data/lib/coradoc/reverse_adoc/converters/text.rb +65 -0
- data/lib/coradoc/reverse_adoc/converters/th.rb +14 -0
- data/lib/coradoc/reverse_adoc/converters/tr.rb +22 -0
- data/lib/coradoc/reverse_adoc/converters/video.rb +29 -0
- data/lib/coradoc/reverse_adoc/converters.rb +32 -0
- data/lib/coradoc/reverse_adoc/errors.rb +10 -0
- data/lib/coradoc/reverse_adoc/html_converter.rb +61 -0
- data/lib/coradoc/reverse_adoc.rb +27 -0
- data/lib/coradoc/transformer.rb +24 -14
- data/lib/coradoc/version.rb +1 -1
- data/lib/reverse_adoc.rb +20 -0
- metadata +178 -4
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
= AsciiDoc from HTML and Microsoft Word: reverse_adoc
|
|
2
|
+
|
|
3
|
+
https://github.com/metanorma/reverse_adoc[reverse_adoc] image:https://img.shields.io/gem/v/reverse_adoc.svg["Gem Version", link="https://rubygems.org/gems/reverse_adoc"]::
|
|
4
|
+
image:https://github.com/metanorma/reverse_adoc/workflows/rake/badge.svg["Build Status", link="https://github.com/metanorma/reverse_adoc/actions?workflow=rake"]
|
|
5
|
+
image:https://codeclimate.com/github/metanorma/reverse_adoc/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/reverse_adoc"]
|
|
6
|
+
image:https://img.shields.io/github/issues-pr-raw/metanorma/reverse_adoc.svg["Pull Requests", link="https://github.com/metanorma/reverse_adoc/pulls"]
|
|
7
|
+
image:https://img.shields.io/github/commits-since/metanorma/reverse_adoc/latest.svg["Commits since latest",link="https://github.com/metanorma/reverse_adoc/releases"]
|
|
8
|
+
|
|
9
|
+
== Purpose
|
|
10
|
+
|
|
11
|
+
Transforms HTML and Microsoft Word into AsciiDoc.
|
|
12
|
+
|
|
13
|
+
Based on https://github.com/xijo/reverse_markdown
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
== Installation
|
|
17
|
+
|
|
18
|
+
Install the gem:
|
|
19
|
+
|
|
20
|
+
[source,console]
|
|
21
|
+
----
|
|
22
|
+
[sudo] gem install reverse_adoc
|
|
23
|
+
----
|
|
24
|
+
|
|
25
|
+
or add it to your `Gemfile`:
|
|
26
|
+
|
|
27
|
+
[source,ruby]
|
|
28
|
+
----
|
|
29
|
+
gem 'reverse_adoc'
|
|
30
|
+
----
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
== Command-line usage
|
|
34
|
+
|
|
35
|
+
=== HTML to AsciiDoc: `reverse_adoc`
|
|
36
|
+
|
|
37
|
+
Convert HTML files to AsciiDoc:
|
|
38
|
+
|
|
39
|
+
[source,console]
|
|
40
|
+
----
|
|
41
|
+
$ reverse_adoc file.html > file.adoc
|
|
42
|
+
$ cat file.html | reverse_adoc > file.adoc
|
|
43
|
+
----
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
=== Microsoft Word to AsciiDoc: `w2a`
|
|
47
|
+
|
|
48
|
+
Convert Word `.doc` or `.docx` files to AsciiDoc:
|
|
49
|
+
|
|
50
|
+
[source,console]
|
|
51
|
+
----
|
|
52
|
+
$ w2a file.docx > file.adoc
|
|
53
|
+
----
|
|
54
|
+
|
|
55
|
+
[source,console]
|
|
56
|
+
----
|
|
57
|
+
$ w2a input.docx -o output.adoc
|
|
58
|
+
----
|
|
59
|
+
|
|
60
|
+
Help:
|
|
61
|
+
|
|
62
|
+
[source,console]
|
|
63
|
+
----
|
|
64
|
+
$ w2a -h
|
|
65
|
+
Usage: w2a [options] <file>
|
|
66
|
+
-a, --mathml2asciimath Convert MathML to AsciiMath
|
|
67
|
+
-o, --output=FILENAME Output file to write to
|
|
68
|
+
-e, --external-images Export images if data URI
|
|
69
|
+
-v, --version Version information
|
|
70
|
+
-h, --help Prints this help
|
|
71
|
+
----
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
NOTE: `w2a` requires LibreOffice to be installed. It uses LibreOffice's
|
|
75
|
+
export to XHTML. LibreOffice's export of XHTML is superior to the native Microsoft Word export
|
|
76
|
+
to HTML: it exports lists (which Word keeps as paragraphs), and it exports OOMML into MathML.
|
|
77
|
+
On the other hand, the LibreOffice export relies on default styling being used in the
|
|
78
|
+
document, and it may not cope with ordered lists or headings with customised appearance.
|
|
79
|
+
For best results, reset the styles in the document you're converting to those in
|
|
80
|
+
the default `Normal.dot` template.
|
|
81
|
+
|
|
82
|
+
NOTE: `w2a` requires the command-line version of LibreOffice, `soffice`. As it turns out,
|
|
83
|
+
LibreOffice v6 appears to render formulae in HTML as images instead of MathML expressions;
|
|
84
|
+
use LibreOffice v5. If you have both LibreOffice v5 and LibreOffice v6 installed, make sure
|
|
85
|
+
that your OS path searches for the LibreOffice v5 version of `soffice` first; e.g. on Mac,
|
|
86
|
+
include something like `/Applications/LibreOffice5.4.7.2.app/Contents/MacOS` in your PATH
|
|
87
|
+
environment.
|
|
88
|
+
|
|
89
|
+
NOTE: Some information in OOMML is not preserved in the export to MathML from LibreOffice;
|
|
90
|
+
in particular, font shifts such as double-struck fonts.
|
|
91
|
+
The LibreOffice exporter does seem to drop some text (possibly associated with
|
|
92
|
+
MathML); use with caution.
|
|
93
|
+
|
|
94
|
+
NOTE: Adapted from `w2m` of
|
|
95
|
+
https://github.com/benbalter/word-to-markdown[Ben Balter's word-to-markdown]
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
=== Common options
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
==== MathML to AsciiMath conversion
|
|
102
|
+
|
|
103
|
+
If you wish to convert the MathML in the document to AsciiMath, run the script with the
|
|
104
|
+
`--mathml2asciimath` option:
|
|
105
|
+
|
|
106
|
+
[source,console]
|
|
107
|
+
----
|
|
108
|
+
$ w2a --mathml2asciimath document.docx > document.adoc
|
|
109
|
+
----
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
==== Extracting images
|
|
113
|
+
|
|
114
|
+
Images referenced by the HTML can be extracted into the destination output folder by using:
|
|
115
|
+
|
|
116
|
+
[source,console]
|
|
117
|
+
----
|
|
118
|
+
$ reverse_adoc input.html -o output/file.adoc -e
|
|
119
|
+
$ reverse_adoc input.html --output output/file.adoc --external-images
|
|
120
|
+
----
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
Word embedded images can be extracted into the destination output folder by using:
|
|
124
|
+
|
|
125
|
+
[source,console]
|
|
126
|
+
----
|
|
127
|
+
$ w2a input.docx -o output/file.adoc -e
|
|
128
|
+
$ w2a input.docx --output output/file.adoc --external-images
|
|
129
|
+
----
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
==== Handling unknown HTML tags
|
|
133
|
+
|
|
134
|
+
The `--unknown_tags` option allows you to specify how to handle unknown tags
|
|
135
|
+
(default `pass_through`).
|
|
136
|
+
|
|
137
|
+
Valid options are:
|
|
138
|
+
|
|
139
|
+
* `pass_through` - Include the unknown tag completely into the result
|
|
140
|
+
* `drop` - Drop the unknown tag and its content
|
|
141
|
+
* `bypass` - Ignore the unknown tag but try to convert its content
|
|
142
|
+
* `raise` - Raise an error to let you know
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
==== Tagging of borders
|
|
146
|
+
|
|
147
|
+
Specify how to handle tag borders with the option `--tag_border` (default `' '`).
|
|
148
|
+
|
|
149
|
+
Valid options are:
|
|
150
|
+
|
|
151
|
+
* `' '` - Add whitespace if there is none at tag borders.
|
|
152
|
+
* `''` - Do not add whitespace.
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
== Features
|
|
156
|
+
|
|
157
|
+
=== General
|
|
158
|
+
|
|
159
|
+
`reverse_adoc` shares features as a port of `reverse_markdown`:
|
|
160
|
+
|
|
161
|
+
* Module based -- if you miss a tag, just add it
|
|
162
|
+
* Can deal with nested lists
|
|
163
|
+
* Inline and block code is supported
|
|
164
|
+
* Supports blockquote
|
|
165
|
+
|
|
166
|
+
It supports the following HTML tags (these are supported by `reverse_markdown`):
|
|
167
|
+
|
|
168
|
+
* `a`
|
|
169
|
+
* `blockquote`
|
|
170
|
+
* `br`
|
|
171
|
+
* `code`, `tt` (added: `kbd`, `samp`, `var`)
|
|
172
|
+
* `div`, `article`
|
|
173
|
+
* `em`, `i` (added: `cite`)
|
|
174
|
+
* `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, `hr`
|
|
175
|
+
* `img`
|
|
176
|
+
* `li`, `ol`, `ul` (added: `dir`)
|
|
177
|
+
* `p`, `pre`
|
|
178
|
+
* `strong`, `b`
|
|
179
|
+
* `table`, `td`, `th`, `tr`
|
|
180
|
+
|
|
181
|
+
[NOTE]
|
|
182
|
+
====
|
|
183
|
+
* reverse_adoc does *not* support `del` or `strike`, because Asciidoctor does not support strikethrough out of the box.
|
|
184
|
+
* As with reverse_markdown, `pre` is only treated as sourcecode if it is contained in a `div@class = highlight-` element, or has a `@brush` attribute naming the language (Confluence).
|
|
185
|
+
* The gem does not support `p@align`, because Asciidoctor doesn't
|
|
186
|
+
====
|
|
187
|
+
|
|
188
|
+
In addition, it supports:
|
|
189
|
+
|
|
190
|
+
* `aside`
|
|
191
|
+
* `audio`, `video` (with `@src` attributes)
|
|
192
|
+
* `figure`, `figcaption`
|
|
193
|
+
* `mark`
|
|
194
|
+
* `q`
|
|
195
|
+
* `sub`, `sup`
|
|
196
|
+
* `@id` anchors
|
|
197
|
+
* `blockquote@cite`
|
|
198
|
+
* `img/@width`, `img/@height`
|
|
199
|
+
* `ol/@style`, `ol/@start`, `ol/@reversed`, `ul/@type`
|
|
200
|
+
* `td/@colspan`, `td/@rowspan`, `td/@align`, `td/@valign`
|
|
201
|
+
* `table/caption`, `table/@width`, `table/@frame` (partial), `table/@rules` (partial)
|
|
202
|
+
* Lists and paragraphs within cells
|
|
203
|
+
** Not tables within cells: Asciidoctor cannot deal with nested tables
|
|
204
|
+
|
|
205
|
+
The gem does not support:
|
|
206
|
+
|
|
207
|
+
* `col`, `colgroup`
|
|
208
|
+
* `source`, `picture`
|
|
209
|
+
* `bdi`, `bdo`, `ruby`, `rt`, `rp`, `wbr`
|
|
210
|
+
* `frame`, `frameset`, `iframe`, `noframes`, `noscript`, `script`, `input`, `output`, `progress`
|
|
211
|
+
* `map`, `canvas`, `dialog`, `embed`, `object`, `param`, `svg`, `track`
|
|
212
|
+
* `fieldset`, `button`, `datalist`, `form`, `label`, `legend`, `menu`, `menulist`, `optgroup`, `option`, `select`, `textarea`
|
|
213
|
+
* `big`, `dfn`, `font`, `s`, `small`, `span`, `strike`, `u`
|
|
214
|
+
* `center`
|
|
215
|
+
* `data`, `meter`
|
|
216
|
+
* `del`, `ins`
|
|
217
|
+
* `footer`, `header`, `main`, `nav`, `details`, `section`, `summary`, `template`
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
=== MathML support
|
|
221
|
+
|
|
222
|
+
If you are using this gem in the context of https://www.metanorma.com[Metanorma],
|
|
223
|
+
Metanorma AsciiDoc accepts MathML as a native mathematical format. So you do not need
|
|
224
|
+
to convert the MathML to AsciiMath.
|
|
225
|
+
|
|
226
|
+
The gem will optionally invoke the https://github.com/metanorma/mathml2asciimath
|
|
227
|
+
gem, to convert MathML to AsciiMath. The conversion is not perfect, and will need to be
|
|
228
|
+
post-edited; but it's a lot better than nothing.
|
|
229
|
+
|
|
230
|
+
NOTE: Asciidoctor does not support MathML input. HTML uses MathML.
|
|
231
|
+
The gem will recognize MathML expressions in HTML, and will wrap them in Asciidoctor
|
|
232
|
+
`stem:[ ]` macros. The result of this gem is not actually legal Asciidoctor for `stem`:
|
|
233
|
+
Asciidoctor will presumably
|
|
234
|
+
think this is AsciiMath in the `stem:[ ]` macro, try to pass it into MathJax as
|
|
235
|
+
AsciiMath, and fail. But of course, MathJax has no problem with MathML, and some postprocessing
|
|
236
|
+
on the Asciidoctor output can ensure that the MathML is treated by MathJax (or whatever else
|
|
237
|
+
uses the output) as such; so this is still much better than nothing for stem processing.
|
|
238
|
+
|
|
239
|
+
=== Word cleanup
|
|
240
|
+
|
|
241
|
+
This gem is routinely used in the Metanorma project to export Word documents to AsciiDoc.
|
|
242
|
+
The HTML export from Word that the gem uses, from LibreOffice, is much cleaner than the
|
|
243
|
+
native HTML 4 export from Word; but it has some infelicities which this gem cleans up:
|
|
244
|
+
|
|
245
|
+
* The HTML export has trouble with subscripts, and routinely exports them as headings; the `w2a`
|
|
246
|
+
script tries to clean them up.
|
|
247
|
+
* The `w2a` script cleans up spaces, but it does not strip them.
|
|
248
|
+
* Spaces are removed from anchors and cross-references.
|
|
249
|
+
* Double underscores are removed from anchors and cross-references.
|
|
250
|
+
* Cross-references to `_GoBack` and to `_Toc` followed by numbers (used to construct tables of contents) are ignored.
|
|
251
|
+
|
|
252
|
+
== Ruby library usage
|
|
253
|
+
|
|
254
|
+
=== General
|
|
255
|
+
|
|
256
|
+
Simple to use.
|
|
257
|
+
|
|
258
|
+
[source,ruby]
|
|
259
|
+
----
|
|
260
|
+
require 'coradoc/reverse_adoc'
|
|
261
|
+
|
|
262
|
+
result = Coradoc::ReverseAdoc.convert input
|
|
263
|
+
result.inspect # " *feelings* "
|
|
264
|
+
----
|
|
265
|
+
|
|
266
|
+
=== Configure with options
|
|
267
|
+
|
|
268
|
+
Just pass your chosen configuration options in after the input. The given options will last for this operation only.
|
|
269
|
+
|
|
270
|
+
[source,ruby]
|
|
271
|
+
----
|
|
272
|
+
require 'coradoc/reverse_adoc'
|
|
273
|
+
|
|
274
|
+
Coradoc::ReverseAdoc.convert(input, unknown_tags: :raise, mathml2asciimath: true)
|
|
275
|
+
----
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
=== Preconfigure using an initializer
|
|
279
|
+
|
|
280
|
+
Or configure it block style on an initializer level. These configurations will last for all conversions until they are set to something different.
|
|
281
|
+
|
|
282
|
+
[source,ruby]
|
|
283
|
+
----
|
|
284
|
+
require 'coradoc/reverse_adoc'
|
|
285
|
+
|
|
286
|
+
Coradoc::ReverseAdoc.config do |config|
|
|
287
|
+
config.unknown_tags = :bypass
|
|
288
|
+
config.mathml2asciimath = true
|
|
289
|
+
config.tag_border = ''
|
|
290
|
+
end
|
|
291
|
+
----
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
== Related stuff
|
|
295
|
+
|
|
296
|
+
* https://github.com/xijo/reverse_markdown[Xijo's original reverse_markdown gem]
|
|
297
|
+
* https://github.com/xijo/reverse_markdown/wiki/Write-your-own-converter[Write custom converters] - Wiki entry about how to write your own converter
|
|
298
|
+
* https://github.com/harlantwood/html_massage[html_massage] - A gem by Harlan T. Wood to convert regular sites into markdown using reverse_markdown
|
|
299
|
+
* https://github.com/benbalter/word-to-markdown[word-to-markdown] - Convert word docs into markdown while using reverse_markdown, by Ben Balter
|
|
300
|
+
* https://github.com/asciidocfx/HtmlToAsciidoc[HtmlToAsciidoc] - Javascript regexp-based converter of HTML to Asciidoctor
|
|
301
|
+
* https://asciidoctor.org/docs/user-manual/[The Asciidoctor User Manual]
|
|
302
|
+
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
module Coradoc::ReverseAdoc
|
|
2
|
+
# Text clean-up helpers. #tidy and the steps it chains post-process generated
# AsciiDoc; #preprocess_word_html and the steps it chains pre-process HTML
# before conversion.
class Cleaner
  # Runs every post-processing step, in order, on a copy of +string+ and
  # returns the cleaned text.
  def tidy(string)
    result = remove_inner_whitespaces(String.new(string))
    result = remove_newlines(result)
    result = remove_leading_newlines(result)
    result = clean_tag_borders(result)
    clean_punctuation_characters(result)
  end

  # Collapses any run of three or more newlines into exactly two.
  def remove_newlines(string)
    string.gsub(/\n{3,}/, "\n\n")
  end

  # Drops all newlines at the very start of the text.
  def remove_leading_newlines(string)
    string.gsub(/\A\n+/, "")
  end

  # Normalises whitespace around `stem:[...]` macros, then squeezes runs of
  # spaces/tabs inside each line while keeping the line's own leading and
  # trailing whitespace.
  #
  # Returns a new String and leaves +string+ untouched; nil yields "".
  def remove_inner_whitespaces(string)
    return "" if string.nil?

    text = string
      .gsub(/\n stem:\[/, "\nstem:[")
      .gsub(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
      .gsub(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

    cleaned_lines = text.each_line.map do |line|
      preserve_border_whitespaces(line) do
        line.strip.gsub(/[ \t]{2,}/, " ")
      end
    end
    cleaned_lines.join
  end

  # Tidies the border area of two kinds of spans:
  #
  # * content enclosed by two or more tildes: inner padding next to the
  #   tildes is removed and the configured tag border is used on the
  #   outside when the span has no surrounding whitespace of its own;
  # * content enclosed by square brackets: inner padding next to the
  #   brackets is removed.
  #
  # The equivalent handling for asterisks and underscores is disabled.
  def clean_tag_borders(string)
    tilde_cleaned = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |match|
      preserve_border_whitespaces(match,
                                  default_border: Coradoc::ReverseAdoc.config.tag_border) do
        match.strip.sub("~~ ", "~~").sub(" ~~", "~~")
      end
    end

    tilde_cleaned.gsub(/\s?\[.*?\]\s?/) do |match|
      preserve_border_whitespaces(match) do
        match.strip.sub("[ ", "[").sub(" ]", "]")
      end
    end
  end

  # Removes the single whitespace character between a closing `**`, `~~` or
  # `__` and a following punctuation mark or quote.
  def clean_punctuation_characters(string)
    string.gsub(/(\*\*|~~|__)\s([.!?'"])/, "\\1\\2")
  end

  # preprocesses HTML, rather than postprocessing it
  def preprocess_word_html(string)
    clean_headings(scrub_whitespace(string.dup))
  end

  # Normalises whitespace in HTML. Mutates +string+ in place and returns it.
  def scrub_whitespace(string)
    # NOTE(review): entity spellings and replacement were reconstructed from
    # the "HTML encoded spaces" intent — confirm against upstream.
    string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, "&#xA0;") # HTML encoded spaces
    string.sub!(/\A[[:space:]]+/, "") # document leading whitespace
    string.sub!(/[[:space:]]+\z/, "") # document trailing whitespace
    string.gsub!(/( +)$/, " ") # line trailing whitespace
    string.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks
    string
  end

  # Repairs heading markup. Mutates +string+ in place and returns it.
  def clean_headings(string)
    # Empty headings (emitted by LibreOffice for unknown reasons) become a
    # single space.
    string.gsub!(%r{<h([1-9])[^>]*></h\1>}, " ")
    # Headings styled as superscript are really superscripts.
    string.gsub!(%r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>},
                 "<sup>\\2</sup>")
    string
  end

  private

  # Yields to obtain the cleaned inner text, then re-attaches the leading and
  # trailing whitespace that +string+ had. Where +string+ has no whitespace
  # on a side, +options[:default_border]+ (default "") is used instead.
  # Whitespace-only input is returned unchanged without yielding.
  def preserve_border_whitespaces(string, options = {})
    return string if /\A\s*\Z/.match?(string)

    default_border = options.fetch(:default_border, "")
    # If the string contains part of a link so the characters [,],(,)
    # then don't add any extra spaces
    default_border = "" if /[\[(\])]/.match?(string)
    string_start = present_or_default(string[/\A\s*/], default_border)
    string_end = present_or_default(string[/\s*\Z/], default_border)
    string_start + yield + string_end
  end

  # Returns +string+ unless it is nil or empty, in which case +default+.
  def present_or_default(string, default)
    return default if string.nil? || string.empty?

    string
  end
end
|
|
113
|
+
end
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
require "tmpdir"
|
|
2
|
+
|
|
3
|
+
module Coradoc::ReverseAdoc
|
|
4
|
+
# Conversion settings. Values set through the writers persist until changed;
# values passed to #with apply only while its block runs and take precedence
# for the four options that have custom readers below.
class Config
  attr_accessor :destination, :sourcedir, :image_counter,
                :image_counter_pattern, :input_format
  # Readers for these four are defined explicitly so that per-call options
  # can override the stored value.
  attr_writer :unknown_tags, :tag_border, :mathml2asciimath, :external_images

  def initialize
    @unknown_tags = :pass_through
    @input_format = :html
    @mathml2asciimath = false
    @external_images = false

    # Destination to save file and images
    @destination = nil

    # Source of HTML
    # @sourcedir = nil

    # Image counter, assuming there are max 999 images
    @image_counter = 1
    # pad with 0s
    @image_counter_pattern = "%03d"

    @em_delimiter = "_".freeze
    @strong_delimiter = "*".freeze
    @inline_options = {}
    @tag_border = " ".freeze
  end

  # Applies +options+ for the duration of the block and returns the block's
  # value. The per-call options are cleared even when the block raises, so
  # they cannot leak into a later conversion.
  def with(options = {})
    @inline_options = options
    yield
  ensure
    @inline_options = {}
  end

  def unknown_tags
    inline_or_default(:unknown_tags, @unknown_tags)
  end

  def mathml2asciimath
    inline_or_default(:mathml2asciimath, @mathml2asciimath)
  end

  def external_images
    inline_or_default(:external_images, @external_images)
  end

  def tag_border
    inline_or_default(:tag_border, @tag_border)
  end

  private

  # Returns the per-call value for +key+ when one was given (including an
  # explicit false); a missing or nil value falls back to +default+.
  def inline_or_default(key, default)
    value = @inline_options[key]
    value.nil? ? default : value
  end
end
|
|
54
|
+
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
require "coradoc"
|
|
2
|
+
|
|
3
|
+
module Coradoc::ReverseAdoc
|
|
4
|
+
module Converters
|
|
5
|
+
# Converter for the `a` element. Depending on the node's attributes it
# produces an anchor, a cross-reference, a link, or just the converted
# child content.
class A < Base
  # Decision order:
  # 1. an id/name matching `_Toc<digits>` or `_GoBack` yields "";
  # 2. any other non-empty id/name yields an Anchor;
  # 3. an href starting with "#" yields a CrossReference;
  # 4. an empty or missing href yields the converted children;
  # 5. otherwise a Link is built.
  def to_coradoc(node, state = {})
    label = treat_children(node, state)
    target = node["href"]
    title = extract_title(node)

    # Whitespace is removed and runs of underscores are collapsed to one.
    anchor_id = (node["id"] || node["name"])&.gsub(/\s/, "")&.gsub(/__+/, "_")

    return "" if /^_Toc\d+$|^_GoBack$/.match?(anchor_id)

    unless anchor_id.nil? || anchor_id.empty?
      return Coradoc::Element::Inline::Anchor.new(anchor_id)
    end

    if target.to_s.start_with?("#")
      # Same normalisation as for ids, after dropping the leading "#".
      reference = target.sub(/^#/, "").gsub(/\s/, "").gsub(/__+/, "_")
      return Coradoc::Element::Inline::CrossReference.new(reference, label)
    end

    return label if target.to_s.empty?

    Coradoc::Element::Inline::Link.new(path: target,
                                       name: label,
                                       title: title)
  end

  # Renders the result of #to_coradoc through the generator.
  def convert(node, state = {})
    element = to_coradoc(node, state)
    Coradoc::Generator.gen_adoc(element)
  end
end
|
|
39
|
+
|
|
40
|
+
register :a, A.new
|
|
41
|
+
end
|
|
42
|
+
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
module Coradoc::ReverseAdoc
|
|
2
|
+
module Converters
|
|
3
|
+
# Converter for the `aside` element.
class Aside < Base
  # Converts the children to text and wraps the resulting lines in a
  # Block::Side element.
  def to_coradoc(node, state = {})
    Coradoc::Element::Block::Side.new(lines: treat_children(node, state).lines)
  end

  # Renders the result of #to_coradoc through the generator.
  def convert(node, state = {})
    element = to_coradoc(node, state)
    Coradoc::Generator.gen_adoc(element)
  end
end
|
|
13
|
+
|
|
14
|
+
register :aside, Aside.new
|
|
15
|
+
end
|
|
16
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
module Coradoc::ReverseAdoc
|
|
2
|
+
module Converters
|
|
3
|
+
# Converter for the `audio` element.
class Audio < Base
  # Attributes that are reported through the "options" named attribute.
  BOOLEAN_OPTIONS = %w[autoplay loop controls].freeze

  # Builds an Audio element from the node's `src`, `id` and `title`, adding
  # an "options" attribute when any of the boolean attributes is present.
  def to_coradoc(node, _state = {})
    title = extract_title(node)
    attributes = Coradoc::Element::AttributeList.new
    enabled = options(node)
    attributes.add_named("options", enabled) if enabled.any?
    Coradoc::Element::Audio.new(title, id: node["id"], src: node["src"],
                                       attributes: attributes)
  end

  # Renders the result of #to_coradoc through the generator.
  def convert(node, state = {})
    Coradoc::Generator.gen_adoc(to_coradoc(node, state))
  end

  # Returns the names of the boolean attributes present on +node+.
  #
  # The name is reported rather than the attribute's value: a bare
  # `<audio controls>` has an empty value, which would otherwise produce an
  # empty option.
  def options(node)
    BOOLEAN_OPTIONS.select { |name| node[name] }
  end
end
|
|
26
|
+
|
|
27
|
+
register :audio, Audio.new
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
module Coradoc::ReverseAdoc
|
|
2
|
+
module Converters
|
|
3
|
+
# Shared behaviour for all converters: dispatching child nodes to their
# registered converters, escaping, and helpers that inspect the text around
# a node.
class Base
  # Converts every child of +node+ with its registered converter and
  # concatenates the results into one String.
  def treat_children(node, state)
    node.children.each_with_object(+"") do |child, memo|
      memo << treat(child, state)
    end
  end

  # Converts +node+ using the converter looked up by the node's name.
  def treat(node, state)
    Coradoc::ReverseAdoc::Converters.lookup(node.name).convert(node, state)
  end

  # Like #treat_children, but collects the #to_coradoc results into a flat
  # Array with empty strings and nils removed.
  def treat_children_coradoc(node, state)
    node.children
      .map { |child| treat_coradoc(child, state) }
      .flatten
      .reject { |item| item == "" || item.nil? }
  end

  # Calls #to_coradoc on the converter looked up by the node's name.
  def treat_coradoc(node, state)
    Coradoc::ReverseAdoc::Converters.lookup(node.name).to_coradoc(node, state)
  end

  # Backslash-escapes runs of `*` and `_` that directly follow or directly
  # precede whitespace. Runs embedded in a word are left alone.
  def escape_keychars(string)
    subs = { "*" => '\*', "_" => '\_' }
    string.gsub(/((?<=\s)[\*_]+)|[\*_]+(?=\s)/) do |run|
      run.chars.map { |char| subs[char] }.join
    end
  end

  # Returns the escaped `title` attribute prefixed with one space, or "" when
  # the node has no title.
  def extract_title(node)
    title = escape_keychars(node["title"].to_s)
    title.empty? ? "" : %[ #{title}]
  end

  # True when an ancestor of +node+ is named +name+ (String) or has any of
  # the names in +name+ (Array). Returns nil for any other +name+ type.
  def node_has_ancestor?(node, name)
    case name
    when String
      node.ancestors.map(&:name).include?(name)
    when Array
      (node.ancestors.map(&:name) & name).any?
    end
  end

  # True when the node immediately preceding +node+ among its siblings has
  # text ending in +str+. Returns nil when +str+ is not a non-empty String.
  def textnode_before_end_with?(node, str)
    return nil if !str.is_a?(String) || str.empty?

    previous = node.at_xpath("preceding-sibling::node()[1]")
    previous.respond_to?(:text) && previous.text.end_with?(str)
  end

  # Truthy when the node preceding +node+ has non-blank text whose last
  # character is a word character.
  def unconstrained_before?(node)
    before = node.at_xpath("preceding::node()[1]")

    before &&
      !before.text.strip.empty? &&
      before.text[-1]&.match?(/\w/)
  end

  # TODO: This logic ought to be cleaned up.
  #
  # Truthy when the node following +node+ has non-blank text whose first
  # character is a word character or one of , ; " . ? !
  #
  # The test is applied to a single character, so the punctuation is
  # matched with a character class rather than as a literal sequence.
  def unconstrained_after?(node)
    after = node.at_xpath("following::node()[1]")

    after && !after.text.strip.empty? &&
      after.text[0]&.match?(/[\w,;".?!]/)
  end

  # TODO: This logic ought to be cleaned up.
  #
  # True when both sides of +node+ count as a boundary:
  #
  # * before: there is no preceding character, or it is whitespace, or
  #   +node+'s own text starts with whitespace;
  # * after: there is no following character, or it is whitespace or one of
  #   , ; " . ? !, or +node+'s own text ends with whitespace.
  def constrained?(node)
    own_text = node.to_s
    prev_char = node.at_xpath("preceding::node()[1]").to_s[-1]
    next_char = node.at_xpath("following::node()[1]").to_s[0]

    before = prev_char.nil? ||
      prev_char.match?(/\s/) ||
      own_text.match?(/\A\s/)

    after = next_char.nil? ||
      next_char.match?(/[\s,;".?!]/) ||
      own_text.match?(/\s\z/)

    before && after
  end
end
|
|
99
|
+
end
|
|
100
|
+
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
module Coradoc::ReverseAdoc
|
|
2
|
+
module Converters
|
|
3
|
+
# Converter for the `blockquote` element.
class Blockquote < Base
  # Builds a Block::Quote from the converted children. When the node has a
  # `cite` attribute, an attribute list of ("quote", cite) is attached;
  # otherwise the attributes are nil.
  #
  # NOTE(review): the node's `id` is not forwarded to the Quote element —
  # confirm whether it should be.
  def to_coradoc(node, state = {})
    cite = node["cite"]
    attributes = Coradoc::Element::AttributeList.new("quote", cite) unless cite.nil?

    # Trim the converted content and collapse runs of blank lines.
    content = Coradoc::ReverseAdoc.cleaner.remove_newlines(
      treat_children(node, state).strip,
    )
    Coradoc::Element::Block::Quote.new(nil, lines: content,
                                            attributes: attributes)
  end

  # Renders the result of #to_coradoc through the generator.
  def convert(node, state = {})
    Coradoc::Generator.gen_adoc(to_coradoc(node, state))
  end
end
|
|
24
|
+
|
|
25
|
+
register :blockquote, Blockquote.new
|
|
26
|
+
end
|
|
27
|
+
end
|