coradoc 0.1.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (126) hide show
  1. checksums.yaml +4 -4
  2. data/.docker/Dockerfile +1 -1
  3. data/.docker/docker-compose.yml +2 -2
  4. data/.editorconfig +15 -0
  5. data/.pryrc.sample +1 -0
  6. data/CHANGELOG.md +4 -0
  7. data/Rakefile +10 -0
  8. data/coradoc.gemspec +11 -2
  9. data/exe/reverse_adoc +70 -0
  10. data/exe/w2a +72 -0
  11. data/lib/coradoc/document.rb +40 -23
  12. data/lib/coradoc/element/admonition.rb +13 -0
  13. data/lib/coradoc/{document → element}/attribute.rb +1 -1
  14. data/lib/coradoc/element/attribute_list.rb +46 -0
  15. data/lib/coradoc/element/audio.rb +22 -0
  16. data/lib/coradoc/element/author.rb +22 -0
  17. data/lib/coradoc/element/base.rb +15 -0
  18. data/lib/coradoc/element/block/core.rb +69 -0
  19. data/lib/coradoc/element/block/example.rb +22 -0
  20. data/lib/coradoc/element/block/literal.rb +19 -0
  21. data/lib/coradoc/element/block/quote.rb +19 -0
  22. data/lib/coradoc/element/block/side.rb +17 -0
  23. data/lib/coradoc/element/block/sourcecode.rb +20 -0
  24. data/lib/coradoc/element/block.rb +13 -0
  25. data/lib/coradoc/element/break.rb +11 -0
  26. data/lib/coradoc/{document/bibdata.rb → element/document_attributes.rb} +4 -8
  27. data/lib/coradoc/element/header.rb +20 -0
  28. data/lib/coradoc/element/image/block_image.rb +12 -0
  29. data/lib/coradoc/element/image/core.rb +25 -0
  30. data/lib/coradoc/element/image/inline_image.rb +12 -0
  31. data/lib/coradoc/element/image.rb +10 -0
  32. data/lib/coradoc/element/inline/anchor.rb +17 -0
  33. data/lib/coradoc/element/inline/bold.rb +20 -0
  34. data/lib/coradoc/element/inline/cross_reference.rb +22 -0
  35. data/lib/coradoc/element/inline/hard_line_break.rb +11 -0
  36. data/lib/coradoc/element/inline/highlight.rb +20 -0
  37. data/lib/coradoc/element/inline/image.rb +26 -0
  38. data/lib/coradoc/element/inline/italic.rb +20 -0
  39. data/lib/coradoc/element/inline/link.rb +26 -0
  40. data/lib/coradoc/element/inline/monospace.rb +20 -0
  41. data/lib/coradoc/element/inline/quotation.rb +18 -0
  42. data/lib/coradoc/element/inline/subscript.rb +18 -0
  43. data/lib/coradoc/element/inline/superscript.rb +18 -0
  44. data/lib/coradoc/element/inline.rb +18 -0
  45. data/lib/coradoc/element/list/core.rb +35 -0
  46. data/lib/coradoc/element/list/definition.rb +8 -0
  47. data/lib/coradoc/element/list/ordered.rb +15 -0
  48. data/lib/coradoc/element/list/unordered.rb +15 -0
  49. data/lib/coradoc/element/list.rb +12 -0
  50. data/lib/coradoc/element/list_item.rb +19 -0
  51. data/lib/coradoc/element/paragraph.rb +32 -0
  52. data/lib/coradoc/element/revision.rb +25 -0
  53. data/lib/coradoc/element/section.rb +37 -0
  54. data/lib/coradoc/element/table.rb +67 -0
  55. data/lib/coradoc/element/text_element.rb +32 -0
  56. data/lib/coradoc/{document → element}/title.rb +14 -5
  57. data/lib/coradoc/element/video.rb +23 -0
  58. data/lib/coradoc/generator.rb +17 -0
  59. data/lib/coradoc/legacy_parser.rb +48 -48
  60. data/lib/coradoc/oscal.rb +6 -5
  61. data/lib/coradoc/parser/asciidoc/base.rb +17 -17
  62. data/lib/coradoc/parser/asciidoc/content.rb +55 -29
  63. data/lib/coradoc/parser/asciidoc/document_attributes.rb +19 -0
  64. data/lib/coradoc/parser/asciidoc/header.rb +13 -10
  65. data/lib/coradoc/parser/asciidoc/section.rb +6 -6
  66. data/lib/coradoc/parser/base.rb +5 -5
  67. data/lib/coradoc/parser.rb +1 -1
  68. data/lib/coradoc/reverse_adoc/LICENSE.txt +25 -0
  69. data/lib/coradoc/reverse_adoc/README.adoc +302 -0
  70. data/lib/coradoc/reverse_adoc/cleaner.rb +113 -0
  71. data/lib/coradoc/reverse_adoc/config.rb +54 -0
  72. data/lib/coradoc/reverse_adoc/converters/a.rb +42 -0
  73. data/lib/coradoc/reverse_adoc/converters/aside.rb +16 -0
  74. data/lib/coradoc/reverse_adoc/converters/audio.rb +29 -0
  75. data/lib/coradoc/reverse_adoc/converters/base.rb +100 -0
  76. data/lib/coradoc/reverse_adoc/converters/blockquote.rb +27 -0
  77. data/lib/coradoc/reverse_adoc/converters/br.rb +15 -0
  78. data/lib/coradoc/reverse_adoc/converters/bypass.rb +81 -0
  79. data/lib/coradoc/reverse_adoc/converters/code.rb +56 -0
  80. data/lib/coradoc/reverse_adoc/converters/div.rb +18 -0
  81. data/lib/coradoc/reverse_adoc/converters/drop.rb +22 -0
  82. data/lib/coradoc/reverse_adoc/converters/em.rb +55 -0
  83. data/lib/coradoc/reverse_adoc/converters/figure.rb +25 -0
  84. data/lib/coradoc/reverse_adoc/converters/h.rb +42 -0
  85. data/lib/coradoc/reverse_adoc/converters/head.rb +23 -0
  86. data/lib/coradoc/reverse_adoc/converters/hr.rb +15 -0
  87. data/lib/coradoc/reverse_adoc/converters/ignore.rb +16 -0
  88. data/lib/coradoc/reverse_adoc/converters/img.rb +93 -0
  89. data/lib/coradoc/reverse_adoc/converters/li.rb +17 -0
  90. data/lib/coradoc/reverse_adoc/converters/mark.rb +21 -0
  91. data/lib/coradoc/reverse_adoc/converters/math.rb +31 -0
  92. data/lib/coradoc/reverse_adoc/converters/ol.rb +64 -0
  93. data/lib/coradoc/reverse_adoc/converters/p.rb +23 -0
  94. data/lib/coradoc/reverse_adoc/converters/pass_through.rb +13 -0
  95. data/lib/coradoc/reverse_adoc/converters/pre.rb +55 -0
  96. data/lib/coradoc/reverse_adoc/converters/q.rb +16 -0
  97. data/lib/coradoc/reverse_adoc/converters/strong.rb +52 -0
  98. data/lib/coradoc/reverse_adoc/converters/sub.rb +16 -0
  99. data/lib/coradoc/reverse_adoc/converters/sup.rb +16 -0
  100. data/lib/coradoc/reverse_adoc/converters/table.rb +69 -0
  101. data/lib/coradoc/reverse_adoc/converters/td.rb +83 -0
  102. data/lib/coradoc/reverse_adoc/converters/text.rb +65 -0
  103. data/lib/coradoc/reverse_adoc/converters/th.rb +14 -0
  104. data/lib/coradoc/reverse_adoc/converters/tr.rb +22 -0
  105. data/lib/coradoc/reverse_adoc/converters/video.rb +29 -0
  106. data/lib/coradoc/reverse_adoc/converters.rb +32 -0
  107. data/lib/coradoc/reverse_adoc/errors.rb +10 -0
  108. data/lib/coradoc/reverse_adoc/html_converter.rb +61 -0
  109. data/lib/coradoc/reverse_adoc.rb +27 -0
  110. data/lib/coradoc/transformer.rb +59 -46
  111. data/lib/coradoc/version.rb +1 -1
  112. data/lib/coradoc.rb +6 -4
  113. data/lib/reverse_adoc.rb +20 -0
  114. metadata +231 -23
  115. data/lib/coradoc/document/admonition.rb +0 -11
  116. data/lib/coradoc/document/author.rb +0 -11
  117. data/lib/coradoc/document/base.rb +0 -17
  118. data/lib/coradoc/document/block.rb +0 -34
  119. data/lib/coradoc/document/header.rb +0 -11
  120. data/lib/coradoc/document/list.rb +0 -14
  121. data/lib/coradoc/document/paragraph.rb +0 -19
  122. data/lib/coradoc/document/revision.rb +0 -11
  123. data/lib/coradoc/document/section.rb +0 -28
  124. data/lib/coradoc/document/table.rb +0 -20
  125. data/lib/coradoc/document/text_element.rb +0 -22
  126. data/lib/coradoc/parser/asciidoc/bibdata.rb +0 -19
@@ -0,0 +1,25 @@
1
+ BSD 2-Clause License
2
+
3
+ Copyright (c) 2018, Ribose
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ * Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ * Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,302 @@
1
+ = AsciiDoc from HTML and Microsoft Word: reverse_adoc
2
+
3
+ https://github.com/metanorma/reverse_adoc[reverse_adoc] image:https://img.shields.io/gem/v/reverse_adoc.svg["Gem Version", link="https://rubygems.org/gems/reverse_adoc"]::
4
+ image:https://github.com/metanorma/reverse_adoc/workflows/rake/badge.svg["Build Status", link="https://github.com/metanorma/reverse_adoc/actions?workflow=rake"]
5
+ image:https://codeclimate.com/github/metanorma/reverse_adoc/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/reverse_adoc"]
6
+ image:https://img.shields.io/github/issues-pr-raw/metanorma/reverse_adoc.svg["Pull Requests", link="https://github.com/metanorma/reverse_adoc/pulls"]
7
+ image:https://img.shields.io/github/commits-since/metanorma/reverse_adoc/latest.svg["Commits since latest",link="https://github.com/metanorma/reverse_adoc/releases"]
8
+
9
+ == Purpose
10
+
11
+ Transforms HTML and Microsoft Word into AsciiDoc.
12
+
13
+ Based on https://github.com/xijo/reverse_markdown
14
+
15
+
16
+ == Installation
17
+
18
+ Install the gem:
19
+
20
+ [source,console]
21
+ ----
22
+ [sudo] gem install reverse_adoc
23
+ ----
24
+
25
+ or add it to your `Gemfile`:
26
+
27
+ [source,ruby]
28
+ ----
29
+ gem 'reverse_adoc'
30
+ ----
31
+
32
+
33
+ == Command-line usage
34
+
35
+ === HTML to AsciiDoc: `reverse_adoc`
36
+
37
+ Convert HTML files to AsciiDoc:
38
+
39
+ [source,console]
40
+ ----
41
+ $ reverse_adoc file.html > file.adoc
42
+ $ cat file.html | reverse_adoc > file.adoc
43
+ ----
44
+
45
+
46
+ === Microsoft Word to AsciiDoc: `w2a`
47
+
48
+ Convert Word `.doc` or `.docx` files to AsciiDoc:
49
+
50
+ [source,console]
51
+ ----
52
+ $ w2a file.docx > file.adoc
53
+ ----
54
+
55
+ [source,console]
56
+ ----
57
+ $ w2a input.docx -o output.adoc
58
+ ----
59
+
60
+ Help:
61
+
62
+ [source,console]
63
+ ----
64
+ $ w2a -h
65
+ Usage: w2a [options] <file>
66
+ -a, --mathml2asciimath Convert MathML to AsciiMath
67
+ -o, --output=FILENAME Output file to write to
68
+ -e, --external-images Export images if data URI
69
+ -v, --version Version information
70
+ -h, --help Prints this help
71
+ ----
72
+
73
+
74
+ NOTE: `w2a` requires LibreOffice to be installed. It uses LibreOffice's
75
+ export to XHTML. LibreOffice's export of XHTML is superior to the native Microsoft Word export
76
+ to HTML: it exports lists (which Word keeps as paragraphs), and it exports OOMML into MathML.
77
+ On the other hand, the LibreOffice export relies on default styling being used in the
78
+ document, and it may not cope with ordered lists or headings with customised appearance.
79
+ For best results, reset the styles in the document you're converting to those in
80
+ the default `Normal.dot` template.
81
+
82
+ NOTE: `w2a` requires the command-line version of LibreOffice, `soffice`. As it turns out,
83
+ LibreOffice v6 appears to render formulae in HTML as images instead of MathML expressions;
84
+ use LibreOffice v5. If you have both LibreOffice v5 and LibreOffice v6 installed, make sure
85
+ that your OS path searches for the LibreOffice v5 version of `soffice` first; e.g. on Mac,
86
+ include something like `/Applications/LibreOffice5.4.7.2.app/Contents/MacOS` in your PATH
87
+ environment.
88
+
89
+ NOTE: Some information in OOMML is not preserved in the export to MathML from LibreOffice;
90
+ in particular, font shifts such as double-struck fonts.
91
+ The LibreOffice exporter does seem to drop some text (possibly associated with
92
+ MathML); use with caution.
93
+
94
+ NOTE: Adapted from `w2m` of
95
+ https://github.com/benbalter/word-to-markdown[Ben Balter's word-to-markdown]
96
+
97
+
98
+ === Common options
99
+
100
+
101
+ ==== MathML to AsciiMath conversion
102
+
103
+ If you wish to convert the MathML in the document to AsciiMath, run the script with the
104
+ `--mathml2asciimath` option:
105
+
106
+ [source,console]
107
+ ----
108
+ $ w2a --mathml2asciimath document.docx > document.adoc
109
+ ----
110
+
111
+
112
+ ==== Extracting images
113
+
114
+ Images referred by the HTML can be extracted into the destination output folder by using:
115
+
116
+ [source,console]
117
+ ----
118
+ $ reverse_adoc input.html -o output/file.adoc -e
119
+ $ reverse_adoc input.html --output output/file.adoc --external-images
120
+ ----
121
+
122
+
123
+ Word embedded images can be extracted into the destination output folder by using:
124
+
125
+ [source,console]
126
+ ----
127
+ $ w2a input.docx -o output/file.adoc -e
128
+ $ w2a input.docx --output output/file.adoc --external-images
129
+ ----
130
+
131
+
132
+ ==== Handling unknown HTML tags
133
+
134
+ The `--unknown_tags` option allows you to specify how to handle unknown tags
135
+ (default `pass_through`).
136
+
137
+ Valid options are:
138
+
139
+ * `pass_through` - Include the unknown tag completely into the result
140
+ * `drop` - Drop the unknown tag and its content
141
+ * `bypass` - Ignore the unknown tag but try to convert its content
142
+ * `raise` - Raise an error to let you know
143
+
144
+
145
+ ==== Tagging of borders
146
+
147
+ Specify how to handle tag borders with the option `--tag_border` (default `' '`).
148
+
149
+ Valid options are:
150
+
151
+ * `' '` - Add whitespace if there is none at tag borders.
152
+ * `''` - Do not add whitespace.
153
+
154
+
155
+ == Features
156
+
157
+ === General
158
+
159
+ `reverse_adoc` shares features as a port of `reverse_markdown`:
160
+
161
+ * Module based -- if you miss a tag, just add it
162
+ * Can deal with nested lists
163
+ * Inline and block code is supported
164
+ * Supports blockquote
165
+
166
+ It supports the following HTML tags (these are supported by `reverse_markdown`):
167
+
168
+ * `a`
169
+ * `blockquote`
170
+ * `br`
171
+ * `code`, `tt` (added: `kbd`, `samp`, `var`)
172
+ * `div`, `article`
173
+ * `em`, `i` (added: `cite`)
174
+ * `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, `hr`
175
+ * `img`
176
+ * `li`, `ol`, `ul` (added: `dir`)
177
+ * `p`, `pre`
178
+ * `strong`, `b`
179
+ * `table`, `td`, `th`, `tr`
180
+
181
+ [NOTE]
182
+ ====
183
+ * reverse_adoc does *not* support `del` or `strike`, because Asciidoctor does not out of the box.
184
+ * As with reverse_markdown, `pre` is only treated as sourcecode if it is contained in a `div@class = highlight-` element, or has a `@brush` attribute naming the language (Confluence).
185
+ * The gem does not support `p@align`, because Asciidoctor doesn't
186
+ ====
187
+
188
+ In addition, it supports:
189
+
190
+ * `aside`
191
+ * `audio`, `video` (with `@src` attributes)
192
+ * `figure`, `figcaption`
193
+ * `mark`
194
+ * `q`
195
+ * `sub`, `sup`
196
+ * `@id` anchors
197
+ * `blockquote@cite`
198
+ * `img/@width`, `img/@height`
199
+ * `ol/@style`, `ol/@start`, `ol/@reversed`, `ul/@type`
200
+ * `td/@colspan`, `td/@rowspan`, `td/@align`, `td/@valign`
201
+ * `table/caption`, `table/@width`, `table/@frame` (partial), `table/@rules` (partial)
202
+ * Lists and paragraphs within cells
203
+ ** Not tables within cells: Asciidoctor cannot deal with nested tables
204
+
205
+ The gem does not support:
206
+
207
+ * `col`, `colgroup`
208
+ * `source`, `picture`
209
+ * `bdi`, `bdo`, `ruby`, `rt`, `rp`, `wbr`
210
+ * `frame`, `frameset`, `iframe`, `noframes`, `noscript`, `script`, `input`, `output`, `progress`
211
+ * `map`, `canvas`, `dialog`, `embed`, `object`, `param`, `svg`, `track`
212
+ * `fieldset`, `button`, `datalist`, `form`, `label`, `legend`, `menu`, `menulist`, `optgroup`, `option`, `select`, `textarea`
213
+ * `big`, `dfn`, `font`, `s`, `small`, `span`, `strike`, `u`
214
+ * `center`
215
+ * `data`, `meter`
216
+ * `del`, `ins`
217
+ * `footer`, `header`, `main`, `nav`, `details`, `section`, `summary`, `template`
218
+
219
+
220
+ === MathML support
221
+
222
+ If you are using this gem in the context of https://www.metanorma.com[Metanorma],
223
+ Metanorma AsciiDoc accepts MathML as a native mathematical format. So you do not need
224
+ to convert the MathML to AsciiMath.
225
+
226
+ The gem will optionally invoke the https://github.com/metanorma/mathml2asciimath
227
+ gem, to convert MathML to AsciiMath. The conversion is not perfect, and will need to be
228
+ post-edited; but it's a lot better than nothing.
229
+
230
+ NOTE: Asciidoctor does not support MathML input. HTML uses MathML.
231
+ The gem will recognize MathML expressions in HTML, and will wrap them in Asciidoctor
232
+ `stem:[ ]` macros. The result of this gem is not actually legal Asciidoctor for `stem`:
233
+ Asciidoctor will presumably
234
+ think this is AsciiMath in the `stem:[ ]` macro, try to pass it into MathJax as
235
+ AsciiMath, and fail. But of course, MathJax has no problem with MathML, and some postprocessing
236
+ on the Asciidoctor output can ensure that the MathML is treated by MathJax (or whatever else
237
+ uses the output) as such; so this is still much better than nothing for stem processing.
238
+
239
+ === Word cleanup
240
+
241
+ This gem is routinely used in the Metanorma project to export Word documents to AsciiDoc.
242
+ The HTML export from Word that the gem uses, from LibreOffice, is much cleaner than the
243
+ native HTML 4 export from Word; but it has some infelicities which this gem cleans up:
244
+
245
+ * The HTML export has trouble with subscripts, and routinely exports them as headings; the `w2a`
246
+ script tries to clean them up.
247
+ * The `w2a` script cleans up spaces, but it does not strip them.
248
+ * Spaces are removed from anchors and cross-references.
249
+ * Double underscores are removed from anchors and cross-references.
250
+ * Cross-references to `_GoBack` and to `_Toc` followed by numbers (used to construct tables of contents) are ignored.
251
+
252
+ == Ruby library usage
253
+
254
+ === General
255
+
256
+ Simple to use.
257
+
258
+ [source,ruby]
259
+ ----
260
+ require 'coradoc/reverse_adoc'
261
+
262
+ result = Coradoc::ReverseAdoc.convert input
263
+ result.inspect # " *feelings* "
264
+ ----
265
+
266
+ === Configure with options
267
+
268
+ Just pass your chosen configuration options in after the input. The given options will last for this operation only.
269
+
270
+ [source,ruby]
271
+ ----
272
+ require 'coradoc/reverse_adoc'
273
+
274
+ Coradoc::ReverseAdoc.convert(input, unknown_tags: :raise, mathml2asciimath: true)
275
+ ----
276
+
277
+
278
+ === Preconfigure using an initializer
279
+
280
+ Or configure it block-style at the initializer level. These configurations will last for all conversions until they are set to something different.
281
+
282
+ [source,ruby]
283
+ ----
284
+ require 'coradoc/reverse_adoc'
285
+
286
+ Coradoc::ReverseAdoc.config do |config|
287
+ config.unknown_tags = :bypass
288
+ config.mathml2asciimath = true
289
+ config.tag_border = ''
290
+ end
291
+ ----
292
+
293
+
294
+ == Related stuff
295
+
296
+ * https://github.com/xijo/reverse_markdown[Xijo's original reverse_markdown gem]
297
+ * https://github.com/xijo/reverse_markdown/wiki/Write-your-own-converter[Write custom converters] - Wiki entry about how to write your own converter
298
+ * https://github.com/harlantwood/html_massage[html_massage] - A gem by Harlan T. Wood to convert regular sites into markdown using reverse_markdown
299
+ * https://github.com/benbalter/word-to-markdown[word-to-markdown] - Convert word docs into markdown while using reverse_markdown, by Ben Balter
300
+ * https://github.com/asciidocfx/HtmlToAsciidoc[HtmlToAsciidoc] - Javascript regexp-based converter of HTML to Asciidoctor
301
+ * https://asciidoctor.org/docs/user-manual/[The Asciidoctor User Manual]
302
+
@@ -0,0 +1,113 @@
1
+ module Coradoc::ReverseAdoc
2
+ class Cleaner
3
# Runs the full clean-up pipeline over a piece of generated text.
#
# The stages run in a fixed order, each receiving the previous stage's
# return value. A fresh copy of the input is handed to the first stage so
# the caller's string is left alone.
#
# @param string [String] text to clean up
# @return [String] the value returned by the last stage
def tidy(string)
  stages = %i[
    remove_inner_whitespaces
    remove_newlines
    remove_leading_newlines
    clean_tag_borders
    clean_punctuation_characters
  ]
  stages.reduce(String.new(string)) { |text, stage| send(stage, text) }
end
10
+
11
# Collapses every run of three or more consecutive newlines down to two.
#
# @param string [String]
# @return [String] a new string; the argument is not modified
def remove_newlines(string)
  surplus_newlines = /\n{3,}/
  string.gsub(surplus_newlines, "\n\n")
end
14
+
15
# Drops any newlines found at the very beginning of the text.
#
# The pattern is anchored to the start of the string, so it can match at
# most once and a single substitution is enough.
#
# @param string [String]
# @return [String] a new string; the argument is not modified
def remove_leading_newlines(string)
  string.sub(/\A\n+/, "")
end
18
+
19
# Tightens whitespace around `stem:[...]` macros and collapses runs of
# spaces/tabs inside every line, while keeping each line's own leading and
# trailing whitespace.
#
# The `stem:[...]` substitutions edit +string+ in place, so callers that
# need their original untouched must pass a copy.
#
# @param string [String, nil] text to clean; nil is treated as empty text
# @return [String] the cleaned text ("" when +string+ is nil)
def remove_inner_whitespaces(string)
  # The original guarded only the substitutions against nil and then
  # crashed on `each_line`; return early so nil input is really tolerated.
  return "" if string.nil?

  # A stem macro indented by one space at the start of a line loses the indent.
  string.gsub!(/\n stem:\[/, "\nstem:[")
  # A stem macro followed by a line break and more text stays on one line.
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
  # No whitespace between a stem macro and a following "^" or "-".
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

  # Build the result with map/join instead of repeated `memo + line`, which
  # copied the accumulated text once per line.
  string.each_line.map do |line|
    preserve_border_whitespaces(line) { line.strip.gsub(/[ \t]{2,}/, " ") }
  end.join
end
31
+
32
# Tidies the whitespace just inside two kinds of delimited spans:
#
# * spans enclosed by runs of two or more tildes ("~~ text ~~" -> "~~text~~");
#   a missing outer border is filled with the configured tag border
# * spans enclosed by square brackets ("[ text ]" -> "[text]")
#
# Whitespace that already surrounds a span on the outside is kept.
# The same treatment for asterisk and underscore runs is currently
# disabled; the original code is retained below for reference.
#
# @param string [String]
# @return [String] a new string with the span borders cleaned
def clean_tag_borders(string)
  # result = string.gsub(/\s?\*{2,}.*?\*{2,}\s?/) do |match|
  #   preserve_border_whitespaces(match, default_border: Coradoc::ReverseAdoc.config.tag_border) do
  #     match.strip.sub("** ", "**").sub(" **", "**")
  #   end
  # end

  # result = string.gsub(/\s?_{2,}.*?_{2,}\s?/) do |match|
  #   preserve_border_whitespaces(match, default_border: Coradoc::ReverseAdoc.config.tag_border) do
  #     match.strip.sub("__ ", "__").sub(" __", "__")
  #   end
  # end

  tilde_span = /\s?~{2,}.*?~{2,}\s?/
  bracket_span = /\s?\[.*?\]\s?/

  tildes_cleaned = string.gsub(tilde_span) do |span|
    border = Coradoc::ReverseAdoc.config.tag_border
    preserve_border_whitespaces(span, default_border: border) do
      span.strip.sub("~~ ", "~~").sub(" ~~", "~~")
    end
  end

  tildes_cleaned.gsub(bracket_span) do |span|
    preserve_border_whitespaces(span) do
      span.strip.sub("[ ", "[").sub(" ]", "]")
    end
  end
end
62
+
63
# Removes the single whitespace character that separates a closing
# "**", "~~" or "__" marker from a following punctuation character
# (one of . ! ? ' ").
#
# The replacement is written as a plain backreference string. The previous
# form interpolated `'\\1'.strip`, which strips the two-character literal
# itself (a no-op) rather than the captured text.
#
# @param string [String]
# @return [String] a new string; the argument is not modified
def clean_punctuation_characters(string)
  string.gsub(/(\*\*|~~|__)\s([.!?'"])/, '\1\2')
end
66
+
67
# Prepares HTML before conversion (unlike the other clean-up methods, which
# post-process generated text): scrubs whitespace and then cleans headings.
#
# Both helpers are handed a duplicate, so the caller's string is untouched.
#
# @param string [String] HTML source
# @return [String] the value returned by the heading clean-up
def preprocess_word_html(string)
  scrubbed = scrub_whitespace(string.dup)
  clean_headings(scrubbed)
end
71
+
72
# Normalises whitespace in HTML text, editing +string+ in place.
#
# @param string [String] mutable HTML text
# @return [String] the same string object, after the edits
def scrub_whitespace(string)
  edits = [
    # every spelling of a non-breaking space becomes the numeric entity
    [:gsub!, /&nbsp;|&#xA0;|\u00a0/i, "&#xA0;"],
    # whitespace at the very start of the document
    [:sub!, /^\A[[:space:]]+/m, ""],
    # whitespace at the very end of the document
    [:sub!, /[[:space:]]+\z$/m, ""],
    # a run of spaces at the end of a line becomes one space
    [:gsub!, /( +)$/, " "],
    # four consecutive line breaks become two
    [:gsub!, /\n\n\n\n/, "\n\n"],
  ]
  # string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs

  edits.each do |operation, pattern, replacement|
    string.public_send(operation, pattern, replacement)
  end
  string
end
81
+
82
# Repairs two heading oddities in the HTML, editing +string+ in place.
# The original author's notes attribute both to LibreOffice's export.
#
# * a heading element (h1-h9) with no content is replaced by a single space
# * a heading whose style attribute starts with "vertical-align: super;"
#   is rewritten as a <sup> element carrying the heading's content
#
# @param string [String] mutable HTML text
# @return [String] the same string object, after the edits
def clean_headings(string)
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  superscript_heading = %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}

  string.gsub!(empty_heading, " ")
  string.gsub!(superscript_heading, "<sup>\\2</sup>")
  string
end
91
+
92
+ private
93
+
94
# Wraps the block's result in the leading and trailing whitespace found on
# +string+, so a transformation of the inner text keeps the outer spacing.
#
# Whitespace-only input is returned as is and the block is not called.
# Where the input has no whitespace on a side, the :default_border option
# (default "") is used instead, except when the input contains any of the
# characters [ ] ( ), in which case nothing is added.
#
# @param string [String] the original text, borders included
# @param options [Hash] may carry :default_border
# @yieldreturn [String] the transformed inner text
# @return [String]
def preserve_border_whitespaces(string, options = {})
  return string if /\A\s*\Z/.match?(string)

  requested_border = options.fetch(:default_border, "")
  # Link-like text must not gain extra spaces around it.
  fallback = /[\[(\])]/.match?(string) ? "" : requested_border

  leading, trailing = [string[/\A\s*/], string[/\s*\Z/]].map do |edge|
    edge.nil? || edge.empty? ? fallback : edge
  end

  leading + yield + trailing
end
106
+
107
# Returns +string+ unless it is nil or empty, in which case +default+ is
# returned instead.
#
# @param string [String, nil]
# @param default [Object] value used when +string+ is nil or empty
# @return [Object]
def present_or_default(string, default)
  missing = string.nil? || string.empty?
  missing ? default : string
end
112
+ end
113
+ end
@@ -0,0 +1,54 @@
1
+ require "tmpdir"
2
+
3
+ module Coradoc::ReverseAdoc
4
+ class Config
5
+ attr_accessor :unknown_tags, :tag_border, :mathml2asciimath, :external_images,
6
+ :destination, :sourcedir, :image_counter, :image_counter_pattern, :input_format
7
+
8
# Sets every option to its default value.
def initialize
  # Conversion behaviour
  @input_format = :html
  @unknown_tags = :pass_through
  @mathml2asciimath = false
  @external_images = false

  # Destination to save file and images
  @destination = nil

  # Source of HTML
  # @sourcedir = nil

  # Image numbering starts at 1 and is zero-padded to three digits
  # (the original note assumes at most 999 images).
  @image_counter = 1
  @image_counter_pattern = "%03d"

  # Markup delimiters and the border inserted around tags
  @em_delimiter = "_".freeze
  @strong_delimiter = "*".freeze
  @tag_border = " ".freeze

  # Per-call overrides; empty outside of a `with` block
  @inline_options = {}
end
30
+
31
# Applies +options+ as per-call overrides while the block runs.
#
# The overrides that were active before the call are put back in an
# `ensure` clause, so they are restored even when the block raises.
# Previously an exception left the per-call options in place for every
# later conversion. Outside of nested calls the previous value is the empty
# hash, which matches the old reset.
#
# @param options [Hash] overrides consulted by the option readers
# @yield runs with the overrides active
# @return [Object] the block's return value
def with(options = {})
  previous_options = @inline_options
  @inline_options = options
  yield
ensure
  @inline_options = previous_options
end
37
+
38
# Strategy for unknown tags: the per-call override when one is set,
# otherwise the configured value.
#
# @return [Object]
def unknown_tags
  per_call = @inline_options[:unknown_tags]
  per_call || @unknown_tags
end
41
+
42
# Whether MathML should be converted to AsciiMath: the per-call override
# when one is given, otherwise the configured value.
#
# A per-call `false` is honoured. The previous `override || configured`
# form discarded it, so the option could not be switched off for a single
# call once it was enabled globally. A nil override still falls back to the
# configured value.
#
# @return [Boolean]
def mathml2asciimath
  per_call = @inline_options[:mathml2asciimath]
  per_call.nil? ? @mathml2asciimath : per_call
end
45
+
46
# Whether images should be exported to external files: the per-call
# override when one is given, otherwise the configured value.
#
# A per-call `false` is honoured. The previous `override || configured`
# form discarded it, so the option could not be switched off for a single
# call once it was enabled globally. A nil override still falls back to the
# configured value.
#
# @return [Boolean]
def external_images
  per_call = @inline_options[:external_images]
  per_call.nil? ? @external_images : per_call
end
49
+
50
# Border string placed around tags: the per-call override when one is set,
# otherwise the configured value. An empty string is a valid override.
#
# @return [String]
def tag_border
  per_call = @inline_options[:tag_border]
  per_call || @tag_border
end
53
+ end
54
+ end
@@ -0,0 +1,42 @@
1
+ require "coradoc"
2
+
3
+ module Coradoc::ReverseAdoc
4
+ module Converters
5
+ class A < Base
6
# Maps an <a> element to one of: an anchor, a cross-reference, a link, the
# bare converted children, or an empty string.
#
# Decision order:
# 1. id/name (whitespace removed, underscore runs collapsed) matching
#    _Toc<digits> or _GoBack -> ""
# 2. any other non-empty id/name -> inline anchor
# 3. href starting with "#" -> cross-reference to the cleaned fragment
# 4. empty or missing href -> the converted children alone
# 5. otherwise -> link carrying the href, the children and the title
#
# @param node [#[]] the element; attributes are read with node["..."]
# @param state [Hash] conversion state handed on to the children
# @return [Object] see the list above
def to_coradoc(node, state = {})
  label = treat_children(node, state)
  target = node["href"]
  title = extract_title(node)

  anchor_id = (node["id"] || node["name"])&.gsub(/\s/, "")&.gsub(/__+/, "_")
  return "" if /^_Toc\d+$|^_GoBack$/.match?(anchor_id)
  return Coradoc::Element::Inline::Anchor.new(anchor_id) unless anchor_id.nil? || anchor_id.empty?

  target_text = target.to_s
  if target_text.start_with?("#")
    fragment = target.sub(/^#/, "").gsub(/\s/, "").gsub(/__+/, "_")
    return Coradoc::Element::Inline::CrossReference.new(fragment, label)
  end
  return label if target_text.empty?

  Coradoc::Element::Inline::Link.new(path: target, name: label, title: title)
end
34
+
35
# Converts the node to its element form and renders that as AsciiDoc.
#
# @param node [Object] the node to convert
# @param state [Hash] conversion state
# @return [Object] whatever the generator returns
def convert(node, state = {})
  element = to_coradoc(node, state)
  Coradoc::Generator.gen_adoc(element)
end
38
+ end
39
+
40
+ register :a, A.new
41
+ end
42
+ end
@@ -0,0 +1,16 @@
1
+ module Coradoc::ReverseAdoc
2
+ module Converters
3
+ class Aside < Base
4
# Builds a side block from the converted children of the node, split into
# lines.
#
# @param node [Object] the node to convert
# @param state [Hash] conversion state handed on to the children
# @return [Coradoc::Element::Block::Side]
def to_coradoc(node, state = {})
  body_lines = treat_children(node, state).lines
  Coradoc::Element::Block::Side.new(lines: body_lines)
end
8
+
9
# Converts the node to its element form and renders that as AsciiDoc.
#
# @param node [Object] the node to convert
# @param state [Hash] conversion state
# @return [Object] whatever the generator returns
def convert(node, state = {})
  element = to_coradoc(node, state)
  Coradoc::Generator.gen_adoc(element)
end
12
+ end
13
+
14
+ register :aside, Aside.new
15
+ end
16
+ end
@@ -0,0 +1,29 @@
1
+ module Coradoc::ReverseAdoc
2
+ module Converters
3
+ class Audio < Base
4
# Builds an audio element from the node's src, id and title attributes.
# The values collected by #options are attached as a named "options"
# attribute when there are any.
#
# @param node [#[]] the element; attributes are read with node["..."]
# @param _state [Hash] unused
# @return [Coradoc::Element::Audio]
def to_coradoc(node, _state = {})
  source = node["src"]
  anchor = node["id"]
  caption = extract_title(node)

  attribute_list = Coradoc::Element::AttributeList.new.tap do |list|
    flags = options(node)
    list.add_named("options", flags) if flags.any?
  end

  Coradoc::Element::Audio.new(caption, id: anchor, src: source,
                                       attributes: attribute_list)
end
14
+
15
# Converts the node to its element form and renders that as AsciiDoc.
#
# @param node [Object] the node to convert
# @param state [Hash] conversion state
# @return [Object] whatever the generator returns
def convert(node, state = {})
  element = to_coradoc(node, state)
  Coradoc::Generator.gen_adoc(element)
end
18
+
19
# Collects the values of the autoplay, loop and controls attributes, in
# that order, leaving out any attribute that is absent (nil).
#
# @param node [#[]] the element; attributes are read with node["..."]
# @return [Array] the attribute values that are present
def options(node)
  %w[autoplay loop controls].map { |attribute| node[attribute] }.compact
end
25
+ end
26
+
27
+ register :audio, Audio.new
28
+ end
29
+ end
@@ -0,0 +1,100 @@
1
+ module Coradoc::ReverseAdoc
2
+ module Converters
3
+ class Base
4
# Converts each child of the node in order and concatenates the results
# into one string.
#
# @param node [#children] the parent node
# @param state [Hash] conversion state handed on to each child
# @return [String] "" when the node has no children
def treat_children(node, state)
  node.children.each_with_object(+"") do |child, output|
    output << treat(child, state)
  end
end
9
+
10
# Looks up the converter registered for the node's name and has it convert
# the node.
#
# @param node [#name] the node to convert
# @param state [Hash] conversion state
# @return [Object] the converter's result
def treat(node, state)
  converter = Coradoc::ReverseAdoc::Converters.lookup(node.name)
  converter.convert(node, state)
end
13
+
14
# Converts each child of the node to its element form and returns the
# results as one flat list, with nil and empty-string entries removed.
#
# @param node [#children] the parent node
# @param state [Hash] conversion state handed on to each child
# @return [Array]
def treat_children_coradoc(node, state)
  converted = node.children.map { |child| treat_coradoc(child, state) }
  converted.flatten.reject { |element| element.nil? || element == "" }
end
19
+
20
# Looks up the converter registered for the node's name and has it build
# the node's element form.
#
# @param node [#name] the node to convert
# @param state [Hash] conversion state
# @return [Object] the converter's result
def treat_coradoc(node, state)
  converter = Coradoc::ReverseAdoc::Converters.lookup(node.name)
  converter.to_coradoc(node, state)
end
23
+
24
# Backslash-escapes runs of "*" and "_" that sit next to whitespace: a run
# directly after a whitespace character, or directly before one. Runs
# embedded in a word are left alone.
#
# @param string [String]
# @return [String] a new string; the argument is not modified
def escape_keychars(string)
  escapes = { "*" => '\*', "_" => '\_' }
  string.gsub(/((?<=\s)[\*_]+)|[\*_]+(?=\s)/) do |run|
    run.gsub(/[*_]/, escapes)
  end
end
33
+
34
# Returns the node's title attribute, escaped and prefixed with a single
# space, or "" when there is no title.
#
# @param node [#[]] the element; the title is read with node["title"]
# @return [String]
def extract_title(node)
  escaped = escape_keychars(node["title"].to_s)
  return "" if escaped.empty?

  " #{escaped}"
end
38
+
39
# Tells whether the node has an ancestor with the given name, or with any
# of the given names when an array is passed.
#
# @param node [#ancestors] the node to inspect
# @param name [String, Array<String>] name or names to look for
# @return [Boolean, nil] nil when +name+ is neither a String nor an Array
def node_has_ancestor?(node, name)
  if name.is_a?(String)
    node.ancestors.any? { |ancestor| ancestor.name == name }
  elsif name.is_a?(Array)
    ancestor_names = node.ancestors.map(&:name)
    (ancestor_names & name).any?
  end
end
47
+
48
# Tells whether the node's immediately preceding sibling has text ending
# with +str+.
#
# @param node [#at_xpath] the node to inspect
# @param str [String] the suffix to look for
# @return [Boolean, nil] nil when +str+ is not a non-empty String; false
#   when there is no preceding sibling that responds to #text
def textnode_before_end_with?(node, str)
  return nil unless str.is_a?(String) && !str.empty?

  previous = node.at_xpath("preceding-sibling::node()[1]")
  return false unless previous.respond_to?(:text)

  previous.text.end_with?(str)
end
54
+
55
# Tells whether the node immediately preceding this one in document order
# has non-blank text whose last character is a word character.
#
# @param node [#at_xpath] the node to inspect
# @return [Boolean, nil] nil when there is no preceding node
def unconstrained_before?(node)
  before = node.at_xpath("preceding::node()[1]")
  return before unless before

  text = before.text
  return false if text.strip.empty?

  text[-1]&.match?(/\w/)
end
62
+
63
# Tells whether the node immediately following this one in document order
# has non-blank text starting with a word character or with one of the
# punctuation characters , ; " . ? !
#
# The previous pattern ended in the alternative `\.\?!`, which needs the
# three-character sequence ".?!" and so could never match the single
# character being tested; ".", "?" and "!" were silently ignored. They are
# now matched individually through a character class.
#
# TODO: This logic ought to be cleaned up.
#
# @param node [#at_xpath] the node to inspect
# @return [Boolean, nil] nil when there is no following node
def unconstrained_after?(node)
  after = node.at_xpath("following::node()[1]")
  return after unless after

  text = after.text
  return false if text.strip.empty?

  text[0].match?(/[\w,;".?!]/)
end
70
+
71
+ # def trailing_whitespace?(node)
72
+
73
# Tells whether the node is delimited on both sides.
#
# The leading side counts as delimited when there is no preceding
# character, when that character is whitespace, or when the node's own
# string form starts with whitespace. The trailing side counts as delimited
# when there is no following character, when that character is whitespace
# or one of , ; " . ? !, or when the node's own string form ends with
# whitespace.
#
# The previous pattern ended in the alternative `\.\?!`, which needs the
# three-character sequence ".?!" and so could never match the single
# character being tested; a node followed by ".", "?" or "!" was therefore
# never treated as delimited. The three characters are now matched
# individually.
#
# TODO: This logic ought to be cleaned up.
#
# @param node [#at_xpath, #to_s] the node to inspect
# @return [Boolean]
def constrained?(node)
  own_text = node.to_s
  previous_char = node.at_xpath("preceding::node()[1]").to_s[-1]
  next_char = node.at_xpath("following::node()[1]").to_s[0]

  leading_ok = previous_char.nil? ||
    previous_char.match?(/\s/) ||
    own_text.match?(/\A\s/)

  trailing_ok = next_char.nil? ||
    next_char.match?(/[\s,;".?!]/) ||
    own_text.match?(/\s\z/)

  leading_ok && trailing_ok
end
98
+ end
99
+ end
100
+ end