coradoc 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. checksums.yaml +4 -4
  2. data/.docker/Dockerfile +1 -1
  3. data/.docker/docker-compose.yml +2 -2
  4. data/.editorconfig +15 -0
  5. data/CHANGELOG.md +4 -0
  6. data/Rakefile +10 -0
  7. data/coradoc.gemspec +11 -2
  8. data/exe/reverse_adoc +70 -0
  9. data/exe/w2a +72 -0
  10. data/lib/coradoc/document.rb +5 -6
  11. data/lib/coradoc/element/admonition.rb +8 -6
  12. data/lib/coradoc/element/attribute_list.rb +2 -2
  13. data/lib/coradoc/element/audio.rb +1 -1
  14. data/lib/coradoc/element/author.rb +16 -14
  15. data/lib/coradoc/element/base.rb +0 -2
  16. data/lib/coradoc/element/block/core.rb +2 -2
  17. data/lib/coradoc/element/block/literal.rb +1 -1
  18. data/lib/coradoc/element/block/sourcecode.rb +2 -2
  19. data/lib/coradoc/element/image/core.rb +1 -0
  20. data/lib/coradoc/element/image.rb +0 -1
  21. data/lib/coradoc/element/inline/bold.rb +1 -0
  22. data/lib/coradoc/element/inline/highlight.rb +1 -0
  23. data/lib/coradoc/element/inline/image.rb +1 -0
  24. data/lib/coradoc/element/inline/italic.rb +1 -0
  25. data/lib/coradoc/element/inline/link.rb +9 -9
  26. data/lib/coradoc/element/inline/monospace.rb +1 -0
  27. data/lib/coradoc/element/inline/quotation.rb +1 -0
  28. data/lib/coradoc/element/inline/subscript.rb +1 -0
  29. data/lib/coradoc/element/inline/superscript.rb +1 -0
  30. data/lib/coradoc/element/inline.rb +0 -1
  31. data/lib/coradoc/element/list/core.rb +3 -4
  32. data/lib/coradoc/element/list.rb +0 -1
  33. data/lib/coradoc/element/list_item.rb +1 -1
  34. data/lib/coradoc/element/paragraph.rb +1 -1
  35. data/lib/coradoc/element/revision.rb +18 -16
  36. data/lib/coradoc/element/table.rb +10 -10
  37. data/lib/coradoc/element/text_element.rb +21 -15
  38. data/lib/coradoc/element/title.rb +2 -2
  39. data/lib/coradoc/element/video.rb +1 -1
  40. data/lib/coradoc/generator.rb +2 -2
  41. data/lib/coradoc/legacy_parser.rb +41 -41
  42. data/lib/coradoc/oscal.rb +2 -4
  43. data/lib/coradoc/parser/asciidoc/content.rb +15 -15
  44. data/lib/coradoc/parser/asciidoc/document_attributes.rb +1 -1
  45. data/lib/coradoc/parser/asciidoc/header.rb +6 -6
  46. data/lib/coradoc/parser/asciidoc/section.rb +1 -1
  47. data/lib/coradoc/reverse_adoc/LICENSE.txt +25 -0
  48. data/lib/coradoc/reverse_adoc/README.adoc +302 -0
  49. data/lib/coradoc/reverse_adoc/cleaner.rb +113 -0
  50. data/lib/coradoc/reverse_adoc/config.rb +54 -0
  51. data/lib/coradoc/reverse_adoc/converters/a.rb +42 -0
  52. data/lib/coradoc/reverse_adoc/converters/aside.rb +16 -0
  53. data/lib/coradoc/reverse_adoc/converters/audio.rb +29 -0
  54. data/lib/coradoc/reverse_adoc/converters/base.rb +100 -0
  55. data/lib/coradoc/reverse_adoc/converters/blockquote.rb +27 -0
  56. data/lib/coradoc/reverse_adoc/converters/br.rb +15 -0
  57. data/lib/coradoc/reverse_adoc/converters/bypass.rb +81 -0
  58. data/lib/coradoc/reverse_adoc/converters/code.rb +56 -0
  59. data/lib/coradoc/reverse_adoc/converters/div.rb +18 -0
  60. data/lib/coradoc/reverse_adoc/converters/drop.rb +22 -0
  61. data/lib/coradoc/reverse_adoc/converters/em.rb +55 -0
  62. data/lib/coradoc/reverse_adoc/converters/figure.rb +25 -0
  63. data/lib/coradoc/reverse_adoc/converters/h.rb +42 -0
  64. data/lib/coradoc/reverse_adoc/converters/head.rb +23 -0
  65. data/lib/coradoc/reverse_adoc/converters/hr.rb +15 -0
  66. data/lib/coradoc/reverse_adoc/converters/ignore.rb +16 -0
  67. data/lib/coradoc/reverse_adoc/converters/img.rb +93 -0
  68. data/lib/coradoc/reverse_adoc/converters/li.rb +17 -0
  69. data/lib/coradoc/reverse_adoc/converters/mark.rb +21 -0
  70. data/lib/coradoc/reverse_adoc/converters/math.rb +31 -0
  71. data/lib/coradoc/reverse_adoc/converters/ol.rb +64 -0
  72. data/lib/coradoc/reverse_adoc/converters/p.rb +23 -0
  73. data/lib/coradoc/reverse_adoc/converters/pass_through.rb +13 -0
  74. data/lib/coradoc/reverse_adoc/converters/pre.rb +55 -0
  75. data/lib/coradoc/reverse_adoc/converters/q.rb +16 -0
  76. data/lib/coradoc/reverse_adoc/converters/strong.rb +52 -0
  77. data/lib/coradoc/reverse_adoc/converters/sub.rb +16 -0
  78. data/lib/coradoc/reverse_adoc/converters/sup.rb +16 -0
  79. data/lib/coradoc/reverse_adoc/converters/table.rb +69 -0
  80. data/lib/coradoc/reverse_adoc/converters/td.rb +83 -0
  81. data/lib/coradoc/reverse_adoc/converters/text.rb +65 -0
  82. data/lib/coradoc/reverse_adoc/converters/th.rb +14 -0
  83. data/lib/coradoc/reverse_adoc/converters/tr.rb +22 -0
  84. data/lib/coradoc/reverse_adoc/converters/video.rb +29 -0
  85. data/lib/coradoc/reverse_adoc/converters.rb +32 -0
  86. data/lib/coradoc/reverse_adoc/errors.rb +10 -0
  87. data/lib/coradoc/reverse_adoc/html_converter.rb +61 -0
  88. data/lib/coradoc/reverse_adoc.rb +27 -0
  89. data/lib/coradoc/transformer.rb +24 -14
  90. data/lib/coradoc/version.rb +1 -1
  91. data/lib/reverse_adoc.rb +20 -0
  92. metadata +178 -4
@@ -0,0 +1,302 @@
1
+ = AsciiDoc from HTML and Microsoft Word: reverse_adoc
2
+
3
+ https://github.com/metanorma/reverse_adoc[reverse_adoc] image:https://img.shields.io/gem/v/reverse_adoc.svg["Gem Version", link="https://rubygems.org/gems/reverse_adoc"]::
4
+ image:https://github.com/metanorma/reverse_adoc/workflows/rake/badge.svg["Build Status", link="https://github.com/metanorma/reverse_adoc/actions?workflow=rake"]
5
+ image:https://codeclimate.com/github/metanorma/reverse_adoc/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/reverse_adoc"]
6
+ image:https://img.shields.io/github/issues-pr-raw/metanorma/reverse_adoc.svg["Pull Requests", link="https://github.com/metanorma/reverse_adoc/pulls"]
7
+ image:https://img.shields.io/github/commits-since/metanorma/reverse_adoc/latest.svg["Commits since latest",link="https://github.com/metanorma/reverse_adoc/releases"]
8
+
9
+ == Purpose
10
+
11
+ Transforms HTML and Microsoft Word into AsciiDoc.
12
+
13
+ Based on https://github.com/xijo/reverse_markdown
14
+
15
+
16
+ == Installation
17
+
18
+ Install the gem:
19
+
20
+ [source,console]
21
+ ----
22
+ [sudo] gem install reverse_adoc
23
+ ----
24
+
25
+ or add it to your `Gemfile`:
26
+
27
+ [source,ruby]
28
+ ----
29
+ gem 'reverse_adoc'
30
+ ----
31
+
32
+
33
+ == Command-line usage
34
+
35
+ === HTML to AsciiDoc: `reverse_adoc`
36
+
37
+ Convert HTML files to AsciiDoc:
38
+
39
+ [source,console]
40
+ ----
41
+ $ reverse_adoc file.html > file.adoc
42
+ $ cat file.html | reverse_adoc > file.adoc
43
+ ----
44
+
45
+
46
+ === Microsoft Word to AsciiDoc: `w2a`
47
+
48
+ Convert Word `.doc` or `.docx` files to AsciiDoc:
49
+
50
+ [source,console]
51
+ ----
52
+ $ w2a file.docx > file.adoc
53
+ ----
54
+
55
+ [source,console]
56
+ ----
57
+ $ w2a input.docx -o output.adoc
58
+ ----
59
+
60
+ Help:
61
+
62
+ [source,console]
63
+ ----
64
+ $ w2a -h
65
+ Usage: w2a [options] <file>
66
+ -a, --mathml2asciimath Convert MathML to AsciiMath
67
+ -o, --output=FILENAME Output file to write to
68
+ -e, --external-images Export images if data URI
69
+ -v, --version Version information
70
+ -h, --help Prints this help
71
+ ----
72
+
73
+
74
+ NOTE: `w2a` requires LibreOffice to be installed. It uses LibreOffice's
75
+ export to XHTML. LibreOffice's export of XHTML is superior to the native Microsoft Word export
76
+ to HTML: it exports lists (which Word keeps as paragraphs), and it exports OMML into MathML.
77
+ On the other hand, the LibreOffice export relies on default styling being used in the
78
+ document, and it may not cope with ordered lists or headings with customised appearance.
79
+ For best results, reset the styles in the document you're converting to those in
80
+ the default `Normal.dot` template.
81
+
82
+ NOTE: `w2a` requires the command-line version of LibreOffice, `soffice`. As it turns out,
83
+ LibreOffice v6 appears to render formulae in HTML as images instead of MathML expressions;
84
+ use LibreOffice v5. If you have both LibreOffice v5 and LibreOffice v6 installed, make sure
85
+ that your OS path searches for the LibreOffice v5 version of `soffice` first; e.g. on Mac,
86
+ include something like `/Applications/LibreOffice5.4.7.2.app/Contents/MacOS` in your PATH
87
+ environment.
88
+
89
+ NOTE: Some information in OMML is not preserved in the export to MathML from LibreOffice;
90
+ in particular, font shifts such as double-struck fonts.
91
+ The LibreOffice exporter does seem to drop some text (possibly associated with
92
+ MathML); use with caution.
93
+
94
+ NOTE: Adapted from `w2m` of
95
+ https://github.com/benbalter/word-to-markdown[Ben Balter's word-to-markdown]
96
+
97
+
98
+ === Common options
99
+
100
+
101
+ ==== MathML to AsciiMath conversion
102
+
103
+ If you wish to convert the MathML in the document to AsciiMath, run the script with the
104
+ `--mathml2asciimath` option:
105
+
106
+ [source,console]
107
+ ----
108
+ $ w2a --mathml2asciimath document.docx > document.adoc
109
+ ----
110
+
111
+
112
+ ==== Extracting images
113
+
114
+ Images referenced by the HTML can be extracted into the destination output folder by using:
115
+
116
+ [source,console]
117
+ ----
118
+ $ reverse_adoc input.html -o output/file.adoc -e
119
+ $ reverse_adoc input.html --output output/file.adoc --external-images
120
+ ----
121
+
122
+
123
+ Word embedded images can be extracted into the destination output folder by using:
124
+
125
+ [source,console]
126
+ ----
127
+ $ w2a input.docx -o output/file.adoc -e
128
+ $ w2a input.docx --output output/file.adoc --external-images
129
+ ----
130
+
131
+
132
+ ==== Handling unknown HTML tags
133
+
134
+ The `--unknown_tags` option allows you to specify how to handle unknown tags
135
+ (default `pass_through`).
136
+
137
+ Valid options are:
138
+
139
+ * `pass_through` - Include the unknown tag completely into the result
140
+ * `drop` - Drop the unknown tag and its content
141
+ * `bypass` - Ignore the unknown tag but try to convert its content
142
+ * `raise` - Raise an error to let you know
143
+
144
+
145
+ ==== Tagging of borders
146
+
147
+ Specify how to handle tag borders with the option `--tag_border` (default `' '`).
148
+
149
+ Valid options are:
150
+
151
+ * `' '` - Add whitespace if there is none at tag borders.
152
+ * `''` - Do not add whitespace.
153
+
154
+
155
+ == Features
156
+
157
+ === General
158
+
159
+ `reverse_adoc` shares features as a port of `reverse_markdown`:
160
+
161
+ * Module based -- if you miss a tag, just add it
162
+ * Can deal with nested lists
163
+ * Inline and block code is supported
164
+ * Supports blockquote
165
+
166
+ It supports the following HTML tags (these are supported by `reverse_markdown`):
167
+
168
+ * `a`
169
+ * `blockquote`
170
+ * `br`
171
+ * `code`, `tt` (added: `kbd`, `samp`, `var`)
172
+ * `div`, `article`
173
+ * `em`, `i` (added: `cite`)
174
+ * `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, `hr`
175
+ * `img`
176
+ * `li`, `ol`, `ul` (added: `dir`)
177
+ * `p`, `pre`
178
+ * `strong`, `b`
179
+ * `table`, `td`, `th`, `tr`
180
+
181
+ [NOTE]
182
+ ====
183
+ * reverse_adoc does *not* support `del` or `strike`, because Asciidoctor does not out of the box.
184
+ * As with reverse_markdown, `pre` is only treated as sourcecode if it is contained in a `div@class = highlight-` element, or has a `@brush` attribute naming the language (Confluence).
185
+ * The gem does not support `p@align`, because Asciidoctor doesn't
186
+ ====
187
+
188
+ In addition, it supports:
189
+
190
+ * `aside`
191
+ * `audio`, `video` (with `@src` attributes)
192
+ * `figure`, `figcaption`
193
+ * `mark`
194
+ * `q`
195
+ * `sub`, `sup`
196
+ * `@id` anchors
197
+ * `blockquote@cite`
198
+ * `img/@width`, `img/@height`
199
+ * `ol/@style`, `ol/@start`, `ol/@reversed`, `ul/@type`
200
+ * `td/@colspan`, `td/@rowspan`, `td/@align`, `td/@valign`
201
+ * `table/caption`, `table/@width`, `table/@frame` (partial), `table/@rules` (partial)
202
+ * Lists and paragraphs within cells
203
+ ** Not tables within cells: Asciidoctor cannot deal with nested tables
204
+
205
+ The gem does not support:
206
+
207
+ * `col`, `colgroup`
208
+ * `source`, `picture`
209
+ * `bdi`, `bdo`, `ruby`, `rt`, `rp`, `wbr`
210
+ * `frame`, `frameset`, `iframe`, `noframes`, `noscript`, `script`, `input`, `output`, `progress`
211
+ * `map`, `canvas`, `dialog`, `embed`, `object`, `param`, `svg`, `track`
212
+ * `fieldset`, `button`, `datalist`, `form`, `label`, `legend`, `menu`, `menulist`, `optgroup`, `option`, `select`, `textarea`
213
+ * `big`, `dfn`, `font`, `s`, `small`, `span`, `strike`, `u`
214
+ * `center`
215
+ * `data`, `meter`
216
+ * `del`, `ins`
217
+ * `footer`, `header`, `main`, `nav`, `details`, `section`, `summary`, `template`
218
+
219
+
220
+ === MathML support
221
+
222
+ If you are using this gem in the context of https://www.metanorma.com[Metanorma],
223
+ Metanorma AsciiDoc accepts MathML as a native mathematical format. So you do not need
224
+ to convert the MathML to AsciiMath.
225
+
226
+ The gem will optionally invoke the https://github.com/metanorma/mathml2asciimath
227
+ gem, to convert MathML to AsciiMath. The conversion is not perfect, and will need to be
228
+ post-edited; but it's a lot better than nothing.
229
+
230
+ NOTE: Asciidoctor does not support MathML input. HTML uses MathML.
231
+ The gem will recognize MathML expressions in HTML, and will wrap them in Asciidoctor
232
+ `stem:[ ]` macros. The result of this gem is not actually legal Asciidoctor for `stem`:
233
+ Asciidoctor will presumably
234
+ think this is AsciiMath in the `stem:[ ]` macro, try to pass it into MathJax as
235
+ AsciiMath, and fail. But of course, MathJax has no problem with MathML, and some postprocessing
236
+ on the Asciidoctor output can ensure that the MathML is treated by MathJax (or whatever else
237
+ uses the output) as such; so this is still much better than nothing for stem processing.
238
+
239
+ === Word cleanup
240
+
241
+ This gem is routinely used in the Metanorma project to export Word documents to AsciiDoc.
242
+ The HTML export from Word that the gem uses, from LibreOffice, is much cleaner than the
243
+ native HTML 4 export from Word; but it has some infelicities which this gem cleans up:
244
+
245
+ * The HTML export has trouble with subscripts, and routinely exports them as headings; the `w2a`
246
+ script tries to clean them up.
247
+ * The `w2a` script cleans up spaces, but it does not strip them.
248
+ * Spaces are removed from anchors and cross-references.
249
+ * Double underscores are removed from anchors and cross-references.
250
+ * Cross-references to `_GoBack` and to `_Toc` followed by numbers (used to construct tables of contents) are ignored.
251
+
252
+ == Ruby library usage
253
+
254
+ === General
255
+
256
+ Simple to use.
257
+
258
+ [source,ruby]
259
+ ----
260
+ require 'coradoc/reverse_adoc'
261
+
262
+ result = Coradoc::ReverseAdoc.convert input
263
+ result.inspect # " *feelings* "
264
+ ----
265
+
266
+ === Configure with options
267
+
268
+ Just pass your chosen configuration options in after the input. The given options will last for this operation only.
269
+
270
+ [source,ruby]
271
+ ----
272
+ require 'coradoc/reverse_adoc'
273
+
274
+ Coradoc::ReverseAdoc.convert(input, unknown_tags: :raise, mathml2asciimath: true)
275
+ ----
276
+
277
+
278
+ === Preconfigure using an initializer
279
+
280
+ Or configure it block style on an initializer level. These configurations will last for all conversions until they are set to something different.
281
+
282
+ [source,ruby]
283
+ ----
284
+ require 'coradoc/reverse_adoc'
285
+
286
+ Coradoc::ReverseAdoc.config do |config|
287
+ config.unknown_tags = :bypass
288
+ config.mathml2asciimath = true
289
+ config.tag_border = ''
290
+ end
291
+ ----
292
+
293
+
294
+ == Related stuff
295
+
296
+ * https://github.com/xijo/reverse_markdown[Xijo's original reverse_markdown gem]
297
+ * https://github.com/xijo/reverse_markdown/wiki/Write-your-own-converter[Write custom converters] - Wiki entry about how to write your own converter
298
+ * https://github.com/harlantwood/html_massage[html_massage] - A gem by Harlan T. Wood to convert regular sites into markdown using reverse_markdown
299
+ * https://github.com/benbalter/word-to-markdown[word-to-markdown] - Convert word docs into markdown while using reverse_markdown, by Ben Balter
300
+ * https://github.com/asciidocfx/HtmlToAsciidoc[HtmlToAsciidoc] - Javascript regexp-based converter of HTML to Asciidoctor
301
+ * https://asciidoctor.org/docs/user-manual/[The Asciidoctor User Manual]
302
+
@@ -0,0 +1,113 @@
1
+ module Coradoc::ReverseAdoc
2
# Text clean-up helpers: post-processing of generated AsciiDoc (+tidy+ and
# the steps it chains) and pre-processing of Word/LibreOffice HTML
# (+preprocess_word_html+).
class Cleaner
  # Runs the whole AsciiDoc clean-up pipeline.
  #
  # @param string [String] generated AsciiDoc; a copy is processed
  # @return [String] the tidied text
  def tidy(string)
    result = remove_inner_whitespaces(String.new(string))
    result = remove_newlines(result)
    result = remove_leading_newlines(result)
    result = clean_tag_borders(result)
    clean_punctuation_characters(result)
  end

  # Collapses every run of three or more newlines into exactly two.
  #
  # @param string [String]
  # @return [String]
  def remove_newlines(string)
    string.gsub(/\n{3,}/, "\n\n")
  end

  # Drops all newlines at the very beginning of the text.
  #
  # @param string [String]
  # @return [String]
  def remove_leading_newlines(string)
    string.gsub(/\A\n+/, "")
  end

  # Squeezes runs of spaces/tabs inside each line down to a single space
  # while keeping each line's leading and trailing whitespace, and tightens
  # the whitespace around +stem:[...]+ macros.
  #
  # The argument is never modified, so frozen strings are accepted.
  #
  # @param string [String, nil]
  # @return [String] the cleaned text; "" when +string+ is nil
  def remove_inner_whitespaces(string)
    return "" if string.nil?

    text = string
      # a stem macro at the start of a line loses its single leading space
      .gsub(/\n stem:\[/, "\nstem:[")
      # a newline directly after a stem macro becomes a space when more
      # text follows on the next line
      .gsub(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
      # no whitespace between a stem macro and a following "^" or "-"
      .gsub(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

    # Append in place: linear in the size of the text.
    text.each_line.each_with_object(+"") do |line, cleaned|
      cleaned << preserve_border_whitespaces(line) do
        line.strip.gsub(/[ \t]{2,}/, " ")
      end
    end
  end

  # Removes the space just inside the delimiters of "~~...~~" spans and of
  # "[...]" brackets while keeping the whitespace around them. For "~~"
  # spans a missing outer border is filled with the configured
  # +tag_border+. The corresponding rules for "**" and "__" spans are
  # currently disabled.
  #
  # @param string [String]
  # @return [String]
  def clean_tag_borders(string)
    result = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |match|
      preserve_border_whitespaces(match,
                                  default_border: Coradoc::ReverseAdoc.config.tag_border) do
        match.strip.sub("~~ ", "~~").sub(" ~~", "~~")
      end
    end

    result.gsub(/\s?\[.*?\]\s?/) do |match|
      preserve_border_whitespaces(match) do
        match.strip.sub("[ ", "[").sub(" ]", "]")
      end
    end
  end

  # Removes the single whitespace character between a "**", "~~" or "__"
  # delimiter and a directly following . ! ? ' or " character.
  #
  # @param string [String]
  # @return [String]
  def clean_punctuation_characters(string)
    string.gsub(/(\*\*|~~|__)\s([.!?'"])/, "\\1\\2")
  end

  # Cleans Word/LibreOffice HTML *before* it is converted (the other
  # public methods post-process generated AsciiDoc). Works on a copy.
  #
  # @param string [String] HTML source
  # @return [String] cleaned HTML
  def preprocess_word_html(string)
    clean_headings(scrub_whitespace(string.dup))
  end

  # Normalises whitespace in an HTML document.
  #
  # @param string [String] HTML source; modified in place
  # @return [String] the same string object
  def scrub_whitespace(string)
    string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, "&#xA0;") # every non-breaking space spelling -> &#xA0;
    string.sub!(/^\A[[:space:]]+/m, "")             # document leading whitespace
    string.sub!(/[[:space:]]+\z$/m, "")             # document trailing whitespace
    string.gsub!(/( +)$/, " ")                      # spaces at a line end collapse to one space
    string.gsub!(/\n\n\n\n/, "\n\n")                # quadruple line breaks
    string
  end

  # Repairs heading elements found in the exported HTML.
  #
  # @param string [String] HTML source; modified in place
  # @return [String] the same string object
  def clean_headings(string)
    # Empty headings carry no content; each one is replaced by a space.
    string.gsub!(%r{<h([1-9])[^>]*></h\1>}, " ")
    # Headings styled "vertical-align: super" are really superscripts.
    string.gsub!(%r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>},
                 "<sup>\\2</sup>")
    string
  end

  private

  # Yields to obtain the cleaned inner text and re-attaches the leading and
  # trailing whitespace of +string+ around it. Whitespace-only input is
  # returned untouched (the block is not called).
  #
  # @param string [String] the original fragment
  # @param options [Hash] +:default_border+ is used where the fragment has
  #   no leading/trailing whitespace of its own (default "")
  # @yieldreturn [String] the cleaned inner text
  # @return [String]
  def preserve_border_whitespaces(string, options = {})
    return string if /\A\s*\Z/.match?(string)

    default_border = options.fetch(:default_border, "")
    # If the string contains part of a link so the characters [,],(,)
    # then don't add any extra spaces
    default_border = "" if /[\[(\])]/.match?(string)
    string_start = present_or_default(string[/\A\s*/], default_border)
    string_end = present_or_default(string[/\s*\Z/], default_border)
    result = yield
    string_start + result + string_end
  end

  # @return [String] +string+, or +default+ when +string+ is nil or empty
  def present_or_default(string, default)
    return default if string.nil? || string.empty?

    string
  end
end
113
+ end
@@ -0,0 +1,54 @@
1
+ require "tmpdir"
2
+
3
+ module Coradoc::ReverseAdoc
4
# Settings for a conversion run.
#
# Values assigned through the writers stay in effect until changed. Values
# handed to #with are "inline" options: they take precedence over the
# stored values, but only while the block given to #with is running.
class Config
  attr_accessor :destination, :sourcedir, :image_counter,
                :image_counter_pattern, :input_format

  # The readers for these four settings are defined explicitly below
  # because they consult the inline options first.
  attr_writer :unknown_tags, :tag_border, :mathml2asciimath, :external_images

  def initialize
    @unknown_tags = :pass_through
    @input_format = :html
    @mathml2asciimath = false
    @external_images = false

    # Destination to save file and images
    @destination = nil

    # Source of HTML
    # @sourcedir = nil

    # Image counter, assuming there are max 999 images
    @image_counter = 1
    # pad with 0s
    @image_counter_pattern = "%03d"

    @em_delimiter = "_".freeze
    @strong_delimiter = "*".freeze
    @inline_options = {}
    @tag_border = " ".freeze
  end

  # Runs the block with +options+ taking precedence over the stored
  # settings. The inline options are always cleared afterwards, including
  # when the block raises, so they cannot leak into a later call.
  #
  # @param options [Hash, nil] per-call overrides, keyed by setting name
  # @return [Object] whatever the block returns
  def with(options = {})
    @inline_options = options || {}
    yield
  ensure
    @inline_options = {}
  end

  # @return [Object] inline +:unknown_tags+ if given, else the stored value
  def unknown_tags
    @inline_options[:unknown_tags] || @unknown_tags
  end

  # @return [Boolean] inline +:mathml2asciimath+ if given (an explicit
  #   +false+ is honoured), else the stored value
  def mathml2asciimath
    inline_flag(:mathml2asciimath, @mathml2asciimath)
  end

  # @return [Boolean] inline +:external_images+ if given (an explicit
  #   +false+ is honoured), else the stored value
  def external_images
    inline_flag(:external_images, @external_images)
  end

  # @return [String] inline +:tag_border+ if given, else the stored value
  def tag_border
    @inline_options[:tag_border] || @tag_border
  end

  private

  # Looks up a boolean inline option. Only a missing (nil) entry falls back
  # to the stored value; with a plain `||` an inline +false+ could never
  # switch off a setting that is stored as +true+.
  def inline_flag(key, stored)
    value = @inline_options[key]
    value.nil? ? stored : value
  end
end
54
+ end
@@ -0,0 +1,42 @@
1
+ require "coradoc"
2
+
3
module Coradoc::ReverseAdoc
  module Converters
    # Converter for <a> elements. Depending on the attributes present it
    # yields an anchor, a cross-reference, a link, or just the converted
    # children.
    class A < Base
      # @param node the <a> element (read via node["href"], node["id"], ...)
      # @param state [Hash] passed through to the child converters
      # @return "" for _Toc<digits>/_GoBack ids, an Inline::Anchor when the
      #   element has a non-empty id or name, an Inline::CrossReference for
      #   "#..." hrefs, the converted children when there is no href, and
      #   an Inline::Link otherwise
      def to_coradoc(node, state = {})
        label = treat_children(node, state)
        target = node["href"]
        title = extract_title(node)

        # Whitespace is dropped and runs of underscores become a single one.
        anchor_id = (node["id"] || node["name"])&.gsub(/\s/, "")&.gsub(/__+/, "_")

        return "" if /^_Toc\d+$|^_GoBack$/.match?(anchor_id)

        unless anchor_id.nil? || anchor_id.empty?
          return Coradoc::Element::Inline::Anchor.new(anchor_id)
        end

        if target.to_s.start_with?("#")
          # Same normalisation as for ids, after removing the leading "#".
          fragment = target.sub(/^#/, "").gsub(/\s/, "").gsub(/__+/, "_")
          Coradoc::Element::Inline::CrossReference.new(fragment, label)
        elsif target.to_s.empty?
          label
        else
          Coradoc::Element::Inline::Link.new(path: target,
                                             name: label,
                                             title: title)
        end
      end

      # Renders the result of #to_coradoc through Coradoc::Generator.
      def convert(node, state = {})
        element = to_coradoc(node, state)
        Coradoc::Generator.gen_adoc(element)
      end
    end

    register :a, A.new
  end
end
@@ -0,0 +1,16 @@
1
module Coradoc::ReverseAdoc
  module Converters
    # Converter for <aside> elements.
    class Aside < Base
      # Converts the children to text and wraps the individual lines of
      # that text in a Block::Side element.
      #
      # @param node the <aside> element
      # @param state [Hash] passed through to the child converters
      def to_coradoc(node, state = {})
        body_lines = treat_children(node, state).lines
        Coradoc::Element::Block::Side.new(lines: body_lines)
      end

      # Renders the result of #to_coradoc through Coradoc::Generator.
      def convert(node, state = {})
        side_block = to_coradoc(node, state)
        Coradoc::Generator.gen_adoc(side_block)
      end
    end

    register :aside, Aside.new
  end
end
@@ -0,0 +1,29 @@
1
module Coradoc::ReverseAdoc
  module Converters
    # Converter for <audio> elements.
    class Audio < Base
      # Builds an Element::Audio from the node's src, id and title; an
      # "options" attribute is added only when at least one playback
      # attribute is present on the node.
      #
      # @param node the <audio> element
      # @param _state [Hash] unused
      def to_coradoc(node, _state = {})
        attribute_list = Coradoc::Element::AttributeList.new
        playback = options(node)
        attribute_list.add_named("options", playback) if playback.any?

        Coradoc::Element::Audio.new(extract_title(node),
                                    id: node["id"],
                                    src: node["src"],
                                    attributes: attribute_list)
      end

      # Renders the result of #to_coradoc through Coradoc::Generator.
      def convert(node, state = {})
        audio = to_coradoc(node, state)
        Coradoc::Generator.gen_adoc(audio)
      end

      # Collects the values of the autoplay, loop and controls attributes,
      # in that order, skipping the ones the node does not have.
      # NOTE(review): these are the attribute *values*, not their names —
      # confirm that this is what the "options" attribute should contain.
      #
      # @return [Array]
      def options(node)
        %w[autoplay loop controls].map { |attr_name| node[attr_name] }.compact
      end
    end

    register :audio, Audio.new
  end
end
@@ -0,0 +1,100 @@
1
+ module Coradoc::ReverseAdoc
2
+ module Converters
3
# Common behaviour of the tag converters: walking child nodes, escaping
# text, and inspecting what surrounds a node.
class Base
  # Converts every child of +node+ with #treat and concatenates the text.
  #
  # @return [String]
  def treat_children(node, state)
    node.children.each_with_object(+"") do |child, output|
      output << treat(child, state)
    end
  end

  # Converts a single node using the converter looked up by its name.
  def treat(node, state)
    converter = Coradoc::ReverseAdoc::Converters.lookup(node.name)
    converter.convert(node, state)
  end

  # Like #treat_children, but collects the #treat_coradoc results in a
  # flat array from which nil and "" entries are removed.
  #
  # @return [Array]
  def treat_children_coradoc(node, state)
    converted = node.children.map { |child| treat_coradoc(child, state) }
    converted.flatten.reject { |item| item == "" || item.nil? }
  end

  # Calls +to_coradoc+ on the converter looked up by the node's name.
  def treat_coradoc(node, state)
    converter = Coradoc::ReverseAdoc::Converters.lookup(node.name)
    converter.to_coradoc(node, state)
  end

  # Backslash-escapes runs of "*" and "_" that directly follow or directly
  # precede whitespace; other occurrences are left alone.
  #
  # @param string [String]
  # @return [String]
  def escape_keychars(string)
    string.gsub(/((?<=\s)[\*_]+)|[\*_]+(?=\s)/) do |run|
      run.chars.map { |char| "\\#{char}" }.join
    end
  end

  # @return [String] the escaped title attribute prefixed with a space, or
  #   "" when the node has no (or an empty) title
  def extract_title(node)
    escaped = escape_keychars(node["title"].to_s)
    return "" if escaped.empty?

    " #{escaped}"
  end

  # @param name [String, Array<String>] one element name or a list of them
  # @return [Boolean, nil] whether an ancestor of +node+ has (one of) the
  #   given name(s); nil when +name+ is neither a String nor an Array
  def node_has_ancestor?(node, name)
    if name.is_a?(String)
      node.ancestors.map(&:name).include?(name)
    elsif name.is_a?(Array)
      (node.ancestors.map(&:name) & name).any?
    end
  end

  # @return [Boolean, nil] whether the text of the immediately preceding
  #   sibling ends with +str+; nil when +str+ is not a non-empty String
  def textnode_before_end_with?(node, str)
    return nil unless str.is_a?(String) && !str.empty?

    sibling = node.at_xpath("preceding-sibling::node()[1]")
    sibling.respond_to?(:text) && sibling.text.end_with?(str)
  end

  # Truthy when the node selected by "preceding::node()[1]" has non-blank
  # text whose last character is a word character.
  def unconstrained_before?(node)
    previous = node.at_xpath("preceding::node()[1]")
    return previous unless previous

    text = previous.text
    return false if text.strip.empty?

    text[-1]&.match?(/\w/)
  end

  # Truthy when the node selected by "following::node()[1]" has non-blank
  # text whose first character is a word character, a comma, a semicolon
  # or a double quote.
  #
  # TODO: This logic ought to be cleaned up.
  # NOTE(review): the last alternative, \.\?!, is the three-character
  # sequence ".?!" and so can never match the single character tested
  # here; possibly [.?!] was intended — confirm before changing.
  def unconstrained_after?(node)
    following = node.at_xpath("following::node()[1]")
    return following unless following

    text = following.text
    return false if text.strip.empty?

    text[0]&.match?(/\w|,|;|"|\.\?!/)
  end

  # True when both edges of the node count as bounded. The leading edge is
  # bounded when there is no preceding markup, when the preceding markup
  # ends in whitespace, or when the node's own markup starts with
  # whitespace. The trailing edge is the mirror image, additionally
  # accepting a comma, semicolon or double quote as the following
  # character.
  #
  # TODO: This logic ought to be cleaned up.
  # NOTE(review): as in #unconstrained_after?, the \.\?! alternative can
  # never match a single character.
  def constrained?(node)
    char_before = node.at_xpath("preceding::node()[1]").to_s[-1]
    leading_ok = char_before.nil? || char_before.match?(/\s/)
    leading_ok ||= node.to_s[0].to_s.match?(/\s/)

    char_after = node.at_xpath("following::node()[1]").to_s[0]
    trailing_ok = char_after.nil? || char_after.match?(/\s|,|;|"|\.\?!/)
    trailing_ok ||= node.to_s[-1].to_s.match?(/\s/)

    leading_ok && trailing_ok
  end
end
99
+ end
100
+ end
@@ -0,0 +1,27 @@
1
module Coradoc::ReverseAdoc
  module Converters
    # Converter for <blockquote> elements.
    class Blockquote < Base
      # Builds a Block::Quote from the converted children. When the element
      # has a cite attribute, the block gets the attribute list
      # ("quote", cite); otherwise no attribute list is passed.
      #
      # NOTE(review): the element's id is not propagated to the quote
      # block. The previous code read node["id"] and discarded the value;
      # that no-op statement has been removed. Confirm whether the id was
      # meant to be emitted.
      #
      # @param node the <blockquote> element
      # @param state [Hash] passed through to the child converters
      def to_coradoc(node, state = {})
        cite = node["cite"]
        attributes = unless cite.nil?
                       Coradoc::Element::AttributeList.new("quote", cite)
                     end

        # Surrounding whitespace is stripped and runs of blank lines are
        # collapsed before the text is placed in the block.
        content = treat_children(node, state).strip
        content = Coradoc::ReverseAdoc.cleaner.remove_newlines(content)

        Coradoc::Element::Block::Quote.new(nil, lines: content,
                                                attributes: attributes)
      end

      # Renders the result of #to_coradoc through Coradoc::Generator.
      def convert(node, state = {})
        Coradoc::Generator.gen_adoc(to_coradoc(node, state))
      end
    end

    register :blockquote, Blockquote.new
  end
end