coradoc 0.2.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/.docker/Dockerfile +1 -1
  3. data/.docker/docker-compose.yml +2 -2
  4. data/.editorconfig +15 -0
  5. data/CHANGELOG.md +4 -0
  6. data/README.md +4 -0
  7. data/Rakefile +10 -0
  8. data/coradoc.gemspec +11 -2
  9. data/exe/reverse_adoc +91 -0
  10. data/exe/w2a +72 -0
  11. data/lib/coradoc/document.rb +6 -6
  12. data/lib/coradoc/element/admonition.rb +8 -6
  13. data/lib/coradoc/element/attribute.rb +2 -2
  14. data/lib/coradoc/element/attribute_list.rb +94 -15
  15. data/lib/coradoc/element/audio.rb +14 -3
  16. data/lib/coradoc/element/author.rb +18 -14
  17. data/lib/coradoc/element/base.rb +69 -8
  18. data/lib/coradoc/element/block/core.rb +10 -6
  19. data/lib/coradoc/element/block/literal.rb +1 -1
  20. data/lib/coradoc/element/block/quote.rb +1 -1
  21. data/lib/coradoc/element/block/sourcecode.rb +2 -2
  22. data/lib/coradoc/element/break.rb +1 -1
  23. data/lib/coradoc/element/document_attributes.rb +6 -6
  24. data/lib/coradoc/element/header.rb +4 -2
  25. data/lib/coradoc/element/image/block_image.rb +13 -2
  26. data/lib/coradoc/element/image/core.rb +35 -5
  27. data/lib/coradoc/element/image/inline_image.rb +2 -2
  28. data/lib/coradoc/element/image.rb +0 -1
  29. data/lib/coradoc/element/inline/anchor.rb +4 -2
  30. data/lib/coradoc/element/inline/bold.rb +10 -4
  31. data/lib/coradoc/element/inline/cross_reference.rb +4 -2
  32. data/lib/coradoc/element/inline/hard_line_break.rb +1 -1
  33. data/lib/coradoc/element/inline/highlight.rb +12 -6
  34. data/lib/coradoc/element/inline/italic.rb +10 -4
  35. data/lib/coradoc/element/inline/link.rb +26 -10
  36. data/lib/coradoc/element/inline/monospace.rb +10 -4
  37. data/lib/coradoc/element/inline/quotation.rb +4 -1
  38. data/lib/coradoc/element/inline/subscript.rb +5 -2
  39. data/lib/coradoc/element/inline/superscript.rb +5 -2
  40. data/lib/coradoc/element/inline.rb +0 -1
  41. data/lib/coradoc/element/list/core.rb +10 -8
  42. data/lib/coradoc/element/list/definition.rb +19 -0
  43. data/lib/coradoc/element/list/ordered.rb +1 -1
  44. data/lib/coradoc/element/list/unordered.rb +1 -1
  45. data/lib/coradoc/element/list.rb +1 -1
  46. data/lib/coradoc/element/list_item.rb +9 -4
  47. data/lib/coradoc/element/list_item_definition.rb +32 -0
  48. data/lib/coradoc/element/paragraph.rb +5 -3
  49. data/lib/coradoc/element/revision.rb +20 -16
  50. data/lib/coradoc/element/section.rb +21 -4
  51. data/lib/coradoc/element/table.rb +36 -19
  52. data/lib/coradoc/element/text_element.rb +63 -17
  53. data/lib/coradoc/element/title.rb +27 -7
  54. data/lib/coradoc/element/video.rb +33 -6
  55. data/lib/coradoc/generator.rb +2 -2
  56. data/lib/coradoc/legacy_parser.rb +41 -41
  57. data/lib/coradoc/oscal.rb +2 -4
  58. data/lib/coradoc/parser/asciidoc/content.rb +15 -15
  59. data/lib/coradoc/parser/asciidoc/document_attributes.rb +1 -1
  60. data/lib/coradoc/parser/asciidoc/header.rb +6 -6
  61. data/lib/coradoc/parser/asciidoc/section.rb +1 -1
  62. data/lib/coradoc/reverse_adoc/LICENSE.txt +25 -0
  63. data/lib/coradoc/reverse_adoc/README.adoc +308 -0
  64. data/lib/coradoc/reverse_adoc/cleaner.rb +125 -0
  65. data/lib/coradoc/reverse_adoc/config.rb +73 -0
  66. data/lib/coradoc/reverse_adoc/converters/a.rb +47 -0
  67. data/lib/coradoc/reverse_adoc/converters/aside.rb +12 -0
  68. data/lib/coradoc/reverse_adoc/converters/audio.rb +25 -0
  69. data/lib/coradoc/reverse_adoc/converters/base.rb +104 -0
  70. data/lib/coradoc/reverse_adoc/converters/blockquote.rb +18 -0
  71. data/lib/coradoc/reverse_adoc/converters/br.rb +11 -0
  72. data/lib/coradoc/reverse_adoc/converters/bypass.rb +77 -0
  73. data/lib/coradoc/reverse_adoc/converters/code.rb +19 -0
  74. data/lib/coradoc/reverse_adoc/converters/div.rb +14 -0
  75. data/lib/coradoc/reverse_adoc/converters/dl.rb +55 -0
  76. data/lib/coradoc/reverse_adoc/converters/drop.rb +22 -0
  77. data/lib/coradoc/reverse_adoc/converters/em.rb +17 -0
  78. data/lib/coradoc/reverse_adoc/converters/figure.rb +21 -0
  79. data/lib/coradoc/reverse_adoc/converters/h.rb +38 -0
  80. data/lib/coradoc/reverse_adoc/converters/head.rb +19 -0
  81. data/lib/coradoc/reverse_adoc/converters/hr.rb +11 -0
  82. data/lib/coradoc/reverse_adoc/converters/ignore.rb +16 -0
  83. data/lib/coradoc/reverse_adoc/converters/img.rb +98 -0
  84. data/lib/coradoc/reverse_adoc/converters/li.rb +13 -0
  85. data/lib/coradoc/reverse_adoc/converters/mark.rb +15 -0
  86. data/lib/coradoc/reverse_adoc/converters/markup.rb +27 -0
  87. data/lib/coradoc/reverse_adoc/converters/math.rb +31 -0
  88. data/lib/coradoc/reverse_adoc/converters/ol.rb +60 -0
  89. data/lib/coradoc/reverse_adoc/converters/p.rb +19 -0
  90. data/lib/coradoc/reverse_adoc/converters/pass_through.rb +13 -0
  91. data/lib/coradoc/reverse_adoc/converters/pre.rb +51 -0
  92. data/lib/coradoc/reverse_adoc/converters/q.rb +12 -0
  93. data/lib/coradoc/reverse_adoc/converters/strong.rb +16 -0
  94. data/lib/coradoc/reverse_adoc/converters/sub.rb +18 -0
  95. data/lib/coradoc/reverse_adoc/converters/sup.rb +18 -0
  96. data/lib/coradoc/reverse_adoc/converters/table.rb +280 -0
  97. data/lib/coradoc/reverse_adoc/converters/td.rb +77 -0
  98. data/lib/coradoc/reverse_adoc/converters/text.rb +28 -0
  99. data/lib/coradoc/reverse_adoc/converters/th.rb +14 -0
  100. data/lib/coradoc/reverse_adoc/converters/tr.rb +18 -0
  101. data/lib/coradoc/reverse_adoc/converters/video.rb +25 -0
  102. data/lib/coradoc/reverse_adoc/converters.rb +53 -0
  103. data/lib/coradoc/reverse_adoc/errors.rb +10 -0
  104. data/lib/coradoc/reverse_adoc/html_converter.rb +150 -0
  105. data/lib/coradoc/reverse_adoc/plugin.rb +131 -0
  106. data/lib/coradoc/reverse_adoc/plugins/plateau.rb +174 -0
  107. data/lib/coradoc/reverse_adoc/postprocessor.rb +148 -0
  108. data/lib/coradoc/reverse_adoc.rb +30 -0
  109. data/lib/coradoc/transformer.rb +24 -14
  110. data/lib/coradoc/version.rb +1 -1
  111. data/lib/reverse_adoc.rb +20 -0
  112. metadata +184 -5
  113. data/lib/coradoc/element/inline/image.rb +0 -25
@@ -50,13 +50,13 @@ module Coradoc
50
50
  block_style("=")
51
51
  end
52
52
 
53
- def block_style(delimiter="*", repeater = 4)
53
+ def block_style(delimiter = "*", repeater = 4)
54
54
  block_title.maybe >>
55
- newline.maybe >>
56
- block_type.maybe >>
57
- str(delimiter).repeat(repeater).as(:delimiter) >> newline >>
58
- text_line.repeat(1).as(:lines) >>
59
- str(delimiter).repeat(repeater) >> newline
55
+ newline.maybe >>
56
+ block_type.maybe >>
57
+ str(delimiter).repeat(repeater).as(:delimiter) >> newline >>
58
+ text_line.repeat(1).as(:lines) >>
59
+ str(delimiter).repeat(repeater) >> newline
60
60
  end
61
61
 
62
62
  def block_type
@@ -65,7 +65,7 @@ module Coradoc
65
65
 
66
66
  def highlight
67
67
  text_id >> newline >>
68
- underline >> highlight_text >> newline
68
+ underline >> highlight_text >> newline
69
69
  end
70
70
 
71
71
  def underline
@@ -79,14 +79,14 @@ module Coradoc
79
79
  # Table
80
80
  def table
81
81
  block_title >>
82
- str("|===") >> line_ending >>
83
- table_row.repeat(1).as(:rows) >>
84
- str("|===") >> line_ending
82
+ str("|===") >> line_ending >>
83
+ table_row.repeat(1).as(:rows) >>
84
+ str("|===") >> line_ending
85
85
  end
86
86
 
87
87
  def table_row
88
- (literal_space? >> str("|") >> (cell_content | empty_cell_content)).
89
- repeat(1).as(:cols) >> line_ending
88
+ (literal_space? >> str("|") >> (cell_content | empty_cell_content))
89
+ .repeat(1).as(:cols) >> line_ending
90
90
  end
91
91
 
92
92
  def empty_cell_content
@@ -98,7 +98,7 @@ module Coradoc
98
98
  end
99
99
 
100
100
  def literal_space
101
- (match[' '] | match[' \t']).repeat(1)
101
+ (match[" "] | match[' \t']).repeat(1)
102
102
  end
103
103
 
104
104
  # Override
@@ -113,7 +113,7 @@ module Coradoc
113
113
  # Text
114
114
  def text_line
115
115
  (asciidoc_char_with_id.absent? | text_id) >> literal_space? >>
116
- text.as(:text) >> line_ending.as(:break)
116
+ text.as(:text) >> line_ending.as(:break)
117
117
  end
118
118
 
119
119
  def asciidoc_char
@@ -137,7 +137,7 @@ module Coradoc
137
137
 
138
138
  def glossary
139
139
  keyword.as(:key) >> str("::") >> space? >>
140
- text.as(:value) >> line_ending.as(:break)
140
+ text.as(:value) >> line_ending.as(:break)
141
141
  end
142
142
 
143
143
  def ordered_list
@@ -11,7 +11,7 @@ module Coradoc
11
11
 
12
12
  def document_attributes
13
13
  str(":") >> attribute_name.as(:key) >> str(":") >>
14
- space? >> attribute_value.as(:value) >> line_ending
14
+ space? >> attribute_value.as(:value) >> line_ending
15
15
  end
16
16
  end
17
17
  end
@@ -8,8 +8,8 @@ module Coradoc
8
8
 
9
9
  def header
10
10
  header_title >>
11
- author.maybe.as(:author) >>
12
- revision.maybe.as(:revision) >> newline.maybe
11
+ author.maybe.as(:author) >>
12
+ revision.maybe.as(:revision) >> newline.maybe
13
13
  end
14
14
 
15
15
  def header_title
@@ -18,14 +18,14 @@ module Coradoc
18
18
 
19
19
  def author
20
20
  words.as(:first_name) >> str(",") >>
21
- space? >> words.as(:last_name) >>
22
- space? >> str("<") >> email.as(:email) >> str(">") >> newline
21
+ space? >> words.as(:last_name) >>
22
+ space? >> str("<") >> email.as(:email) >> str(">") >> newline
23
23
  end
24
24
 
25
25
  def revision
26
26
  (word >> (str(".") >> word).maybe).as(:number) >>
27
- str(",") >> space? >> date.as(:date ) >> str(":") >>
28
- space? >> words.as(:remark) >> newline
27
+ str(",") >> space? >> date.as(:date) >> str(":") >>
28
+ space? >> words.as(:remark) >> newline
29
29
  end
30
30
  end
31
31
  end
@@ -23,7 +23,7 @@ module Coradoc
23
23
  # Heading
24
24
  def section_title(level = 2, max_level = 8)
25
25
  match("=").repeat(level, max_level).as(:level) >>
26
- space? >> text.as(:text) >> endline.as(:break)
26
+ space? >> text.as(:text) >> endline.as(:break)
27
27
  end
28
28
 
29
29
  # section
@@ -0,0 +1,25 @@
1
+ BSD 2-Clause License
2
+
3
+ Copyright (c) 2018, Ribose
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ * Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ * Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,308 @@
1
+ = AsciiDoc from HTML and Microsoft Word: reverse_adoc
2
+
3
+ == Purpose
4
+
5
+ Transforms HTML and Microsoft Word into AsciiDoc.
6
+
7
+ Based on https://github.com/xijo/reverse_markdown
8
+
9
+ reverse_adoc used to be a separate Gem, but now it's part of Coradoc.
10
+
11
+
12
+ == Installation
13
+
14
+ Install the gem:
15
+
16
+ [source,console]
17
+ ----
18
+ [sudo] gem install coradoc
19
+ ----
20
+
21
+ or add it to your `Gemfile`:
22
+
23
+ [source,ruby]
24
+ ----
25
+ gem 'coradoc'
26
+ ----
27
+
28
+
29
+ == Command-line usage
30
+
31
+ === HTML to AsciiDoc: `reverse_adoc`
32
+
33
+ Convert HTML files to AsciiDoc:
34
+
35
+ [source,console]
36
+ ----
37
+ $ reverse_adoc file.html > file.adoc
38
+ $ cat file.html | reverse_adoc > file.adoc
39
+ ----
40
+
41
+
42
+ === Microsoft Word to AsciiDoc: `w2a`
43
+
44
+ Convert Word `.doc` or `.docx` files to AsciiDoc:
45
+
46
+ [source,console]
47
+ ----
48
+ $ w2a file.docx > file.adoc
49
+ ----
50
+
51
+ [source,console]
52
+ ----
53
+ $ w2a input.docx -o output.adoc
54
+ ----
55
+
56
+ Help:
57
+
58
+ [source,console]
59
+ ----
60
+ $ w2a -h
61
+ Usage: w2a [options] <file>
62
+ -a, --mathml2asciimath Convert MathML to AsciiMath
63
+ -o, --output=FILENAME Output file to write to
64
+ -e, --external-images Export images if data URI
65
+ -v, --version Version information
66
+ -h, --help Prints this help
67
+ ----
68
+
69
+
70
+ NOTE: `w2a` requires LibreOffice to be installed. It uses LibreOffice's
71
+ export to XHTML. LibreOffice's export of XHTML is superior to the native Microsoft Word export
72
+ to HTML: it exports lists (which Word keeps as paragraphs), and it exports OOMML into MathML.
73
+ On the other hand, the LibreOffice export relies on default styling being used in the
74
+ document, and it may not cope with ordered lists or headings with customised appearance.
75
+ For best results, reset the styles in the document you're converting to those in
76
+ the default `Normal.dot` template.
77
+
78
+ NOTE: `w2a` requires the command-line version of LibreOffice, `soffice`. As it turns out,
79
+ LibreOffice v6 appears to render formulae in HTML as images instead of MathML expressions;
80
+ use LibreOffice v5. If you have both LibreOffice v5 and LibreOffice v6 installed, make sure
81
+ that your OS path searches for the LibreOffice v5 version of `soffice` first; e.g. on Mac,
82
+ include something like `/Applications/LibreOffice5.4.7.2.app/Contents/MacOS` in your PATH
83
+ environment.
84
+
85
+ NOTE: Some information in OOMML is not preserved in the export to MathML from LibreOffice;
86
+ in particular, font shifts such as double-struck fonts.
87
+ The LibreOffice exporter does seem to drop some text (possibly associated with
88
+ MathML); use with caution.
89
+
90
+ NOTE: Adapted from `w2m` of
91
+ https://github.com/benbalter/word-to-markdown[Ben Balter's word-to-markdown]
92
+
93
+
94
+ === Common options
95
+
96
+
97
+ ==== MathML to AsciiMath conversion
98
+
99
+ If you wish to convert the MathML in the document to AsciiMath, run the script with the
100
+ `--mathml2asciimath` option:
101
+
102
+ [source,console]
103
+ ----
104
+ $ w2a --mathml2asciimath document.docx > document.adoc
105
+ ----
106
+
107
+
108
+ ==== Extracting images
109
+
110
+ Images referenced by the HTML can be extracted into the destination output folder by using:
111
+
112
+ [source,console]
113
+ ----
114
+ $ reverse_adoc input.html -o output/file.adoc -e
114
+ $ reverse_adoc input.html --output output/file.adoc --external-images
116
+ ----
117
+
118
+
119
+ Word embedded images can be extracted into the destination output folder by using:
120
+
121
+ [source,console]
122
+ ----
123
+ $ w2a input.docx -o output/file.adoc -e
124
+ $ w2a input.docx --output output/file.adoc --external-images
125
+ ----
126
+
127
+
128
+ ==== Handling unknown HTML tags
129
+
130
+ The `--unknown_tags` option allows you to specify how to handle unknown tags
131
+ (default `pass_through`).
132
+
133
+ Valid options are:
134
+
135
+ * `pass_through` - Include the unknown tag completely into the result
136
+ * `drop` - Drop the unknown tag and its content
137
+ * `bypass` - Ignore the unknown tag but try to convert its content
138
+ * `raise` - Raise an error to let you know
139
+
140
+
141
+ ==== Tagging of borders
142
+
143
+ Specify how to handle tag borders with the option `--tag_border` (default `' '`).
144
+
145
+ Valid options are:
146
+
147
+ * `' '` - Add whitespace if there is none at tag borders.
148
+ * `''` - Do not add whitespace.
149
+
150
+
151
+ == Features
152
+
153
+ === General
154
+
155
+ As a port of `reverse_markdown`, `reverse_adoc` shares its general features:
156
+
157
+ * Module based -- if you miss a tag, just add it
158
+ * Can deal with nested lists
159
+ * Inline and block code is supported
160
+ * Supports blockquote
161
+
162
+ It supports the following HTML tags (these are supported by `reverse_markdown`):
163
+
164
+ * `a`
165
+ * `blockquote`
166
+ * `br`
167
+ * `code`, `tt` (added: `kbd`, `samp`, `var`)
168
+ * `div`, `article`
169
+ * `em`, `i` (added: `cite`)
170
+ * `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, `hr`
171
+ * `img`
172
+ * `li`, `ol`, `ul` (added: `dir`)
173
+ * `p`, `pre`
174
+ * `strong`, `b`
175
+ * `table`, `td`, `th`, `tr`
176
+
177
+ [NOTE]
178
+ ====
179
+ * reverse_adoc does *not* support `del` or `strike`, because Asciidoctor does not support them out of the box.
180
+ * As with reverse_markdown, `pre` is only treated as sourcecode if it is contained in a `div@class = highlight-` element, or has a `@brush` attribute naming the language (Confluence).
181
+ * The gem does not support `p@align`, because Asciidoctor doesn't.
182
+ ====
183
+
184
+ In addition, it supports:
185
+
186
+ * `aside`
187
+ * `audio`, `video` (with `@src` attributes)
188
+ * `figure`, `figcaption`
189
+ * `mark`
190
+ * `q`
191
+ * `sub`, `sup`
192
+ * `@id` anchors
193
+ * `blockquote@cite`
194
+ * `img/@width`, `img/@height`
195
+ * `ol/@style`, `ol/@start`, `ol/@reversed`, `ul/@type`
196
+ * `td/@colspan`, `td/@rowspan`, `td/@align`, `td/@valign`
197
+ * `table/caption`, `table/@width`, `table/@frame` (partial), `table/@rules` (partial)
198
+ * Lists and paragraphs within cells
199
+ ** Not tables within cells: Asciidoctor cannot deal with nested tables
200
+
201
+ The gem does not support:
202
+
203
+ * `col`, `colgroup`
204
+ * `source`, `picture`
205
+ * `bdi`, `bdo`, `ruby`, `rt`, `rp`, `wbr`
206
+ * `frame`, `frameset`, `iframe`, `noframes`, `noscript`, `script`, `input`, `output`, `progress`
207
+ * `map`, `canvas`, `dialog`, `embed`, `object`, `param`, `svg`, `track`
208
+ * `fieldset`, `button`, `datalist`, `form`, `label`, `legend`, `menu`, `menulist`, `optgroup`, `option`, `select`, `textarea`
209
+ * `big`, `dfn`, `font`, `s`, `small`, `span`, `strike`, `u`
210
+ * `center`
211
+ * `data`, `meter`
212
+ * `del`, `ins`
213
+ * `footer`, `header`, `main`, `nav`, `details`, `section`, `summary`, `template`
214
+
215
+
216
+ === MathML support
217
+
218
+ If you are using this gem in the context of https://www.metanorma.com[Metanorma],
219
+ Metanorma AsciiDoc accepts MathML as a native mathematical format. So you do not need
220
+ to convert the MathML to AsciiMath.
221
+
222
+ The gem will optionally invoke the https://github.com/metanorma/mathml2asciimath
223
+ gem, to convert MathML to AsciiMath. The conversion is not perfect, and will need to be
224
+ post-edited; but it's a lot better than nothing.
225
+
226
+ NOTE: Asciidoctor does not support MathML input. HTML uses MathML.
227
+ The gem will recognize MathML expressions in HTML, and will wrap them in Asciidoctor
228
+ `stem:[ ]` macros. The result of this gem is not actually legal Asciidoctor for `stem`:
229
+ Asciidoctor will presumably
230
+ think this is AsciiMath in the `stem:[ ]` macro, try to pass it into MathJax as
231
+ AsciiMath, and fail. But of course, MathJax has no problem with MathML, and some postprocessing
232
+ on the Asciidoctor output can ensure that the MathML is treated by MathJax (or whatever else
233
+ uses the output) as such; so this is still much better than nothing for stem processing.
234
+
235
+ === Word cleanup
236
+
237
+ This gem is routinely used in the Metanorma project to export Word documents to AsciiDoc.
238
+ The HTML export from Word that the gem uses, from LibreOffice, is much cleaner than the
239
+ native HTML 4 export from Word; but it has some infelicities which this gem cleans up:
240
+
241
+ * The HTML export has trouble with subscripts, and routinely exports them as headings; the `w2a`
242
+ script tries to clean them up.
243
+ * The `w2a` script cleans up spaces, but it does not strip them.
244
+ * Spaces are removed from anchors and cross-references.
245
+ * Double underscores are removed from anchors and cross-references.
246
+ * Cross-references to `_GoBack` and to `_Toc` followed by numbers (used to construct tables of contents) are ignored.
247
+
248
+ == Ruby library usage
249
+
250
+ === General
251
+
252
+ Simple to use.
253
+
254
+ [source,ruby]
255
+ ----
256
+ require 'coradoc/reverse_adoc'
257
+
258
+ result = Coradoc::ReverseAdoc.convert input
259
+ result.inspect # " *feelings* "
260
+ ----
261
+
262
+ === Configure with options
263
+
264
+ Just pass your chosen configuration options in after the input. The given options will last for this operation only.
265
+
266
+ [source,ruby]
267
+ ----
268
+ require 'coradoc/reverse_adoc'
269
+
270
+ Coradoc::ReverseAdoc.convert(input, unknown_tags: :raise, mathml2asciimath: true)
271
+ ----
272
+
273
+
274
+ === Preconfigure using an initializer
275
+
276
+ Alternatively, configure it block-style at the initializer level. These settings persist for all conversions until they are set to something different.
277
+
278
+ [source,ruby]
279
+ ----
280
+ require 'coradoc/reverse_adoc'
281
+
282
+ Coradoc::ReverseAdoc.config do |config|
283
+ config.unknown_tags = :bypass
284
+ config.mathml2asciimath = true
285
+ config.tag_border = ''
286
+ end
287
+ ----
288
+
289
+ === Convert HTML to a Coradoc AST
290
+
291
+ [source,ruby]
292
+ ----
293
+ require 'coradoc/reverse_adoc'
294
+
295
+ # Options can be supplied as keyword arguments
296
+ Coradoc::ReverseAdoc::HtmlConverter.to_coradoc("<b><i>Some input</i></b>")
297
+ ----
298
+
299
+
300
+ == Related stuff
301
+
302
+ * https://github.com/xijo/reverse_markdown[Xijo's original reverse_markdown gem]
303
+ * https://github.com/xijo/reverse_markdown/wiki/Write-your-own-converter[Write custom converters] - Wiki entry about how to write your own converter
304
+ * https://github.com/harlantwood/html_massage[html_massage] - A gem by Harlan T. Wood to convert regular sites into markdown using reverse_markdown
305
+ * https://github.com/benbalter/word-to-markdown[word-to-markdown] - Convert word docs into markdown while using reverse_markdown, by Ben Balter
306
+ * https://github.com/asciidocfx/HtmlToAsciidoc[HtmlToAsciidoc] - Javascript regexp-based converter of HTML to Asciidoctor
307
+ * https://asciidoctor.org/docs/user-manual/[The Asciidoctor User Manual]
308
+
@@ -0,0 +1,125 @@
1
module Coradoc
  module ReverseAdoc
    # Whitespace and markup clean-up passes used around the conversion.
    #
    # * #tidy chains the text clean-up passes (whitespace, newlines, tag
    #   borders, punctuation) over a converted string.
    # * #preprocess_word_html cleans HTML *before* it is converted.
    class Cleaner
      # Runs every clean-up pass, in a fixed order, over a copy of +string+.
      # Each pass is wrapped in HtmlConverter.track_time with a label.
      #
      # @param string [String] text to clean; it is copied, not modified
      # @return [String] the cleaned text
      def tidy(string)
        result = HtmlConverter.track_time("Removing inner whitespace") do
          remove_inner_whitespaces(String.new(string))
        end
        result = HtmlConverter.track_time("Removing newlines") do
          remove_newlines(result)
        end
        result = HtmlConverter.track_time("Removing leading newlines") do
          remove_leading_newlines(result)
        end
        result = HtmlConverter.track_time("Cleaning tag borders") do
          clean_tag_borders(result)
        end
        HtmlConverter.track_time("Cleaning punctuation characters") do
          clean_punctuation_characters(result)
        end
      end

      # Collapses runs of three or more newlines into exactly two.
      #
      # @param string [String]
      # @return [String] a new string
      def remove_newlines(string)
        string.gsub(/\n{3,}/, "\n\n")
      end

      # Drops all newlines at the very start of +string+.
      #
      # @param string [String]
      # @return [String] a new string
      def remove_leading_newlines(string)
        string.gsub(/\A\n+/, "")
      end

      # Normalises whitespace inside each line.
      #
      # First tidies the whitespace around +stem:[...]+ macros, then, line by
      # line, collapses every run of two or more spaces/tabs into one space
      # while keeping the line's own leading and trailing whitespace.
      #
      # The argument is never modified (frozen strings are accepted), and
      # +nil+ yields an empty string instead of raising.
      #
      # @param string [String, nil]
      # @return [String] a new, unfrozen string
      def remove_inner_whitespaces(string)
        return +"" if string.nil?

        text = string
          .gsub(/\n stem:\[/, "\nstem:[")
          .gsub(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
          .gsub(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

        text.each_line.with_object(+"") do |line, result|
          collapsed = preserve_border_whitespaces(line) do
            line.strip.gsub(/[ \t]{2,}/, " ")
          end
          result << collapsed
        end
      end

      # Tightens the whitespace just inside delimiter pairs.
      #
      # Two passes are active:
      # 1. content enclosed by two or more tildes ("~~ x ~~" becomes "~~x~~"),
      #    falling back to the configured +tag_border+ where the match had no
      #    surrounding whitespace;
      # 2. content enclosed by square brackets ("[ x ]" becomes "[x]").
      #
      # The equivalent passes for asterisks and underscores are disabled and
      # kept below for reference.
      #
      # @param string [String]
      # @return [String] a new string
      def clean_tag_borders(string)
        # result = string.gsub(/\s?\*{2,}.*?\*{2,}\s?/) do |match|
        #   preserve_border_whitespaces(match, default_border: Coradoc::ReverseAdoc.config.tag_border) do
        #     match.strip.sub("** ", "**").sub(" **", "**")
        #   end
        # end

        # result = string.gsub(/\s?_{2,}.*?_{2,}\s?/) do |match|
        #   preserve_border_whitespaces(match, default_border: Coradoc::ReverseAdoc.config.tag_border) do
        #     match.strip.sub("__ ", "__").sub(" __", "__")
        #   end
        # end

        result = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |match|
          preserve_border_whitespaces(
            match, default_border: Coradoc::ReverseAdoc.config.tag_border
          ) do
            match.strip.sub("~~ ", "~~").sub(" ~~", "~~")
          end
        end

        result.gsub(/\s?\[.*?\]\s?/) do |match|
          preserve_border_whitespaces(match) do
            match.strip.sub("[ ", "[").sub(" ]", "]")
          end
        end
      end

      # Removes the single whitespace character between a closing
      # "**", "~~" or "__" and a following . ! ? ' or " character.
      #
      # @param string [String]
      # @return [String] a new string
      def clean_punctuation_characters(string)
        string.gsub(/(\*\*|~~|__)\s([.!?'"])/, "\\1\\2")
      end

      # Preprocesses HTML, rather than postprocessing the converted text.
      # Works on a copy, so +string+ itself is left untouched.
      #
      # @param string [String] HTML source
      # @return [String] the cleaned HTML
      def preprocess_word_html(string)
        clean_headings(scrub_whitespace(string.dup))
      end

      # Normalises whitespace in an HTML document.
      #
      # NOTE: modifies +string+ in place and returns that same object.
      #
      # @param string [String] unfrozen HTML source
      # @return [String] +string+, after modification
      def scrub_whitespace(string)
        # Spell every non-breaking space (entity or literal) the same way.
        string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, "&#xA0;")
        string.sub!(/^\A[[:space:]]+/m, "") # document leading whitespace
        string.sub!(/[[:space:]]+\z$/m, "") # document trailing whitespace
        # Trailing spaces on a line are squeezed to one space, not stripped.
        string.gsub!(/( +)$/, " ")
        string.gsub!(/\n\n\n\n/, "\n\n") # quadruple line breaks
        # string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs
        string
      end

      # Repairs heading elements in an HTML document.
      #
      # NOTE: modifies +string+ in place and returns that same object.
      #
      # @param string [String] unfrozen HTML source
      # @return [String] +string+, after modification
      def clean_headings(string)
        # Empty headings: unclear why LibreOffice inserts them, but they
        # need to go (each is replaced by a single space).
        string.gsub!(%r{<h([1-9])[^>]*></h\1>}, " ")
        # Superscripts: LibreOffice renders them as headings styled with
        # "vertical-align: super", so turn those back into <sup> elements.
        string.gsub!(%r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>},
                     "<sup>\\2</sup>")
        string
      end

      private

      # Re-attaches the leading and trailing whitespace of +string+ to the
      # value produced by the block.
      #
      # A whitespace-only +string+ is returned as is, without yielding. Where
      # a border has no whitespace, +options[:default_border]+ is used
      # instead, unless +string+ contains any of [ ] ( ) (part of a link), in
      # which case no extra space is added.
      #
      # @param string [String] the original text, borders included
      # @param options [Hash] +:default_border+ (String, default "")
      # @yieldreturn [String] the processed inner text
      # @return [String]
      def preserve_border_whitespaces(string, options = {})
        return string if /\A\s*\Z/.match?(string)

        default_border = options.fetch(:default_border, "")
        default_border = "" if /[\[(\])]/.match?(string)
        string_start = present_or_default(string[/\A\s*/], default_border)
        string_end = present_or_default(string[/\s*\Z/], default_border)
        string_start + yield + string_end
      end

      # @return [String] +string+, or +default+ when +string+ is nil or empty
      def present_or_default(string, default)
        return default if string.nil? || string.empty?

        string
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ require "tmpdir"
2
+
3
module Coradoc
  module ReverseAdoc
    # Holds the converter settings.
    #
    # Every option declared with .declare_option has a persistent value
    # (set through its writer) and may be overridden for the duration of a
    # #with block.
    class Config
      def initialize
        @unknown_tags = :pass_through
        @input_format = :html
        @mathml2asciimath = false
        @external_images = false

        # Destination to save file and images
        @destination = nil

        # Source of HTML
        # @sourcedir = nil

        # Image counter, assuming there are max 999 images
        @image_counter = 1
        # pad with 0s
        @image_counter_pattern = "%03d"

        @em_delimiter = "_".freeze
        @strong_delimiter = "*".freeze
        @inline_options = {}
        @tag_border = " ".freeze

        @split_sections = nil

        # Document width - used to compute table sizes.
        # This is an assumption for screen size in input document.
        # If column widths are specified in absolute values, then we
        # have to convert them to relative values, as AsciiDoc only
        # supports those.
        @doc_width = 1000

        # Plugin system
        @plugins = []

        # Debugging options
        @track_time = false
      end

      # Applies +options+ as temporary overrides while the block runs.
      #
      # The previous overrides are restored when the block finishes, even if
      # it raises, so a failed call cannot leak its options into later ones.
      #
      # @param options [Hash{Symbol => Object}] option name => temporary value
      # @return [Object] whatever the block returns
      def with(options = {})
        old_options = @inline_options
        @inline_options = options
        yield
      ensure
        @inline_options = old_options
      end

      # Defines a reader and a writer for +option+.
      #
      # The reader returns the temporary override from #with when one is
      # present, and the persistent value otherwise. Only +nil+ counts as
      # "no override", so an explicit +false+ override is honoured.
      #
      # @param option [Symbol] option name
      def self.declare_option(option)
        ivar = :"@#{option}"

        define_method(option) do
          inline = @inline_options[option]
          inline.nil? ? instance_variable_get(ivar) : inline
        end

        attr_writer option
      end

      declare_option :unknown_tags
      declare_option :tag_border
      declare_option :mathml2asciimath
      declare_option :external_images
      declare_option :destination
      declare_option :sourcedir
      declare_option :image_counter
      declare_option :image_counter_pattern
      declare_option :input_format
      declare_option :split_sections
      declare_option :doc_width
      declare_option :plugins
      declare_option :track_time
    end
  end
end