coradoc 0.2.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/.docker/Dockerfile +1 -1
  3. data/.docker/docker-compose.yml +2 -2
  4. data/.editorconfig +15 -0
  5. data/CHANGELOG.md +4 -0
  6. data/README.md +4 -0
  7. data/Rakefile +10 -0
  8. data/coradoc.gemspec +11 -2
  9. data/exe/reverse_adoc +91 -0
  10. data/exe/w2a +72 -0
  11. data/lib/coradoc/document.rb +6 -6
  12. data/lib/coradoc/element/admonition.rb +8 -6
  13. data/lib/coradoc/element/attribute.rb +2 -2
  14. data/lib/coradoc/element/attribute_list.rb +94 -15
  15. data/lib/coradoc/element/audio.rb +14 -3
  16. data/lib/coradoc/element/author.rb +18 -14
  17. data/lib/coradoc/element/base.rb +69 -8
  18. data/lib/coradoc/element/block/core.rb +10 -6
  19. data/lib/coradoc/element/block/literal.rb +1 -1
  20. data/lib/coradoc/element/block/quote.rb +1 -1
  21. data/lib/coradoc/element/block/sourcecode.rb +2 -2
  22. data/lib/coradoc/element/break.rb +1 -1
  23. data/lib/coradoc/element/document_attributes.rb +6 -6
  24. data/lib/coradoc/element/header.rb +4 -2
  25. data/lib/coradoc/element/image/block_image.rb +13 -2
  26. data/lib/coradoc/element/image/core.rb +35 -5
  27. data/lib/coradoc/element/image/inline_image.rb +2 -2
  28. data/lib/coradoc/element/image.rb +0 -1
  29. data/lib/coradoc/element/inline/anchor.rb +4 -2
  30. data/lib/coradoc/element/inline/bold.rb +10 -4
  31. data/lib/coradoc/element/inline/cross_reference.rb +4 -2
  32. data/lib/coradoc/element/inline/hard_line_break.rb +1 -1
  33. data/lib/coradoc/element/inline/highlight.rb +12 -6
  34. data/lib/coradoc/element/inline/italic.rb +10 -4
  35. data/lib/coradoc/element/inline/link.rb +26 -10
  36. data/lib/coradoc/element/inline/monospace.rb +10 -4
  37. data/lib/coradoc/element/inline/quotation.rb +4 -1
  38. data/lib/coradoc/element/inline/subscript.rb +5 -2
  39. data/lib/coradoc/element/inline/superscript.rb +5 -2
  40. data/lib/coradoc/element/inline.rb +0 -1
  41. data/lib/coradoc/element/list/core.rb +10 -8
  42. data/lib/coradoc/element/list/definition.rb +19 -0
  43. data/lib/coradoc/element/list/ordered.rb +1 -1
  44. data/lib/coradoc/element/list/unordered.rb +1 -1
  45. data/lib/coradoc/element/list.rb +1 -1
  46. data/lib/coradoc/element/list_item.rb +9 -4
  47. data/lib/coradoc/element/list_item_definition.rb +32 -0
  48. data/lib/coradoc/element/paragraph.rb +5 -3
  49. data/lib/coradoc/element/revision.rb +20 -16
  50. data/lib/coradoc/element/section.rb +21 -4
  51. data/lib/coradoc/element/table.rb +36 -19
  52. data/lib/coradoc/element/text_element.rb +63 -17
  53. data/lib/coradoc/element/title.rb +27 -7
  54. data/lib/coradoc/element/video.rb +33 -6
  55. data/lib/coradoc/generator.rb +2 -2
  56. data/lib/coradoc/legacy_parser.rb +41 -41
  57. data/lib/coradoc/oscal.rb +2 -4
  58. data/lib/coradoc/parser/asciidoc/content.rb +15 -15
  59. data/lib/coradoc/parser/asciidoc/document_attributes.rb +1 -1
  60. data/lib/coradoc/parser/asciidoc/header.rb +6 -6
  61. data/lib/coradoc/parser/asciidoc/section.rb +1 -1
  62. data/lib/coradoc/reverse_adoc/LICENSE.txt +25 -0
  63. data/lib/coradoc/reverse_adoc/README.adoc +308 -0
  64. data/lib/coradoc/reverse_adoc/cleaner.rb +125 -0
  65. data/lib/coradoc/reverse_adoc/config.rb +73 -0
  66. data/lib/coradoc/reverse_adoc/converters/a.rb +47 -0
  67. data/lib/coradoc/reverse_adoc/converters/aside.rb +12 -0
  68. data/lib/coradoc/reverse_adoc/converters/audio.rb +25 -0
  69. data/lib/coradoc/reverse_adoc/converters/base.rb +104 -0
  70. data/lib/coradoc/reverse_adoc/converters/blockquote.rb +18 -0
  71. data/lib/coradoc/reverse_adoc/converters/br.rb +11 -0
  72. data/lib/coradoc/reverse_adoc/converters/bypass.rb +77 -0
  73. data/lib/coradoc/reverse_adoc/converters/code.rb +19 -0
  74. data/lib/coradoc/reverse_adoc/converters/div.rb +14 -0
  75. data/lib/coradoc/reverse_adoc/converters/dl.rb +55 -0
  76. data/lib/coradoc/reverse_adoc/converters/drop.rb +22 -0
  77. data/lib/coradoc/reverse_adoc/converters/em.rb +17 -0
  78. data/lib/coradoc/reverse_adoc/converters/figure.rb +21 -0
  79. data/lib/coradoc/reverse_adoc/converters/h.rb +38 -0
  80. data/lib/coradoc/reverse_adoc/converters/head.rb +19 -0
  81. data/lib/coradoc/reverse_adoc/converters/hr.rb +11 -0
  82. data/lib/coradoc/reverse_adoc/converters/ignore.rb +16 -0
  83. data/lib/coradoc/reverse_adoc/converters/img.rb +98 -0
  84. data/lib/coradoc/reverse_adoc/converters/li.rb +13 -0
  85. data/lib/coradoc/reverse_adoc/converters/mark.rb +15 -0
  86. data/lib/coradoc/reverse_adoc/converters/markup.rb +27 -0
  87. data/lib/coradoc/reverse_adoc/converters/math.rb +31 -0
  88. data/lib/coradoc/reverse_adoc/converters/ol.rb +60 -0
  89. data/lib/coradoc/reverse_adoc/converters/p.rb +19 -0
  90. data/lib/coradoc/reverse_adoc/converters/pass_through.rb +13 -0
  91. data/lib/coradoc/reverse_adoc/converters/pre.rb +51 -0
  92. data/lib/coradoc/reverse_adoc/converters/q.rb +12 -0
  93. data/lib/coradoc/reverse_adoc/converters/strong.rb +16 -0
  94. data/lib/coradoc/reverse_adoc/converters/sub.rb +18 -0
  95. data/lib/coradoc/reverse_adoc/converters/sup.rb +18 -0
  96. data/lib/coradoc/reverse_adoc/converters/table.rb +280 -0
  97. data/lib/coradoc/reverse_adoc/converters/td.rb +77 -0
  98. data/lib/coradoc/reverse_adoc/converters/text.rb +28 -0
  99. data/lib/coradoc/reverse_adoc/converters/th.rb +14 -0
  100. data/lib/coradoc/reverse_adoc/converters/tr.rb +18 -0
  101. data/lib/coradoc/reverse_adoc/converters/video.rb +25 -0
  102. data/lib/coradoc/reverse_adoc/converters.rb +53 -0
  103. data/lib/coradoc/reverse_adoc/errors.rb +10 -0
  104. data/lib/coradoc/reverse_adoc/html_converter.rb +150 -0
  105. data/lib/coradoc/reverse_adoc/plugin.rb +131 -0
  106. data/lib/coradoc/reverse_adoc/plugins/plateau.rb +174 -0
  107. data/lib/coradoc/reverse_adoc/postprocessor.rb +148 -0
  108. data/lib/coradoc/reverse_adoc.rb +30 -0
  109. data/lib/coradoc/transformer.rb +24 -14
  110. data/lib/coradoc/version.rb +1 -1
  111. data/lib/reverse_adoc.rb +20 -0
  112. metadata +184 -5
  113. data/lib/coradoc/element/inline/image.rb +0 -25
@@ -50,13 +50,13 @@ module Coradoc
50
50
  block_style("=")
51
51
  end
52
52
 
53
- def block_style(delimiter="*", repeater = 4)
53
+ def block_style(delimiter = "*", repeater = 4)
54
54
  block_title.maybe >>
55
- newline.maybe >>
56
- block_type.maybe >>
57
- str(delimiter).repeat(repeater).as(:delimiter) >> newline >>
58
- text_line.repeat(1).as(:lines) >>
59
- str(delimiter).repeat(repeater) >> newline
55
+ newline.maybe >>
56
+ block_type.maybe >>
57
+ str(delimiter).repeat(repeater).as(:delimiter) >> newline >>
58
+ text_line.repeat(1).as(:lines) >>
59
+ str(delimiter).repeat(repeater) >> newline
60
60
  end
61
61
 
62
62
  def block_type
@@ -65,7 +65,7 @@ module Coradoc
65
65
 
66
66
  def highlight
67
67
  text_id >> newline >>
68
- underline >> highlight_text >> newline
68
+ underline >> highlight_text >> newline
69
69
  end
70
70
 
71
71
  def underline
@@ -79,14 +79,14 @@ module Coradoc
79
79
  # Table
80
80
  def table
81
81
  block_title >>
82
- str("|===") >> line_ending >>
83
- table_row.repeat(1).as(:rows) >>
84
- str("|===") >> line_ending
82
+ str("|===") >> line_ending >>
83
+ table_row.repeat(1).as(:rows) >>
84
+ str("|===") >> line_ending
85
85
  end
86
86
 
87
87
  def table_row
88
- (literal_space? >> str("|") >> (cell_content | empty_cell_content)).
89
- repeat(1).as(:cols) >> line_ending
88
+ (literal_space? >> str("|") >> (cell_content | empty_cell_content))
89
+ .repeat(1).as(:cols) >> line_ending
90
90
  end
91
91
 
92
92
  def empty_cell_content
@@ -98,7 +98,7 @@ module Coradoc
98
98
  end
99
99
 
100
100
  def literal_space
101
- (match[' '] | match[' \t']).repeat(1)
101
+ (match[" "] | match[' \t']).repeat(1)
102
102
  end
103
103
 
104
104
  # Override
@@ -113,7 +113,7 @@ module Coradoc
113
113
  # Text
114
114
  def text_line
115
115
  (asciidoc_char_with_id.absent? | text_id) >> literal_space? >>
116
- text.as(:text) >> line_ending.as(:break)
116
+ text.as(:text) >> line_ending.as(:break)
117
117
  end
118
118
 
119
119
  def asciidoc_char
@@ -137,7 +137,7 @@ module Coradoc
137
137
 
138
138
  def glossary
139
139
  keyword.as(:key) >> str("::") >> space? >>
140
- text.as(:value) >> line_ending.as(:break)
140
+ text.as(:value) >> line_ending.as(:break)
141
141
  end
142
142
 
143
143
  def ordered_list
@@ -11,7 +11,7 @@ module Coradoc
11
11
 
12
12
  def document_attributes
13
13
  str(":") >> attribute_name.as(:key) >> str(":") >>
14
- space? >> attribute_value.as(:value) >> line_ending
14
+ space? >> attribute_value.as(:value) >> line_ending
15
15
  end
16
16
  end
17
17
  end
@@ -8,8 +8,8 @@ module Coradoc
8
8
 
9
9
  def header
10
10
  header_title >>
11
- author.maybe.as(:author) >>
12
- revision.maybe.as(:revision) >> newline.maybe
11
+ author.maybe.as(:author) >>
12
+ revision.maybe.as(:revision) >> newline.maybe
13
13
  end
14
14
 
15
15
  def header_title
@@ -18,14 +18,14 @@ module Coradoc
18
18
 
19
19
  def author
20
20
  words.as(:first_name) >> str(",") >>
21
- space? >> words.as(:last_name) >>
22
- space? >> str("<") >> email.as(:email) >> str(">") >> newline
21
+ space? >> words.as(:last_name) >>
22
+ space? >> str("<") >> email.as(:email) >> str(">") >> newline
23
23
  end
24
24
 
25
25
  def revision
26
26
  (word >> (str(".") >> word).maybe).as(:number) >>
27
- str(",") >> space? >> date.as(:date ) >> str(":") >>
28
- space? >> words.as(:remark) >> newline
27
+ str(",") >> space? >> date.as(:date) >> str(":") >>
28
+ space? >> words.as(:remark) >> newline
29
29
  end
30
30
  end
31
31
  end
@@ -23,7 +23,7 @@ module Coradoc
23
23
  # Heading
24
24
  def section_title(level = 2, max_level = 8)
25
25
  match("=").repeat(level, max_level).as(:level) >>
26
- space? >> text.as(:text) >> endline.as(:break)
26
+ space? >> text.as(:text) >> endline.as(:break)
27
27
  end
28
28
 
29
29
  # section
@@ -0,0 +1,25 @@
1
+ BSD 2-Clause License
2
+
3
+ Copyright (c) 2018, Ribose
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ * Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ * Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,308 @@
1
+ = AsciiDoc from HTML and Microsoft Word: reverse_adoc
2
+
3
+ == Purpose
4
+
5
+ Transforms HTML and Microsoft Word into AsciiDoc.
6
+
7
+ Based on https://github.com/xijo/reverse_markdown
8
+
9
+ reverse_adoc used to be a separate Gem, but now it's part of Coradoc.
10
+
11
+
12
+ == Installation
13
+
14
+ Install the gem:
15
+
16
+ [source,console]
17
+ ----
18
+ [sudo] gem install coradoc
19
+ ----
20
+
21
+ or add it to your `Gemfile`:
22
+
23
+ [source,ruby]
24
+ ----
25
+ gem 'coradoc'
26
+ ----
27
+
28
+
29
+ == Command-line usage
30
+
31
+ === HTML to AsciiDoc: `reverse_adoc`
32
+
33
+ Convert HTML files to AsciiDoc:
34
+
35
+ [source,console]
36
+ ----
37
+ $ reverse_adoc file.html > file.adoc
38
+ $ cat file.html | reverse_adoc > file.adoc
39
+ ----
40
+
41
+
42
+ === Microsoft Word to AsciiDoc: `w2a`
43
+
44
+ Convert Word `.doc` or `.docx` files to AsciiDoc:
45
+
46
+ [source,console]
47
+ ----
48
+ $ w2a file.docx > file.adoc
49
+ ----
50
+
51
+ [source,console]
52
+ ----
53
+ $ w2a input.docx -o output.adoc
54
+ ----
55
+
56
+ Help:
57
+
58
+ [source,console]
59
+ ----
60
+ $ w2a -h
61
+ Usage: w2a [options] <file>
62
+ -a, --mathml2asciimath Convert MathML to AsciiMath
63
+ -o, --output=FILENAME Output file to write to
64
+ -e, --external-images Export images if data URI
65
+ -v, --version Version information
66
+ -h, --help Prints this help
67
+ ----
68
+
69
+
70
+ NOTE: `w2a` requires LibreOffice to be installed. It uses LibreOffice's
71
+ export to XHTML. LibreOffice's export of XHTML is superior to the native Microsoft Word export
72
+ to HTML: it exports lists (which Word keeps as paragraphs), and it exports OOMML into MathML.
73
+ On the other hand, the LibreOffice export relies on default styling being used in the
74
+ document, and it may not cope with ordered lists or headings with customised appearance.
75
+ For best results, reset the styles in the document you're converting to those in
76
+ the default `Normal.dot` template.
77
+
78
+ NOTE: `w2a` requires the command-line version of LibreOffice, `soffice`. As it turns out,
79
+ LibreOffice v6 appears to render formulae in HTML as images instead of MathML expressions;
80
+ use LibreOffice v5. If you have both LibreOffice v5 and LibreOffice v6 installed, make sure
81
+ that your OS path searches for the LibreOffice v5 version of `soffice` first; e.g. on Mac,
82
+ include something like `/Applications/LibreOffice5.4.7.2.app/Contents/MacOS` in your PATH
83
+ environment.
84
+
85
+ NOTE: Some information in OOMML is not preserved in the export to MathML from LibreOffice;
86
+ in particular, font shifts such as double-struck fonts.
87
+ The LibreOffice exporter does seem to drop some text (possibly associated with
88
+ MathML); use with caution.
89
+
90
+ NOTE: Adapted from `w2m` of
91
+ https://github.com/benbalter/word-to-markdown[Ben Balter's word-to-markdown]
92
+
93
+
94
+ === Common options
95
+
96
+
97
+ ==== MathML to AsciiMath conversion
98
+
99
+ If you wish to convert the MathML in the document to AsciiMath, run the script with the
100
+ `--mathml2asciimath` option:
101
+
102
+ [source,console]
103
+ ----
104
+ $ w2a --mathml2asciimath document.docx > document.adoc
105
+ ----
106
+
107
+
108
+ ==== Extracting images
109
+
110
+ Images referenced by the HTML can be extracted into the destination output folder by using:
111
+
112
+ [source,console]
113
+ ----
114
+ $ reverse_adoc input.docx -o output/file.adoc -e
115
+ $ reverse_adoc input.docx --output output/file.adoc --external-images
116
+ ----
117
+
118
+
119
+ Word embedded images can be extracted into the destination output folder by using:
120
+
121
+ [source,console]
122
+ ----
123
+ $ w2a input.docx -o output/file.adoc -e
124
+ $ w2a input.docx --output output/file.adoc --external-images
125
+ ----
126
+
127
+
128
+ ==== Handling unknown HTML tags
129
+
130
+ The `--unknown_tags` option allows you to specify how to handle unknown tags
131
+ (default `pass_through`).
132
+
133
+ Valid options are:
134
+
135
+ * `pass_through` - Include the unknown tag completely into the result
136
+ * `drop` - Drop the unknown tag and its content
137
+ * `bypass` - Ignore the unknown tag but try to convert its content
138
+ * `raise` - Raise an error to let you know
139
+
140
+
141
+ ==== Tagging of borders
142
+
143
+ Specify how to handle tag borders with the option `--tag_border` (default `' '`).
144
+
145
+ Valid options are:
146
+
147
+ * `' '` - Add whitespace if there is none at tag borders.
148
+ * `''` - Do not add whitespace.
149
+
150
+
151
+ == Features
152
+
153
+ === General
154
+
155
+ As a port of `reverse_markdown`, `reverse_adoc` shares its general features:
156
+
157
+ * Module based -- if you miss a tag, just add it
158
+ * Can deal with nested lists
159
+ * Inline and block code is supported
160
+ * Supports blockquote
161
+
162
+ It supports the following HTML tags (these are supported by `reverse_markdown`):
163
+
164
+ * `a`
165
+ * `blockquote`
166
+ * `br`
167
+ * `code`, `tt` (added: `kbd`, `samp`, `var`)
168
+ * `div`, `article`
169
+ * `em`, `i` (added: `cite`)
170
+ * `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, `hr`
171
+ * `img`
172
+ * `li`, `ol`, `ul` (added: `dir`)
173
+ * `p`, `pre`
174
+ * `strong`, `b`
175
+ * `table`, `td`, `th`, `tr`
176
+
177
+ [NOTE]
178
+ ====
179
+ * reverse_adoc does *not* support `del` or `strike`, because Asciidoctor does not out of the box.
180
+ * As with reverse_markdown, `pre` is only treated as sourcecode if it is contained in a `div@class = highlight-` element, or has a `@brush` attribute naming the language (Confluence).
181
+ * The gem does not support `p@align`, because Asciidoctor doesn't
182
+ ====
183
+
184
+ In addition, it supports:
185
+
186
+ * `aside`
187
+ * `audio`, `video` (with `@src` attributes)
188
+ * `figure`, `figcaption`
189
+ * `mark`
190
+ * `q`
191
+ * `sub`, `sup`
192
+ * `@id` anchors
193
+ * `blockquote@cite`
194
+ * `img/@width`, `img/@height`
195
+ * `ol/@style`, `ol/@start`, `ol/@reversed`, `ul/@type`
196
+ * `td/@colspan`, `td/@rowspan`, `td/@align`, `td/@valign`
197
+ * `table/caption`, `table/@width`, `table/@frame` (partial), `table/@rules` (partial)
198
+ * Lists and paragraphs within cells
199
+ ** Not tables within cells: Asciidoctor cannot deal with nested tables
200
+
201
+ The gem does not support:
202
+
203
+ * `col`, `colgroup`
204
+ * `source`, `picture`
205
+ * `bdi`, `bdo`, `ruby`, `rt`, `rp`, `wbr`
206
+ * `frame`, `frameset`, `iframe`, `noframes`, `noscript`, `script`, `input`, `output`, `progress`
207
+ * `map`, `canvas`, `dialog`, `embed`, `object`, `param`, `svg`, `track`
208
+ * `fieldset`, `button`, `datalist`, `form`, `label`, `legend`, `menu`, `menulist`, `optgroup`, `option`, `select`, `textarea`
209
+ * `big`, `dfn`, `font`, `s`, `small`, `span`, `strike`, `u`
210
+ * `center`
211
+ * `data`, `meter`
212
+ * `del`, `ins`
213
+ * `footer`, `header`, `main`, `nav`, `details`, `section`, `summary`, `template`
214
+
215
+
216
+ === MathML support
217
+
218
+ If you are using this gem in the context of https://www.metanorma.com[Metanorma],
219
+ Metanorma AsciiDoc accepts MathML as a native mathematical format. So you do not need
220
+ to convert the MathML to AsciiMath.
221
+
222
+ The gem will optionally invoke the https://github.com/metanorma/mathml2asciimath
223
+ gem, to convert MathML to AsciiMath. The conversion is not perfect, and will need to be
224
+ post-edited; but it's a lot better than nothing.
225
+
226
+ NOTE: Asciidoctor does not support MathML input. HTML uses MathML.
227
+ The gem will recognize MathML expressions in HTML, and will wrap them in Asciidoctor
228
+ `stem:[ ]` macros. The result of this gem is not actually legal Asciidoctor for `stem`:
229
+ Asciidoctor will presumably
230
+ think this is AsciiMath in the `stem:[ ]` macro, try to pass it into MathJax as
231
+ AsciiMath, and fail. But of course, MathJax has no problem with MathML, and some postprocessing
232
+ on the Asciidoctor output can ensure that the MathML is treated by MathJax (or whatever else
233
+ uses the output) as such; so this is still much better than nothing for stem processing.
234
+
235
+ === Word cleanup
236
+
237
+ This gem is routinely used in the Metanorma project to export Word documents to AsciiDoc.
238
+ The HTML export from Word that the gem uses, from LibreOffice, is much cleaner than the
239
+ native HTML 4 export from Word; but it has some infelicities which this gem cleans up:
240
+
241
+ * The HTML export has trouble with superscripts, and routinely exports them as headings; the `w2a`
242
+ script tries to clean them up.
243
+ * The `w2a` script cleans up spaces, but it does not strip them.
244
+ * Spaces are removed from anchors and cross-references.
245
+ * Double underscores are removed from anchors and cross-references.
246
+ * Cross-references to `_GoBack` and to `_Toc` followed by numbers (used to construct tables of contents) are ignored.
247
+
248
+ == Ruby library usage
249
+
250
+ === General
251
+
252
+ Simple to use.
253
+
254
+ [source,ruby]
255
+ ----
256
+ require 'coradoc/reverse_adoc'
257
+
258
+ result = Coradoc::ReverseAdoc.convert input
259
+ result.inspect # " *feelings* "
260
+ ----
261
+
262
+ === Configure with options
263
+
264
+ Just pass your chosen configuration options in after the input. The given options will last for this operation only.
265
+
266
+ [source,ruby]
267
+ ----
268
+ require 'coradoc/reverse_adoc'
269
+
270
+ Coradoc::ReverseAdoc.convert(input, unknown_tags: :raise, mathml2asciimath: true)
271
+ ----
272
+
273
+
274
+ === Preconfigure using an initializer
275
+
276
+ Or configure it block-style at an initializer level. These configurations will last for all conversions until they are set to something different.
277
+
278
+ [source,ruby]
279
+ ----
280
+ require 'coradoc/reverse_adoc'
281
+
282
+ Coradoc::ReverseAdoc.config do |config|
283
+ config.unknown_tags = :bypass
284
+ config.mathml2asciimath = true
285
+ config.tag_border = ''
286
+ end
287
+ ----
288
+
289
+ === Convert HTML to a Coradoc AST
290
+
291
+ [source,ruby]
292
+ ----
293
+ require 'coradoc/reverse_adoc'
294
+
295
+ # Options can be supplied as keyword arguments
296
+ Coradoc::ReverseAdoc::HtmlConverter.to_coradoc("<b><i>Some input</i></b>")
297
+ ----
298
+
299
+
300
+ == Related stuff
301
+
302
+ * https://github.com/xijo/reverse_markdown[Xijo's original reverse_markdown gem]
303
+ * https://github.com/xijo/reverse_markdown/wiki/Write-your-own-converter[Write custom converters] - Wiki entry about how to write your own converter
304
+ * https://github.com/harlantwood/html_massage[html_massage] - A gem by Harlan T. Wood to convert regular sites into markdown using reverse_markdown
305
+ * https://github.com/benbalter/word-to-markdown[word-to-markdown] - Convert word docs into markdown while using reverse_markdown, by Ben Balter
306
+ * https://github.com/asciidocfx/HtmlToAsciidoc[HtmlToAsciidoc] - Javascript regexp-based converter of HTML to Asciidoctor
307
+ * https://asciidoctor.org/docs/user-manual/[The Asciidoctor User Manual]
308
+
@@ -0,0 +1,125 @@
1
# Nested module form: the file also loads when Coradoc has not been
# defined yet.
module Coradoc
  module ReverseAdoc
    # Whitespace and markup clean-up helpers for the reverse_adoc conversion.
    #
    # #tidy chains the individual clean-up passes over a piece of text, while
    # #preprocess_word_html cleans HTML input before it is converted.
    class Cleaner
      # Clean-up passes applied by #tidy, in order: timing label => method.
      TIDY_STEPS = {
        "Removing inner whitespace" => :remove_inner_whitespaces,
        "Removing newlines" => :remove_newlines,
        "Removing leading newlines" => :remove_leading_newlines,
        "Cleaning tag borders" => :clean_tag_borders,
        "Cleaning punctuation characters" => :clean_punctuation_characters,
      }.freeze

      # Runs every pass in TIDY_STEPS over a copy of +string+, wrapping each
      # pass in HtmlConverter.track_time with its label.
      #
      # @param string [String] text to clean; it is copied, not modified
      # @return [String] the cleaned text
      def tidy(string)
        TIDY_STEPS.reduce(String.new(string)) do |text, (label, step)|
          HtmlConverter.track_time(label) { public_send(step, text) }
        end
      end

      # Collapses runs of three or more newlines into exactly two.
      #
      # @param string [String]
      # @return [String] a new string
      def remove_newlines(string)
        string.gsub(/\n{3,}/, "\n\n")
      end

      # Drops all newlines at the very start of the text.
      #
      # @param string [String]
      # @return [String] a new string
      def remove_leading_newlines(string)
        string.gsub(/\A\n+/, "")
      end

      # Normalises whitespace line by line: runs of two or more spaces/tabs
      # inside a line become one space, while the whitespace at the start and
      # end of each line is kept. Whitespace around `stem:[...]` macros is
      # tightened first.
      #
      # The argument is never modified (frozen strings are accepted).
      #
      # @param string [String, nil]
      # @return [String] a new string; empty when +string+ is nil
      def remove_inner_whitespaces(string)
        return +"" if string.nil?

        # A stem macro that starts a line loses its single leading space.
        text = string.gsub(/\n stem:\[/, "\nstem:[")
        # A newline right after a stem macro becomes a space when text follows.
        text = text.gsub(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
        # No whitespace between a stem macro and a following "^" or "-".
        text = text.gsub(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

        text.each_line.with_object(+"") do |line, cleaned|
          cleaned << preserve_border_whitespaces(line) do
            line.strip.gsub(/[ \t]{2,}/, " ")
          end
        end
      end

      # Tidies the whitespace at the borders of `~~...~~` spans and of
      # bracketed `[...]` spans: whitespace directly inside the delimiters is
      # removed, and whitespace already present outside them is kept. For
      # `~~` spans with no outer whitespace, the configured `tag_border` is
      # inserted instead (unless the match contains bracket or parenthesis
      # characters).
      #
      # The equivalent passes for `**...**` and `__...__` are currently
      # disabled and kept below for reference.
      #
      # @param string [String]
      # @return [String] a new string
      def clean_tag_borders(string)
        # result = string.gsub(/\s?\*{2,}.*?\*{2,}\s?/) do |match|
        #   preserve_border_whitespaces(match, default_border: Coradoc::ReverseAdoc.config.tag_border) do
        #     match.strip.sub("** ", "**").sub(" **", "**")
        #   end
        # end

        # result = string.gsub(/\s?_{2,}.*?_{2,}\s?/) do |match|
        #   preserve_border_whitespaces(match, default_border: Coradoc::ReverseAdoc.config.tag_border) do
        #     match.strip.sub("__ ", "__").sub(" __", "__")
        #   end
        # end

        result = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |match|
          preserve_border_whitespaces(match,
                                      default_border: Coradoc::ReverseAdoc.config.tag_border) do
            match.strip.sub("~~ ", "~~").sub(" ~~", "~~")
          end
        end

        result.gsub(/\s?\[.*?\]\s?/) do |match|
          preserve_border_whitespaces(match) do
            match.strip.sub("[ ", "[").sub(" ]", "]")
          end
        end
      end

      # Removes the single whitespace character between a closing `**`, `~~`
      # or `__` and a following `.`, `!`, `?`, `'` or `"`.
      #
      # @param string [String]
      # @return [String] a new string
      def clean_punctuation_characters(string)
        string.gsub(/(\*\*|~~|__)\s([.!?'"])/, "\\1\\2")
      end

      # Preprocesses HTML (rather than postprocessing converted text):
      # scrubs whitespace and repairs headings on a copy of +string+.
      #
      # @param string [String] HTML source; not modified
      # @return [String] the cleaned copy
      def preprocess_word_html(string)
        clean_headings(scrub_whitespace(string.dup))
      end

      # Normalises whitespace in HTML source.
      #
      # Modifies +string+ in place and returns that same object.
      #
      # @param string [String] mutable HTML source
      # @return [String] +string+ itself
      def scrub_whitespace(string)
        string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, "&#xA0;") # HTML encoded spaces
        string.sub!(/\A[[:space:]]+/, "") # document leading whitespace
        string.sub!(/[[:space:]]+\z/, "") # document trailing whitespace
        # Trailing spaces on a line are collapsed to one space, not stripped.
        string.gsub!(/( +)$/, " ")
        string.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks
        # string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs
        string
      end

      # Repairs heading markup in HTML source.
      #
      # Modifies +string+ in place and returns that same object.
      #
      # @param string [String] mutable HTML source
      # @return [String] +string+ itself
      def clean_headings(string)
        # Empty headings are replaced by a single space: it is unclear why
        # LibreOffice inserts them, but they need to go.
        string.gsub!(%r{<h([1-9])[^>]*></h\1>}, " ")
        # Headings styled "vertical-align: super" are really superscripts
        # (LibreOffice renders them as headings), so turn them into <sup>.
        string.gsub!(%r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>},
                     "<sup>\\2</sup>")
        string
      end

      private

      # Re-attaches the leading and trailing whitespace of +string+ around
      # the value returned by the block.
      #
      # Whitespace-only input is returned unchanged without yielding. Where
      # +string+ has no whitespace on a side, options[:default_border]
      # (default "") is used there, unless +string+ contains any of the
      # characters [ ] ( ), in which case nothing is added.
      #
      # @param string [String] the original text
      # @param options [Hash] supports :default_border
      # @yieldreturn [String] the cleaned inner text
      # @return [String]
      def preserve_border_whitespaces(string, options = {})
        return string if /\A\s*\Z/.match?(string)

        default_border = options.fetch(:default_border, "")
        # If the string contains part of a link so the characters [,],(,)
        # then don't add any extra spaces
        default_border = "" if /[\[(\])]/.match?(string)
        string_start = present_or_default(string[/\A\s*/], default_border)
        string_end = present_or_default(string[/\s*\Z/], default_border)
        result = yield
        string_start + result + string_end
      end

      # @return [String] +string+, or +default+ when +string+ is nil or empty
      def present_or_default(string, default)
        return default if string.nil? || string.empty?

        string
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ require "tmpdir"
2
+
3
# Nested module form: the file also loads when Coradoc has not been
# defined yet.
module Coradoc
  module ReverseAdoc
    # Settings for the reverse_adoc conversion.
    #
    # Each declared option has a persistent value (set through its writer)
    # and can be overridden temporarily for the duration of a #with block.
    class Config
      def initialize
        @unknown_tags = :pass_through
        @input_format = :html
        @mathml2asciimath = false
        @external_images = false

        # Destination to save file and images
        @destination = nil

        # Source of HTML
        @sourcedir = nil

        # Image counter, assuming there are max 999 images
        @image_counter = 1
        # pad with 0s
        @image_counter_pattern = "%03d"

        # NOTE(review): no reader is declared for these two delimiters in
        # this class; confirm how they are meant to be consumed.
        @em_delimiter = "_".freeze
        @strong_delimiter = "*".freeze
        @inline_options = {}
        @tag_border = " ".freeze

        @split_sections = nil

        # Document width - used to compute table sizes.
        # This is an assumption for screen size in input document.
        # If column widths are specified in absolute values, then we
        # have to convert them to relative values, as AsciiDoc only
        # supports those.
        @doc_width = 1000

        # Plugin system
        @plugins = []

        # Debugging options
        @track_time = false
      end

      # Runs the block with +options+ as temporary overrides of the declared
      # options, then restores the overrides that were active before.
      #
      # The previous overrides are restored even when the block raises, so
      # per-call options never leak into later conversions.
      #
      # @param options [Hash{Symbol => Object}] option name => temporary value
      # @return [Object] whatever the block returns
      def with(options = {})
        old_options = @inline_options
        @inline_options = options
        yield
      ensure
        @inline_options = old_options
      end

      # Defines a reader and a writer for +option+.
      #
      # The reader returns the override given to #with when one is present
      # (an explicit +false+ counts; +nil+ does not) and the persistent value
      # otherwise. The writer sets the persistent value.
      #
      # @param option [Symbol] option name
      def self.declare_option(option)
        ivar = :"@#{option}"

        define_method(option) do
          inline = @inline_options[option]
          # Only nil means "not overridden": `||` would wrongly discard an
          # explicit false such as `mathml2asciimath: false`.
          inline.nil? ? instance_variable_get(ivar) : inline
        end

        attr_writer option
      end

      declare_option :unknown_tags
      declare_option :tag_border
      declare_option :mathml2asciimath
      declare_option :external_images
      declare_option :destination
      declare_option :sourcedir
      declare_option :image_counter
      declare_option :image_counter_pattern
      declare_option :input_format
      declare_option :split_sections
      declare_option :doc_width
      declare_option :plugins
      declare_option :track_time
    end
  end
end