coradoc 1.1.8 → 2.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225) hide show
  1. checksums.yaml +4 -4
  2. data/.rspec +1 -1
  3. data/Rakefile +3 -12
  4. data/exe/coradoc +21 -2
  5. data/lib/coradoc/cli.rb +185 -91
  6. data/lib/coradoc/configurable.rb +527 -0
  7. data/lib/coradoc/coradoc.rb +463 -0
  8. data/lib/coradoc/core_model/annotation_block.rb +57 -0
  9. data/lib/coradoc/core_model/base.rb +172 -0
  10. data/lib/coradoc/core_model/bibliography.rb +41 -0
  11. data/lib/coradoc/core_model/bibliography_entry.rb +48 -0
  12. data/lib/coradoc/core_model/block.rb +63 -0
  13. data/lib/coradoc/core_model/children_content.rb +53 -0
  14. data/lib/coradoc/core_model/comment_block.rb +10 -0
  15. data/lib/coradoc/core_model/definition_item.rb +46 -0
  16. data/lib/coradoc/core_model/definition_list.rb +28 -0
  17. data/lib/coradoc/core_model/element_attribute.rb +26 -0
  18. data/lib/coradoc/core_model/example_block.rb +10 -0
  19. data/lib/coradoc/core_model/footnote.rb +92 -0
  20. data/lib/coradoc/core_model/horizontal_rule_block.rb +10 -0
  21. data/lib/coradoc/core_model/id_generator.rb +16 -0
  22. data/lib/coradoc/core_model/image.rb +66 -0
  23. data/lib/coradoc/core_model/inline_element.rb +140 -0
  24. data/lib/coradoc/core_model/list_block.rb +135 -0
  25. data/lib/coradoc/core_model/list_item.rb +142 -0
  26. data/lib/coradoc/core_model/listing_block.rb +13 -0
  27. data/lib/coradoc/core_model/literal_block.rb +10 -0
  28. data/lib/coradoc/core_model/metadata.rb +79 -0
  29. data/lib/coradoc/core_model/open_block.rb +10 -0
  30. data/lib/coradoc/core_model/paragraph_block.rb +10 -0
  31. data/lib/coradoc/core_model/pass_block.rb +10 -0
  32. data/lib/coradoc/core_model/quote_block.rb +12 -0
  33. data/lib/coradoc/core_model/reviewer_block.rb +10 -0
  34. data/lib/coradoc/core_model/sidebar_block.rb +10 -0
  35. data/lib/coradoc/core_model/source_block.rb +10 -0
  36. data/lib/coradoc/core_model/structural_element.rb +94 -0
  37. data/lib/coradoc/core_model/table.rb +148 -0
  38. data/lib/coradoc/core_model/term.rb +53 -0
  39. data/lib/coradoc/core_model/text_content.rb +22 -0
  40. data/lib/coradoc/core_model/toc.rb +105 -0
  41. data/lib/coradoc/core_model/toc_generator.rb +151 -0
  42. data/lib/coradoc/core_model/verse_block.rb +12 -0
  43. data/lib/coradoc/core_model.rb +77 -0
  44. data/lib/coradoc/document_builder.rb +184 -0
  45. data/lib/coradoc/document_manipulator.rb +203 -0
  46. data/lib/coradoc/errors.rb +312 -0
  47. data/lib/coradoc/format_module.rb +49 -0
  48. data/lib/coradoc/hooks.rb +176 -0
  49. data/lib/coradoc/input.rb +17 -7
  50. data/lib/coradoc/logger.rb +54 -0
  51. data/lib/coradoc/output.rb +17 -6
  52. data/lib/coradoc/performance_regression.rb +109 -0
  53. data/lib/coradoc/processor_registry.rb +50 -0
  54. data/lib/coradoc/query.rb +455 -0
  55. data/lib/coradoc/registry.rb +156 -0
  56. data/lib/coradoc/serializer/registry.rb +150 -0
  57. data/lib/coradoc/transform.rb +11 -0
  58. data/lib/coradoc/validation.rb +646 -0
  59. data/lib/coradoc/version.rb +1 -1
  60. data/lib/coradoc/visitor.rb +283 -0
  61. data/lib/coradoc.rb +40 -19
  62. metadata +67 -277
  63. data/.editorconfig +0 -15
  64. data/.envrc +0 -1
  65. data/.irbrc +0 -1
  66. data/.pryrc.sample +0 -1
  67. data/.rubocop.yml +0 -14
  68. data/.rubocop_todo.yml +0 -179
  69. data/CHANGELOG.md +0 -9
  70. data/CODE_OF_CONDUCT.md +0 -84
  71. data/Dockerfile +0 -19
  72. data/Gemfile +0 -16
  73. data/LICENSE.txt +0 -21
  74. data/Makefile +0 -35
  75. data/README.Docker.adoc +0 -57
  76. data/README.adoc +0 -119
  77. data/coradoc.gemspec +0 -40
  78. data/docker-compose.yml +0 -14
  79. data/exe/reverse_adoc +0 -81
  80. data/exe/w2a +0 -60
  81. data/flake.lock +0 -114
  82. data/flake.nix +0 -135
  83. data/lib/coradoc/converter.rb +0 -144
  84. data/lib/coradoc/document.rb +0 -77
  85. data/lib/coradoc/element/admonition.rb +0 -18
  86. data/lib/coradoc/element/attribute.rb +0 -36
  87. data/lib/coradoc/element/attribute_list.rb +0 -138
  88. data/lib/coradoc/element/audio.rb +0 -33
  89. data/lib/coradoc/element/author.rb +0 -24
  90. data/lib/coradoc/element/base.rb +0 -92
  91. data/lib/coradoc/element/bibliography.rb +0 -24
  92. data/lib/coradoc/element/bibliography_entry.rb +0 -24
  93. data/lib/coradoc/element/block/core.rb +0 -76
  94. data/lib/coradoc/element/block/example.rb +0 -23
  95. data/lib/coradoc/element/block/listing.rb +0 -21
  96. data/lib/coradoc/element/block/literal.rb +0 -21
  97. data/lib/coradoc/element/block/open.rb +0 -22
  98. data/lib/coradoc/element/block/pass.rb +0 -21
  99. data/lib/coradoc/element/block/quote.rb +0 -19
  100. data/lib/coradoc/element/block/reviewer_comment.rb +0 -19
  101. data/lib/coradoc/element/block/side.rb +0 -19
  102. data/lib/coradoc/element/block/sourcecode.rb +0 -21
  103. data/lib/coradoc/element/block.rb +0 -17
  104. data/lib/coradoc/element/break.rb +0 -11
  105. data/lib/coradoc/element/comment_block.rb +0 -22
  106. data/lib/coradoc/element/comment_line.rb +0 -18
  107. data/lib/coradoc/element/document_attributes.rb +0 -33
  108. data/lib/coradoc/element/header.rb +0 -22
  109. data/lib/coradoc/element/image/block_image.rb +0 -32
  110. data/lib/coradoc/element/image/core.rb +0 -58
  111. data/lib/coradoc/element/image/inline_image.rb +0 -12
  112. data/lib/coradoc/element/image.rb +0 -10
  113. data/lib/coradoc/element/include.rb +0 -18
  114. data/lib/coradoc/element/inline/anchor.rb +0 -19
  115. data/lib/coradoc/element/inline/attribute_reference.rb +0 -19
  116. data/lib/coradoc/element/inline/bold.rb +0 -25
  117. data/lib/coradoc/element/inline/cross_reference.rb +0 -46
  118. data/lib/coradoc/element/inline/footnote.rb +0 -24
  119. data/lib/coradoc/element/inline/hard_line_break.rb +0 -11
  120. data/lib/coradoc/element/inline/highlight.rb +0 -25
  121. data/lib/coradoc/element/inline/italic.rb +0 -25
  122. data/lib/coradoc/element/inline/link.rb +0 -42
  123. data/lib/coradoc/element/inline/monospace.rb +0 -25
  124. data/lib/coradoc/element/inline/quotation.rb +0 -20
  125. data/lib/coradoc/element/inline/small.rb +0 -19
  126. data/lib/coradoc/element/inline/span.rb +0 -37
  127. data/lib/coradoc/element/inline/subscript.rb +0 -20
  128. data/lib/coradoc/element/inline/superscript.rb +0 -20
  129. data/lib/coradoc/element/inline/underline.rb +0 -19
  130. data/lib/coradoc/element/inline.rb +0 -23
  131. data/lib/coradoc/element/list/core.rb +0 -51
  132. data/lib/coradoc/element/list/definition.rb +0 -29
  133. data/lib/coradoc/element/list/ordered.rb +0 -17
  134. data/lib/coradoc/element/list/unordered.rb +0 -17
  135. data/lib/coradoc/element/list.rb +0 -13
  136. data/lib/coradoc/element/list_item.rb +0 -98
  137. data/lib/coradoc/element/list_item_definition.rb +0 -32
  138. data/lib/coradoc/element/paragraph.rb +0 -37
  139. data/lib/coradoc/element/revision.rb +0 -27
  140. data/lib/coradoc/element/section.rb +0 -62
  141. data/lib/coradoc/element/table.rb +0 -91
  142. data/lib/coradoc/element/tag.rb +0 -19
  143. data/lib/coradoc/element/term.rb +0 -22
  144. data/lib/coradoc/element/text_element.rb +0 -92
  145. data/lib/coradoc/element/title.rb +0 -62
  146. data/lib/coradoc/element/video.rb +0 -50
  147. data/lib/coradoc/generator.rb +0 -19
  148. data/lib/coradoc/input/adoc.rb +0 -30
  149. data/lib/coradoc/input/docx.rb +0 -64
  150. data/lib/coradoc/input/html/LICENSE.txt +0 -25
  151. data/lib/coradoc/input/html/README.adoc +0 -308
  152. data/lib/coradoc/input/html/cleaner.rb +0 -142
  153. data/lib/coradoc/input/html/config.rb +0 -77
  154. data/lib/coradoc/input/html/converters/a.rb +0 -52
  155. data/lib/coradoc/input/html/converters/aside.rb +0 -16
  156. data/lib/coradoc/input/html/converters/audio.rb +0 -29
  157. data/lib/coradoc/input/html/converters/base.rb +0 -108
  158. data/lib/coradoc/input/html/converters/blockquote.rb +0 -22
  159. data/lib/coradoc/input/html/converters/br.rb +0 -15
  160. data/lib/coradoc/input/html/converters/bypass.rb +0 -81
  161. data/lib/coradoc/input/html/converters/code.rb +0 -23
  162. data/lib/coradoc/input/html/converters/div.rb +0 -19
  163. data/lib/coradoc/input/html/converters/dl.rb +0 -62
  164. data/lib/coradoc/input/html/converters/drop.rb +0 -26
  165. data/lib/coradoc/input/html/converters/em.rb +0 -21
  166. data/lib/coradoc/input/html/converters/figure.rb +0 -25
  167. data/lib/coradoc/input/html/converters/h.rb +0 -42
  168. data/lib/coradoc/input/html/converters/head.rb +0 -23
  169. data/lib/coradoc/input/html/converters/hr.rb +0 -15
  170. data/lib/coradoc/input/html/converters/ignore.rb +0 -20
  171. data/lib/coradoc/input/html/converters/img.rb +0 -110
  172. data/lib/coradoc/input/html/converters/li.rb +0 -17
  173. data/lib/coradoc/input/html/converters/mark.rb +0 -19
  174. data/lib/coradoc/input/html/converters/markup.rb +0 -31
  175. data/lib/coradoc/input/html/converters/math.rb +0 -38
  176. data/lib/coradoc/input/html/converters/ol.rb +0 -65
  177. data/lib/coradoc/input/html/converters/p.rb +0 -23
  178. data/lib/coradoc/input/html/converters/pass_through.rb +0 -17
  179. data/lib/coradoc/input/html/converters/pre.rb +0 -55
  180. data/lib/coradoc/input/html/converters/q.rb +0 -16
  181. data/lib/coradoc/input/html/converters/strong.rb +0 -20
  182. data/lib/coradoc/input/html/converters/sub.rb +0 -22
  183. data/lib/coradoc/input/html/converters/sup.rb +0 -22
  184. data/lib/coradoc/input/html/converters/table.rb +0 -319
  185. data/lib/coradoc/input/html/converters/td.rb +0 -81
  186. data/lib/coradoc/input/html/converters/text.rb +0 -32
  187. data/lib/coradoc/input/html/converters/th.rb +0 -18
  188. data/lib/coradoc/input/html/converters/tr.rb +0 -22
  189. data/lib/coradoc/input/html/converters/video.rb +0 -29
  190. data/lib/coradoc/input/html/converters.rb +0 -59
  191. data/lib/coradoc/input/html/errors.rb +0 -14
  192. data/lib/coradoc/input/html/html_converter.rb +0 -168
  193. data/lib/coradoc/input/html/plugin.rb +0 -131
  194. data/lib/coradoc/input/html/plugins/plateau.rb +0 -213
  195. data/lib/coradoc/input/html/postprocessor.rb +0 -220
  196. data/lib/coradoc/input/html.rb +0 -61
  197. data/lib/coradoc/legacy_parser.rb +0 -200
  198. data/lib/coradoc/oscal.rb +0 -99
  199. data/lib/coradoc/output/adoc.rb +0 -19
  200. data/lib/coradoc/output/coradoc_tree_debug.rb +0 -21
  201. data/lib/coradoc/parser/asciidoc/admonition.rb +0 -24
  202. data/lib/coradoc/parser/asciidoc/attribute_list.rb +0 -89
  203. data/lib/coradoc/parser/asciidoc/base.rb +0 -87
  204. data/lib/coradoc/parser/asciidoc/bibliography.rb +0 -29
  205. data/lib/coradoc/parser/asciidoc/block.rb +0 -94
  206. data/lib/coradoc/parser/asciidoc/citation.rb +0 -30
  207. data/lib/coradoc/parser/asciidoc/content.rb +0 -64
  208. data/lib/coradoc/parser/asciidoc/document_attributes.rb +0 -25
  209. data/lib/coradoc/parser/asciidoc/header.rb +0 -29
  210. data/lib/coradoc/parser/asciidoc/inline.rb +0 -195
  211. data/lib/coradoc/parser/asciidoc/list.rb +0 -115
  212. data/lib/coradoc/parser/asciidoc/paragraph.rb +0 -54
  213. data/lib/coradoc/parser/asciidoc/section.rb +0 -61
  214. data/lib/coradoc/parser/asciidoc/table.rb +0 -32
  215. data/lib/coradoc/parser/asciidoc/term.rb +0 -41
  216. data/lib/coradoc/parser/asciidoc/text.rb +0 -158
  217. data/lib/coradoc/parser/base.rb +0 -40
  218. data/lib/coradoc/parser.rb +0 -11
  219. data/lib/coradoc/reverse_adoc.rb +0 -18
  220. data/lib/coradoc/transformer.rb +0 -476
  221. data/lib/coradoc/util.rb +0 -12
  222. data/lib/reverse_adoc.rb +0 -20
  223. data/utils/inspect_asciidoc.rb +0 -29
  224. data/utils/parser_analyzer.rb +0 -66
  225. data/utils/round_trip.rb +0 -53
@@ -1,308 +0,0 @@
1
- = AsciiDoc from HTML and Microsoft Word: formerly reverse_adoc
2
-
3
- == Purpose
4
-
5
- Transforms HTML and Microsoft Word into AsciiDoc.
6
-
7
- Based on https://github.com/xijo/reverse_markdown
8
-
9
- reverse_adoc used to be a separate Gem, but now it's part of Coradoc.
10
-
11
-
12
- == Installation
13
-
14
- Install the gem:
15
-
16
- [source,console]
17
- ----
18
- [sudo] gem install coradoc
19
- ----
20
-
21
- or add it to your `Gemfile`:
22
-
23
- [source,ruby]
24
- ----
25
- gem 'coradoc'
26
- ----
27
-
28
-
29
- == Command-line usage
30
-
31
- === HTML to AsciiDoc: `reverse_adoc`
32
-
33
- Convert HTML files to AsciiDoc:
34
-
35
- [source,console]
36
- ----
37
- $ reverse_adoc file.html > file.adoc
38
- $ cat file.html | reverse_adoc > file.adoc
39
- ----
40
-
41
-
42
- === Microsoft Word to AsciiDoc: `w2a`
43
-
44
- Convert Word `.doc` or `.docx` files to AsciiDoc:
45
-
46
- [source,console]
47
- ----
48
- $ w2a file.docx > file.adoc
49
- ----
50
-
51
- [source,console]
52
- ----
53
- $ w2a input.docx -o output.adoc
54
- ----
55
-
56
- Help:
57
-
58
- [source,console]
59
- ----
60
- $ w2a -h
61
- Usage: w2a [options] <file>
62
- -a, --mathml2asciimath Convert MathML to AsciiMath
63
- -o, --output=FILENAME Output file to write to
64
- -e, --external-images Export images if data URI
65
- -v, --version Version information
66
- -h, --help Prints this help
67
- ----
68
-
69
-
70
- NOTE: `w2a` requires LibreOffice to be installed. It uses LibreOffice's
71
- export to XHTML. LibreOffice's export of XHTML is superior to the native Microsoft Word export
72
- to HTML: it exports lists (which Word keeps as paragraphs), and it exports OMML into MathML.
73
- On the other hand, the LibreOffice export relies on default styling being used in the
74
- document, and it may not cope with ordered lists or headings with customised appearance.
75
- For best results, reset the styles in the document you're converting to those in
76
- the default `Normal.dot` template.
77
-
78
- NOTE: `w2a` requires the command-line version of LibreOffice, `soffice`. As it turns out,
79
- LibreOffice v6 appears to render formulae in HTML as images instead of MathML expressions;
80
- use LibreOffice v5. If you have both LibreOffice v5 and LibreOffice v6 installed, make sure
81
- that your OS path searches for the LibreOffice v5 version of `soffice` first; e.g. on Mac,
82
- include something like `/Applications/LibreOffice5.4.7.2.app/Contents/MacOS` in your PATH
83
- environment.
84
-
85
- NOTE: Some information in OMML is not preserved in the export to MathML from LibreOffice;
86
- in particular, font shifts such as double-struck fonts.
87
- The LibreOffice exporter does seem to drop some text (possibly associated with
88
- MathML); use with caution.
89
-
90
- NOTE: Adapted from `w2m` of
91
- https://github.com/benbalter/word-to-markdown[Ben Balter's word-to-markdown]
92
-
93
-
94
- === Common options
95
-
96
-
97
- ==== MathML to AsciiMath conversion
98
-
99
- If you wish to convert the MathML in the document to AsciiMath, run the script with the
100
- `--mathml2asciimath` option:
101
-
102
- [source,console]
103
- ----
104
- $ w2a --mathml2asciimath document.docx > document.adoc
105
- ----
106
-
107
-
108
- ==== Extracting images
109
-
110
- Images referred to by the HTML can be extracted into the destination output folder by using:
111
-
112
- [source,console]
113
- ----
114
- $ reverse_adoc input.docx -o output/file.adoc -e
115
- $ reverse_adoc input.docx --output output/file.adoc --external-images
116
- ----
117
-
118
-
119
- Word embedded images can be extracted into the destination output folder by using:
120
-
121
- [source,console]
122
- ----
123
- $ w2a input.docx -o output/file.adoc -e
124
- $ w2a input.docx --output output/file.adoc --external-images
125
- ----
126
-
127
-
128
- ==== Handling unknown HTML tags
129
-
130
- The `--unknown_tags` option allows you to specify how to handle unknown tags
131
- (default `pass_through`).
132
-
133
- Valid options are:
134
-
135
- * `pass_through` - Include the unknown tag completely into the result
136
- * `drop` - Drop the unknown tag and its content
137
- * `bypass` - Ignore the unknown tag but try to convert its content
138
- * `raise` - Raise an error to let you know
139
-
140
-
141
- ==== Tagging of borders
142
-
143
- Specify how to handle tag borders with the option `--tag_border` (default `' '`).
144
-
145
- Valid options are:
146
-
147
- * `' '` - Add whitespace if there is none at tag borders.
148
- * `''` - Do not add whitespace.
149
-
150
-
151
- == Features
152
-
153
- === General
154
-
155
- `reverse_adoc` shares features as a port of `reverse_markdown`:
156
-
157
- * Module based -- if you miss a tag, just add it
158
- * Can deal with nested lists
159
- * Inline and block code is supported
160
- * Supports blockquote
161
-
162
- It supports the following HTML tags (these are supported by `reverse_markdown`):
163
-
164
- * `a`
165
- * `blockquote`
166
- * `br`
167
- * `code`, `tt` (added: `kbd`, `samp`, `var`)
168
- * `div`, `article`
169
- * `em`, `i` (added: `cite`)
170
- * `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, `hr`
171
- * `img`
172
- * `li`, `ol`, `ul` (added: `dir`)
173
- * `p`, `pre`
174
- * `strong`, `b`
175
- * `table`, `td`, `th`, `tr`
176
-
177
- [NOTE]
178
- ====
179
- * reverse_adoc does *not* support `del` or `strike`, because Asciidoctor does not out of the box.
180
- * As with reverse_markdown, `pre` is only treated as sourcecode if it is contained in a `div@class = highlight-` element, or has a `@brush` attribute naming the language (Confluence).
181
- * The gem does not support `p@align`, because Asciidoctor doesn't
182
- ====
183
-
184
- In addition, it supports:
185
-
186
- * `aside`
187
- * `audio`, `video` (with `@src` attributes)
188
- * `figure`, `figcaption`
189
- * `mark`
190
- * `q`
191
- * `sub`, `sup`
192
- * `@id` anchors
193
- * `blockquote@cite`
194
- * `img/@width`, `img/@height`
195
- * `ol/@style`, `ol/@start`, `ol/@reversed`, `ul/@type`
196
- * `td/@colspan`, `td/@rowspan`, `td/@align`, `td/@valign`
197
- * `table/caption`, `table/@width`, `table/@frame` (partial), `table/@rules` (partial)
198
- * Lists and paragraphs within cells
199
- ** Not tables within cells: Asciidoctor cannot deal with nested tables
200
-
201
- The gem does not support:
202
-
203
- * `col`, `colgroup`
204
- * `source`, `picture`
205
- * `bdi`, `bdo`, `ruby`, `rt`, `rp`, `wbr`
206
- * `frame`, `frameset`, `iframe`, `noframes`, `noscript`, `script`, `input`, `output`, `progress`
207
- * `map`, `canvas`, `dialog`, `embed`, `object`, `param`, `svg`, `track`
208
- * `fieldset`, `button`, `datalist`, `form`, `label`, `legend`, `menu`, `menulist`, `optgroup`, `option`, `select`, `textarea`
209
- * `big`, `dfn`, `font`, `s`, `small`, `span`, `strike`, `u`
210
- * `center`
211
- * `data`, `meter`
212
- * `del`, `ins`
213
- * `footer`, `header`, `main`, `nav`, `details`, `section`, `summary`, `template`
214
-
215
-
216
- === MathML support
217
-
218
- If you are using this gem in the context of https://www.metanorma.com[Metanorma],
219
- Metanorma AsciiDoc accepts MathML as a native mathematical format. So you do not need
220
- to convert the MathML to AsciiMath.
221
-
222
- The gem will optionally invoke the https://github.com/metanorma/mathml2asciimath
223
- gem, to convert MathML to AsciiMath. The conversion is not perfect, and will need to be
224
- post-edited; but it's a lot better than nothing.
225
-
226
- NOTE: Asciidoctor does not support MathML input. HTML uses MathML.
227
- The gem will recognize MathML expressions in HTML, and will wrap them in Asciidoctor
228
- `stem:[ ]` macros. The result of this gem is not actually legal Asciidoctor for `stem`:
229
- Asciidoctor will presumably
230
- think this is AsciiMath in the `stem:[ ]` macro, try to pass it into MathJax as
231
- AsciiMath, and fail. But of course, MathJax has no problem with MathML, and some postprocessing
232
- on the Asciidoctor output can ensure that the MathML is treated by MathJax (or whatever else
233
- uses the output) as such; so this is still much better than nothing for stem processing.
234
-
235
- === Word cleanup
236
-
237
- This gem is routinely used in the Metanorma project to export Word documents to AsciiDoc.
238
- The HTML export from Word that the gem uses, from LibreOffice, is much cleaner than the
239
- native HTML 4 export from Word; but it has some infelicities which this gem cleans up:
240
-
241
- * The HTML export has trouble with subscripts, and routinely exports them as headings; the `w2a`
242
- script tries to clean them up.
243
- * The `w2a` cleans up spaces, but it does not strip them.
244
- * Spaces are removed from anchors and cross-references.
245
- * Double underscores are removed from anchors and cross-references.
246
- * Cross-references to `_GoBack` and to `_Toc` followed by numbers (used to construct tables of contents) are ignored.
247
-
248
- == Ruby library usage
249
-
250
- === General
251
-
252
- Simple to use.
253
-
254
- [source,ruby]
255
- ----
256
- require 'coradoc/input/html'
257
-
258
- result = Coradoc::Input::HTML.convert input
259
- result.inspect # " *feelings* "
260
- ----
261
-
262
- === Configure with options
263
-
264
- Just pass your chosen configuration options in after the input. The given options will last for this operation only.
265
-
266
- [source,ruby]
267
- ----
268
- require 'coradoc/input/html'
269
-
270
- Coradoc::Input::HTML.convert(input, unknown_tags: :raise, mathml2asciimath: true)
271
- ----
272
-
273
-
274
- === Preconfigure using an initializer
275
-
276
- Or configure it block style at an initializer level. These configurations will last for all conversions until they are set to something different.
277
-
278
- [source,ruby]
279
- ----
280
- require 'coradoc/input/html'
281
-
282
- Coradoc::Input::HTML.config do |config|
283
- config.unknown_tags = :bypass
284
- config.mathml2asciimath = true
285
- config.tag_border = ''
286
- end
287
- ----
288
-
289
- === Convert HTML to a Coradoc AST
290
-
291
- [source,ruby]
292
- ----
293
- require 'coradoc/input/html'
294
-
295
- # Options can be supplied as keyword arguments
296
- Coradoc::Input::Html::HtmlConverter.to_coradoc("<b><i>Some input</i></b>")
297
- ----
298
-
299
-
300
- == Related stuff
301
-
302
- * https://github.com/xijo/reverse_markdown[Xijo's original reverse_markdown gem]
303
- * https://github.com/xijo/reverse_markdown/wiki/Write-your-own-converter[Write custom converters] - Wiki entry about how to write your own converter
304
- * https://github.com/harlantwood/html_massage[html_massage] - A gem by Harlan T. Wood to convert regular sites into markdown using reverse_markdown
305
- * https://github.com/benbalter/word-to-markdown[word-to-markdown] - Convert word docs into markdown while using reverse_markdown, by Ben Balter
306
- * https://github.com/asciidocfx/HtmlToAsciidoc[HtmlToAsciidoc] - Javascript regexp-based converter of HTML to Asciidoctor
307
- * https://asciidoctor.org/docs/user-manual/[The Asciidoctor User Manual]
308
-
@@ -1,142 +0,0 @@
1
- module Coradoc
2
- module Input
3
- module Html
4
- class Cleaner
5
- def tidy(string)
6
- if string.is_a? Hash
7
- return string.transform_values { |i| tidy(i) }
8
- end
9
-
10
- result = HtmlConverter.track_time "Removing inner whitespace" do
11
- remove_inner_whitespaces(String.new(string))
12
- end
13
- result = HtmlConverter.track_time "Removing newlines" do
14
- remove_newlines(result)
15
- end
16
- result = HtmlConverter.track_time "Removing leading newlines" do
17
- remove_leading_newlines(result)
18
- end
19
- result = HtmlConverter.track_time "Cleaning tag borders" do
20
- clean_tag_borders(result)
21
- end
22
- result = HtmlConverter.track_time "Cleaning punctuation characters" do
23
- clean_punctuation_characters(result)
24
- end
25
- result = remove_block_leading_newlines(result)
26
- result = remove_section_attribute_newlines(result)
27
- end
28
-
29
- def remove_block_leading_newlines(string)
30
- string.gsub("]\n****\n\n", "]\n****\n")
31
- end
32
-
33
- def remove_section_attribute_newlines(string)
34
- string.gsub("]\n\n==", "]\n==")
35
- end
36
-
37
- def remove_newlines(string)
38
- string.gsub(/\n{3,}/, "\n\n")
39
- end
40
-
41
- def remove_leading_newlines(string)
42
- string.gsub(/\A\n+/, "")
43
- end
44
-
45
- def remove_inner_whitespaces(string)
46
- unless string.nil?
47
- string.gsub!(/\n stem:\[/, "\nstem:[")
48
- string.gsub!(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
49
- string.gsub!(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")
50
- end
51
- result = +""
52
- string.each_line do |line|
53
- result << preserve_border_whitespaces(line) do
54
- line.strip.gsub(/[ \t]{2,}/, " ")
55
- end
56
- end
57
- result
58
- end
59
-
60
- # Find non-asterisk content that is enclosed by two or
61
- # more asterisks. Ensure that only one whitespace occurs
62
- # in the border area.
63
- # Same for underscores and brackets.
64
- def clean_tag_borders(string)
65
- # result = string.gsub(/\s?\*{2,}.*?\*{2,}\s?/) do |match|
66
- # preserve_border_whitespaces(match, default_border: Coradoc::Input::HTML.config.tag_border) do
67
- # match.strip.sub("** ", "**").sub(" **", "**")
68
- # end
69
- # end
70
-
71
- # result = string.gsub(/\s?_{2,}.*?_{2,}\s?/) do |match|
72
- # preserve_border_whitespaces(match, default_border: Coradoc::Input::HTML.config.tag_border) do
73
- # match.strip.sub("__ ", "__").sub(" __", "__")
74
- # end
75
- # end
76
-
77
- result = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |match|
78
- preserve_border_whitespaces(match,
79
- default_border: Coradoc::Input::Html.config.tag_border) do
80
- match.strip.sub("~~ ", "~~").sub(" ~~", "~~")
81
- end
82
- end
83
-
84
- result.gsub(/\s?\[.*?\]\s?/) do |match|
85
- preserve_border_whitespaces(match) do
86
- match.strip.sub("[ ", "[").sub(" ]", "]")
87
- end
88
- end
89
- end
90
-
91
- def clean_punctuation_characters(string)
92
- string.gsub(/(\*\*|~~|__)\s([.!?'"])/, "\\1\\2")
93
- end
94
-
95
- # preprocesses HTML, rather than postprocessing it
96
- def preprocess_word_html(string)
97
- clean_headings(scrub_whitespace(string.dup))
98
- end
99
-
100
- def scrub_whitespace(string)
101
- string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, "&#xA0;") # HTML encoded spaces
102
- string = Coradoc.strip_unicode(string) # Strip document-level leading and trailing whitespace
103
- string.gsub!(/( +)$/, " ") # line trailing whitespace
104
- string.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks
105
- # string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs
106
- string
107
- end
108
-
109
- # following added by me
110
- def clean_headings(string)
111
- string.gsub!(%r{<h([1-9])[^>]*></h\1>}, " ")
112
- # I don't know why Libre Office is inserting them, but they need to go
113
- string.gsub!(%r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>},
114
- "<sup>\\2</sup>")
115
- # I absolutely don't know why Libre Office is rendering superscripts as h1
116
- string
117
- end
118
-
119
- private
120
-
121
- def preserve_border_whitespaces(string, options = {})
122
- return string if /\A\s*\Z/.match?(string)
123
-
124
- default_border = options.fetch(:default_border, "")
125
- # If the string contains part of a link so the characters [,],(,)
126
- # then don't add any extra spaces
127
- default_border = "" if /[\[(\])]/.match?(string)
128
- string_start = present_or_default(string[/\A\s*/], default_border)
129
- string_end = present_or_default(string[/\s*\Z/], default_border)
130
- result = yield
131
- string_start + result + string_end
132
- end
133
-
134
- def present_or_default(string, default)
135
- return default if string.nil? || string.empty?
136
-
137
- string
138
- end
139
- end
140
- end
141
- end
142
- end
@@ -1,77 +0,0 @@
1
- require "tmpdir"
2
-
3
- module Coradoc
4
- module Input
5
- module Html
6
- class Config
7
- def initialize
8
- @unknown_tags = :pass_through
9
- @input_format = :html
10
- @mathml2asciimath = false
11
- @external_images = false
12
-
13
- # Destination to save file and images
14
- @destination = nil
15
-
16
- # Source of HTML
17
- # @sourcedir = nil
18
-
19
- # Image counter, assuming there are max 999 images
20
- @image_counter = 1
21
- # pad with 0s
22
- @image_counter_pattern = "%03d"
23
-
24
- @em_delimiter = "_".freeze
25
- @strong_delimiter = "*".freeze
26
- @inline_options = {}
27
- @tag_border = " ".freeze
28
-
29
- @split_sections = nil
30
-
31
- # Document width - used to compute table sizes.
32
- # This is an assumption for screen size in input document.
33
- # If column widths are specified in absolute values, then we
34
- # have to convert them to relative values, as AsciiDoc only
35
- # supports those.
36
- @doc_width = 1000
37
-
38
- # Plugin system
39
- @plugins = []
40
-
41
- # Debugging options
42
- @track_time = false
43
- end
44
-
45
- def with(options = {})
46
- old_options = @inline_options
47
- @inline_options = options
48
- result = yield
49
- @inline_options = old_options
50
- result
51
- end
52
-
53
- def self.declare_option(option)
54
- define_method(option) do
55
- @inline_options[option] || instance_variable_get(:"@#{option}")
56
- end
57
-
58
- attr_writer option
59
- end
60
-
61
- declare_option :unknown_tags
62
- declare_option :tag_border
63
- declare_option :mathml2asciimath
64
- declare_option :external_images
65
- declare_option :destination
66
- declare_option :sourcedir
67
- declare_option :image_counter
68
- declare_option :image_counter_pattern
69
- declare_option :input_format
70
- declare_option :split_sections
71
- declare_option :doc_width
72
- declare_option :plugins
73
- declare_option :track_time
74
- end
75
- end
76
- end
77
- end
@@ -1,52 +0,0 @@
1
- require "coradoc"
2
-
3
- module Coradoc
4
- module Input
5
- module Html
6
- module Converters
7
- class A < Base
8
- def to_coradoc(node, state = {})
9
- name = treat_children(node, state)
10
-
11
- href = node["href"]
12
- title = extract_title(node)
13
- id = node["id"] || node["name"]
14
-
15
- id = id&.gsub(/\s/, "")&.gsub(/__+/, "_")
16
- id = nil if id&.empty?
17
-
18
- return "" if /^_Toc\d+$|^_GoBack$/.match?(id)
19
-
20
- return Coradoc::Element::Inline::Anchor.new(id) if id
21
-
22
- if href.to_s.start_with?("#")
23
- href = href.sub(/^#/, "").gsub(/\s/, "").gsub(/__+/, "_")
24
- return Coradoc::Element::Inline::CrossReference.new(href, name)
25
- end
26
-
27
- return name if href.to_s.empty?
28
-
29
- ambigous_characters = /[\w.?&#=%;\[\u{ff}-\u{10ffff}]/
30
- if name&.strip == href
31
- name = ""
32
- right_constrain = textnode_after_start_with?(node,
33
- ambigous_characters)
34
- end
35
-
36
- out = []
37
- out << " " if textnode_before_end_with?(node, ambigous_characters)
38
- out << Coradoc::Element::Inline::Link.new(
39
- path: href,
40
- name: name.strip,
41
- title: title.strip,
42
- right_constrain: right_constrain,
43
- )
44
- out
45
- end
46
- end
47
-
48
- register :a, A.new
49
- end
50
- end
51
- end
52
- end
@@ -1,16 +0,0 @@
1
- module Coradoc
2
- module Input
3
- module Html
4
- module Converters
5
- class Aside < Base
6
- def to_coradoc(node, state = {})
7
- content = treat_children(node, state)
8
- Coradoc::Element::Block::Side.new(lines: content.lines)
9
- end
10
- end
11
-
12
- register :aside, Aside.new
13
- end
14
- end
15
- end
16
- end
@@ -1,29 +0,0 @@
1
- module Coradoc
2
- module Input
3
- module Html
4
- module Converters
5
- class Audio < Base
6
- def to_coradoc(node, _state = {})
7
- src = node["src"]
8
- id = node["id"]
9
- title = extract_title(node)
10
- attributes = Coradoc::Element::AttributeList.new
11
- options = options(node)
12
- attributes.add_named("options", options) if options.any?
13
- Coradoc::Element::Audio.new(title, id: id, src: src,
14
- attributes: attributes)
15
- end
16
-
17
- def options(node)
18
- autoplay = node["autoplay"]
19
- loop_attr = node["loop"]
20
- controls = node["controls"]
21
- [autoplay, loop_attr, controls].compact
22
- end
23
- end
24
-
25
- register :audio, Audio.new
26
- end
27
- end
28
- end
29
- end