coradoc 1.1.8 → 2.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225) hide show
  1. checksums.yaml +4 -4
  2. data/.rspec +1 -1
  3. data/Rakefile +3 -12
  4. data/exe/coradoc +21 -2
  5. data/lib/coradoc/cli.rb +185 -91
  6. data/lib/coradoc/configurable.rb +527 -0
  7. data/lib/coradoc/coradoc.rb +463 -0
  8. data/lib/coradoc/core_model/annotation_block.rb +57 -0
  9. data/lib/coradoc/core_model/base.rb +172 -0
  10. data/lib/coradoc/core_model/bibliography.rb +41 -0
  11. data/lib/coradoc/core_model/bibliography_entry.rb +48 -0
  12. data/lib/coradoc/core_model/block.rb +63 -0
  13. data/lib/coradoc/core_model/children_content.rb +53 -0
  14. data/lib/coradoc/core_model/comment_block.rb +10 -0
  15. data/lib/coradoc/core_model/definition_item.rb +46 -0
  16. data/lib/coradoc/core_model/definition_list.rb +28 -0
  17. data/lib/coradoc/core_model/element_attribute.rb +26 -0
  18. data/lib/coradoc/core_model/example_block.rb +10 -0
  19. data/lib/coradoc/core_model/footnote.rb +92 -0
  20. data/lib/coradoc/core_model/horizontal_rule_block.rb +10 -0
  21. data/lib/coradoc/core_model/id_generator.rb +16 -0
  22. data/lib/coradoc/core_model/image.rb +66 -0
  23. data/lib/coradoc/core_model/inline_element.rb +140 -0
  24. data/lib/coradoc/core_model/list_block.rb +135 -0
  25. data/lib/coradoc/core_model/list_item.rb +142 -0
  26. data/lib/coradoc/core_model/listing_block.rb +13 -0
  27. data/lib/coradoc/core_model/literal_block.rb +10 -0
  28. data/lib/coradoc/core_model/metadata.rb +79 -0
  29. data/lib/coradoc/core_model/open_block.rb +10 -0
  30. data/lib/coradoc/core_model/paragraph_block.rb +10 -0
  31. data/lib/coradoc/core_model/pass_block.rb +10 -0
  32. data/lib/coradoc/core_model/quote_block.rb +12 -0
  33. data/lib/coradoc/core_model/reviewer_block.rb +10 -0
  34. data/lib/coradoc/core_model/sidebar_block.rb +10 -0
  35. data/lib/coradoc/core_model/source_block.rb +10 -0
  36. data/lib/coradoc/core_model/structural_element.rb +94 -0
  37. data/lib/coradoc/core_model/table.rb +148 -0
  38. data/lib/coradoc/core_model/term.rb +53 -0
  39. data/lib/coradoc/core_model/text_content.rb +22 -0
  40. data/lib/coradoc/core_model/toc.rb +105 -0
  41. data/lib/coradoc/core_model/toc_generator.rb +151 -0
  42. data/lib/coradoc/core_model/verse_block.rb +12 -0
  43. data/lib/coradoc/core_model.rb +77 -0
  44. data/lib/coradoc/document_builder.rb +184 -0
  45. data/lib/coradoc/document_manipulator.rb +203 -0
  46. data/lib/coradoc/errors.rb +312 -0
  47. data/lib/coradoc/format_module.rb +49 -0
  48. data/lib/coradoc/hooks.rb +176 -0
  49. data/lib/coradoc/input.rb +17 -7
  50. data/lib/coradoc/logger.rb +54 -0
  51. data/lib/coradoc/output.rb +17 -6
  52. data/lib/coradoc/performance_regression.rb +109 -0
  53. data/lib/coradoc/processor_registry.rb +50 -0
  54. data/lib/coradoc/query.rb +455 -0
  55. data/lib/coradoc/registry.rb +156 -0
  56. data/lib/coradoc/serializer/registry.rb +150 -0
  57. data/lib/coradoc/transform.rb +11 -0
  58. data/lib/coradoc/validation.rb +646 -0
  59. data/lib/coradoc/version.rb +1 -1
  60. data/lib/coradoc/visitor.rb +283 -0
  61. data/lib/coradoc.rb +40 -19
  62. metadata +67 -277
  63. data/.editorconfig +0 -15
  64. data/.envrc +0 -1
  65. data/.irbrc +0 -1
  66. data/.pryrc.sample +0 -1
  67. data/.rubocop.yml +0 -14
  68. data/.rubocop_todo.yml +0 -179
  69. data/CHANGELOG.md +0 -9
  70. data/CODE_OF_CONDUCT.md +0 -84
  71. data/Dockerfile +0 -19
  72. data/Gemfile +0 -16
  73. data/LICENSE.txt +0 -21
  74. data/Makefile +0 -35
  75. data/README.Docker.adoc +0 -57
  76. data/README.adoc +0 -119
  77. data/coradoc.gemspec +0 -40
  78. data/docker-compose.yml +0 -14
  79. data/exe/reverse_adoc +0 -81
  80. data/exe/w2a +0 -60
  81. data/flake.lock +0 -114
  82. data/flake.nix +0 -135
  83. data/lib/coradoc/converter.rb +0 -144
  84. data/lib/coradoc/document.rb +0 -77
  85. data/lib/coradoc/element/admonition.rb +0 -18
  86. data/lib/coradoc/element/attribute.rb +0 -36
  87. data/lib/coradoc/element/attribute_list.rb +0 -138
  88. data/lib/coradoc/element/audio.rb +0 -33
  89. data/lib/coradoc/element/author.rb +0 -24
  90. data/lib/coradoc/element/base.rb +0 -92
  91. data/lib/coradoc/element/bibliography.rb +0 -24
  92. data/lib/coradoc/element/bibliography_entry.rb +0 -24
  93. data/lib/coradoc/element/block/core.rb +0 -76
  94. data/lib/coradoc/element/block/example.rb +0 -23
  95. data/lib/coradoc/element/block/listing.rb +0 -21
  96. data/lib/coradoc/element/block/literal.rb +0 -21
  97. data/lib/coradoc/element/block/open.rb +0 -22
  98. data/lib/coradoc/element/block/pass.rb +0 -21
  99. data/lib/coradoc/element/block/quote.rb +0 -19
  100. data/lib/coradoc/element/block/reviewer_comment.rb +0 -19
  101. data/lib/coradoc/element/block/side.rb +0 -19
  102. data/lib/coradoc/element/block/sourcecode.rb +0 -21
  103. data/lib/coradoc/element/block.rb +0 -17
  104. data/lib/coradoc/element/break.rb +0 -11
  105. data/lib/coradoc/element/comment_block.rb +0 -22
  106. data/lib/coradoc/element/comment_line.rb +0 -18
  107. data/lib/coradoc/element/document_attributes.rb +0 -33
  108. data/lib/coradoc/element/header.rb +0 -22
  109. data/lib/coradoc/element/image/block_image.rb +0 -32
  110. data/lib/coradoc/element/image/core.rb +0 -58
  111. data/lib/coradoc/element/image/inline_image.rb +0 -12
  112. data/lib/coradoc/element/image.rb +0 -10
  113. data/lib/coradoc/element/include.rb +0 -18
  114. data/lib/coradoc/element/inline/anchor.rb +0 -19
  115. data/lib/coradoc/element/inline/attribute_reference.rb +0 -19
  116. data/lib/coradoc/element/inline/bold.rb +0 -25
  117. data/lib/coradoc/element/inline/cross_reference.rb +0 -46
  118. data/lib/coradoc/element/inline/footnote.rb +0 -24
  119. data/lib/coradoc/element/inline/hard_line_break.rb +0 -11
  120. data/lib/coradoc/element/inline/highlight.rb +0 -25
  121. data/lib/coradoc/element/inline/italic.rb +0 -25
  122. data/lib/coradoc/element/inline/link.rb +0 -42
  123. data/lib/coradoc/element/inline/monospace.rb +0 -25
  124. data/lib/coradoc/element/inline/quotation.rb +0 -20
  125. data/lib/coradoc/element/inline/small.rb +0 -19
  126. data/lib/coradoc/element/inline/span.rb +0 -37
  127. data/lib/coradoc/element/inline/subscript.rb +0 -20
  128. data/lib/coradoc/element/inline/superscript.rb +0 -20
  129. data/lib/coradoc/element/inline/underline.rb +0 -19
  130. data/lib/coradoc/element/inline.rb +0 -23
  131. data/lib/coradoc/element/list/core.rb +0 -51
  132. data/lib/coradoc/element/list/definition.rb +0 -29
  133. data/lib/coradoc/element/list/ordered.rb +0 -17
  134. data/lib/coradoc/element/list/unordered.rb +0 -17
  135. data/lib/coradoc/element/list.rb +0 -13
  136. data/lib/coradoc/element/list_item.rb +0 -98
  137. data/lib/coradoc/element/list_item_definition.rb +0 -32
  138. data/lib/coradoc/element/paragraph.rb +0 -37
  139. data/lib/coradoc/element/revision.rb +0 -27
  140. data/lib/coradoc/element/section.rb +0 -62
  141. data/lib/coradoc/element/table.rb +0 -91
  142. data/lib/coradoc/element/tag.rb +0 -19
  143. data/lib/coradoc/element/term.rb +0 -22
  144. data/lib/coradoc/element/text_element.rb +0 -92
  145. data/lib/coradoc/element/title.rb +0 -62
  146. data/lib/coradoc/element/video.rb +0 -50
  147. data/lib/coradoc/generator.rb +0 -19
  148. data/lib/coradoc/input/adoc.rb +0 -30
  149. data/lib/coradoc/input/docx.rb +0 -64
  150. data/lib/coradoc/input/html/LICENSE.txt +0 -25
  151. data/lib/coradoc/input/html/README.adoc +0 -308
  152. data/lib/coradoc/input/html/cleaner.rb +0 -142
  153. data/lib/coradoc/input/html/config.rb +0 -77
  154. data/lib/coradoc/input/html/converters/a.rb +0 -52
  155. data/lib/coradoc/input/html/converters/aside.rb +0 -16
  156. data/lib/coradoc/input/html/converters/audio.rb +0 -29
  157. data/lib/coradoc/input/html/converters/base.rb +0 -108
  158. data/lib/coradoc/input/html/converters/blockquote.rb +0 -22
  159. data/lib/coradoc/input/html/converters/br.rb +0 -15
  160. data/lib/coradoc/input/html/converters/bypass.rb +0 -81
  161. data/lib/coradoc/input/html/converters/code.rb +0 -23
  162. data/lib/coradoc/input/html/converters/div.rb +0 -19
  163. data/lib/coradoc/input/html/converters/dl.rb +0 -62
  164. data/lib/coradoc/input/html/converters/drop.rb +0 -26
  165. data/lib/coradoc/input/html/converters/em.rb +0 -21
  166. data/lib/coradoc/input/html/converters/figure.rb +0 -25
  167. data/lib/coradoc/input/html/converters/h.rb +0 -42
  168. data/lib/coradoc/input/html/converters/head.rb +0 -23
  169. data/lib/coradoc/input/html/converters/hr.rb +0 -15
  170. data/lib/coradoc/input/html/converters/ignore.rb +0 -20
  171. data/lib/coradoc/input/html/converters/img.rb +0 -110
  172. data/lib/coradoc/input/html/converters/li.rb +0 -17
  173. data/lib/coradoc/input/html/converters/mark.rb +0 -19
  174. data/lib/coradoc/input/html/converters/markup.rb +0 -31
  175. data/lib/coradoc/input/html/converters/math.rb +0 -38
  176. data/lib/coradoc/input/html/converters/ol.rb +0 -65
  177. data/lib/coradoc/input/html/converters/p.rb +0 -23
  178. data/lib/coradoc/input/html/converters/pass_through.rb +0 -17
  179. data/lib/coradoc/input/html/converters/pre.rb +0 -55
  180. data/lib/coradoc/input/html/converters/q.rb +0 -16
  181. data/lib/coradoc/input/html/converters/strong.rb +0 -20
  182. data/lib/coradoc/input/html/converters/sub.rb +0 -22
  183. data/lib/coradoc/input/html/converters/sup.rb +0 -22
  184. data/lib/coradoc/input/html/converters/table.rb +0 -319
  185. data/lib/coradoc/input/html/converters/td.rb +0 -81
  186. data/lib/coradoc/input/html/converters/text.rb +0 -32
  187. data/lib/coradoc/input/html/converters/th.rb +0 -18
  188. data/lib/coradoc/input/html/converters/tr.rb +0 -22
  189. data/lib/coradoc/input/html/converters/video.rb +0 -29
  190. data/lib/coradoc/input/html/converters.rb +0 -59
  191. data/lib/coradoc/input/html/errors.rb +0 -14
  192. data/lib/coradoc/input/html/html_converter.rb +0 -168
  193. data/lib/coradoc/input/html/plugin.rb +0 -131
  194. data/lib/coradoc/input/html/plugins/plateau.rb +0 -213
  195. data/lib/coradoc/input/html/postprocessor.rb +0 -220
  196. data/lib/coradoc/input/html.rb +0 -61
  197. data/lib/coradoc/legacy_parser.rb +0 -200
  198. data/lib/coradoc/oscal.rb +0 -99
  199. data/lib/coradoc/output/adoc.rb +0 -19
  200. data/lib/coradoc/output/coradoc_tree_debug.rb +0 -21
  201. data/lib/coradoc/parser/asciidoc/admonition.rb +0 -24
  202. data/lib/coradoc/parser/asciidoc/attribute_list.rb +0 -89
  203. data/lib/coradoc/parser/asciidoc/base.rb +0 -87
  204. data/lib/coradoc/parser/asciidoc/bibliography.rb +0 -29
  205. data/lib/coradoc/parser/asciidoc/block.rb +0 -94
  206. data/lib/coradoc/parser/asciidoc/citation.rb +0 -30
  207. data/lib/coradoc/parser/asciidoc/content.rb +0 -64
  208. data/lib/coradoc/parser/asciidoc/document_attributes.rb +0 -25
  209. data/lib/coradoc/parser/asciidoc/header.rb +0 -29
  210. data/lib/coradoc/parser/asciidoc/inline.rb +0 -195
  211. data/lib/coradoc/parser/asciidoc/list.rb +0 -115
  212. data/lib/coradoc/parser/asciidoc/paragraph.rb +0 -54
  213. data/lib/coradoc/parser/asciidoc/section.rb +0 -61
  214. data/lib/coradoc/parser/asciidoc/table.rb +0 -32
  215. data/lib/coradoc/parser/asciidoc/term.rb +0 -41
  216. data/lib/coradoc/parser/asciidoc/text.rb +0 -158
  217. data/lib/coradoc/parser/base.rb +0 -40
  218. data/lib/coradoc/parser.rb +0 -11
  219. data/lib/coradoc/reverse_adoc.rb +0 -18
  220. data/lib/coradoc/transformer.rb +0 -476
  221. data/lib/coradoc/util.rb +0 -12
  222. data/lib/reverse_adoc.rb +0 -20
  223. data/utils/inspect_asciidoc.rb +0 -29
  224. data/utils/parser_analyzer.rb +0 -66
  225. data/utils/round_trip.rb +0 -53
@@ -1,213 +0,0 @@
1
- module Coradoc
2
- module Input
3
- module Html
4
- class Plugin
5
- # This plugin enhances documents from the PLATEAU project
6
- # to extract more data.
7
- #
8
- # Usage:
9
- # reverse_adoc -rcoradoc/input/html/plugins/plateau
10
- # --external-images -u raise --output _out/index.adoc index.html
11
- class Plateau < Plugin
12
- def name
13
- "PLATEAU"
14
- end
15
-
16
- def preprocess_html_tree
17
- # Let's simplify the tree by removing what's extraneous
18
- # html_tree_remove_by_css("script, style, img.container_imagebox:not([src])")
19
- # html_tree_replace_with_children_by_css("div.container_box")
20
- # html_tree_replace_with_children_by_css("div.col.col-12")
21
- # html_tree_replace_with_children_by_css(".tabledatatext, .tabledatatextY")
22
- # html_tree_replace_with_children_by_css("div.row")
23
- #
24
- # We can remove that, but it messes up the images and paragraphs.
25
-
26
- # Remove side menu, so we can generate TOC ourselves
27
- html_tree_remove_by_css(".sideMenu")
28
-
29
- # Correct non-semantic classes into semantic HTML tags
30
- html_tree_change_tag_name_by_css(".titledata", "h1")
31
- html_tree_change_tag_name_by_css(".subtitledata", "h2")
32
- html_tree_change_tag_name_by_css(".pitemdata", "h3")
33
- html_tree_change_tag_name_by_css(".sitemdata", "h4")
34
- html_tree_change_tag_name_by_css('td[bgcolor="#D0CECE"]', "th")
35
- html_tree_change_tag_name_by_css('td[bgcolor="#d0cece"]', "th")
36
- html_tree_change_tag_name_by_css(".framedata, .frame_container_box",
37
- "aside")
38
- html_tree_change_tag_name_by_css(".frame2data", "pre")
39
- # Assumption that all code snippets in those documents are XML...
40
- html_tree_change_properties_by_css(".frame2data",
41
- class: "brush:xml;")
42
-
43
- # Remove some CSS ids that are not important to us
44
- html_tree_change_properties_by_css("#__nuxt", id: nil)
45
- html_tree_change_properties_by_css("#__layout", id: nil)
46
- html_tree_change_properties_by_css("#app", id: nil)
47
-
48
- # Handle lists of document 02
49
- html_tree_replace_with_children_by_css(".list_num-wrap")
50
-
51
- # Convert table/img caption to become a caption
52
- html_tree.css(".imagedata").each do |e|
53
- table = e.parent.next&.children&.first
54
- if table&.name == "table"
55
- e.name = "caption"
56
- table.prepend_child(e)
57
- next
58
- end
59
-
60
- img = e.parent.previous&.children&.first
61
- if img&.name == "img" && img["src"]
62
- title = e.text.strip
63
- img["title"] = title
64
- e.remove
65
- next
66
- end
67
- end
68
-
69
- # Add hooks for H1, H2, H3, H4
70
- html_tree_add_hook_post_by_css("h1, h2, h3",
71
- &method(:handle_headers))
72
- html_tree_add_hook_post_by_css("h4", &method(:handle_headers_h4))
73
-
74
- # Table cells aligned to center
75
- html_tree_change_properties_by_css(".tableTopCenter",
76
- align: "center")
77
-
78
- # Handle non-semantic lists and indentation
79
- html_tree_add_hook_pre_by_css ".text2data" do |node,|
80
- text = html_tree_process_to_adoc(node).strip
81
- next "" if text.empty? || text == "\u3000"
82
-
83
- if text.start_with?(/\d+\./)
84
- text = text.sub(/\A\d+.\s*/, "")
85
- ".. #{text}\n"
86
- else
87
- text = text.gsub(/^/, "** ")
88
- "\n\n//-PT2D\n#{text}\n//-ENDPT2D\n\n"
89
- end
90
- end
91
-
92
- (3..4).each do |i|
93
- html_tree_add_hook_pre_by_css ".text#{i}data" do |node,|
94
- text = html_tree_process_to_adoc(node).strip
95
- next "" if text.empty? || text == "\u3000"
96
-
97
- text = text.strip.gsub(/^/, "#{'*' * i} ")
98
- "\n\n//-PT#{i}D\n#{text}\n//-ENDPT#{i}D\n\n"
99
- end
100
- end
101
-
102
- (2..3).each do |i|
103
- html_tree_add_hook_pre_by_css ".text#{i}data_point ul" do |node,|
104
- text = html_tree_process_to_adoc(node.children.first.children).strip
105
-
106
- "#{'*' * i} #{text}\n"
107
- end
108
- end
109
-
110
- (1..20).each do |i|
111
- html_tree_add_hook_pre_by_css ".numtextdata_num .list_num#{i}" do |node,|
112
- text = html_tree_process_to_adoc(node).strip
113
-
114
- "[start=#{i}]\n. #{text}\n"
115
- end
116
- end
117
-
118
- # html_tree_preview
119
- end
120
-
121
- IM = /[A-Z0-9]{1,3}/
122
-
123
- def handle_headers(node, coradoc, _state)
124
- content = coradoc.content.map(&:content).join
125
-
126
- if %w[toc0 toc_0].any? { |i| coradoc.id&.start_with?(i) }
127
- # Special content
128
- case content.strip
129
- when "はじめに" # Introduction
130
- coradoc.style = "abstract" # The older version document has ".preface"
131
- coradoc.level_int = 1
132
- when "改定の概要" # Revision overview
133
- coradoc.style = "abstract" # The older version document has ".preface"
134
- coradoc.level_int = 1
135
- when "参考文献" # Bibliography
136
- coradoc.style = "bibliography"
137
- coradoc.level_int = 1
138
- when "改訂履歴" # Document history
139
- coradoc.style = "appendix"
140
- coradoc.level_int = 1
141
- when "0 概要" # Overview
142
- coradoc.style = "abstract" # I'm not sure this is correct
143
- coradoc.level_int = 1
144
- when "索引" # Index
145
- coradoc.style = "index" # I'm not sure this is correct
146
- coradoc.level_int = 1
147
- else
148
- warn "Unknown section #{content.inspect}"
149
- end
150
- end
151
-
152
- if node.name == "h1" && content.start_with?("Annex")
153
- coradoc.style = "appendix"
154
- coradoc.content.first.content.sub!(/\AAnnex [A-Z]/, "")
155
- end
156
-
157
- # Remove numbers
158
- coradoc.content.first.content.sub!(/\A(#{IM}\.)*#{IM}[[:space:]]/o,
159
- "")
160
-
161
- coradoc
162
- end
163
-
164
- def handle_headers_h4(_node, coradoc, _state)
165
- title = Coradoc.strip_unicode(coradoc.content.first.content)
166
- case title
167
- when /\A\(\d+\)(.*)/
168
- coradoc.level_int = 4
169
- coradoc.content.first.content = $1.strip
170
- coradoc
171
- when /\A\d+\)(.*)/
172
- coradoc.level_int = 5
173
- coradoc.content.first.content = $1.strip
174
- coradoc
175
- when /\A#{IM}\.#{IM}\.#{IM}\.#{IM}(.*)/o
176
- coradoc.level_int = 4
177
- coradoc.content.first.content = $1.strip
178
- else
179
- if title.empty?
180
- # Strip instances of faulty empty paragraphs
181
- nil
182
- else
183
- ["// FIXME\n", coradoc]
184
- end
185
- end
186
- end
187
-
188
- def postprocess_asciidoc_string
189
- str = asciidoc_string
190
-
191
- ### Custom indentation handling
192
- # If there's a step up, add [none]
193
- str = str.gsub(%r{\s+//-ENDPT2D\s+//-PT3D\s+}, "\n[none]\n")
194
- str = str.gsub(%r{\s+//-ENDPT2D\s+//-PT4D\s+}, "\n[none]\n")
195
- str = str.gsub(%r{\s+//-ENDPT3D\s+//-PT4D\s+}, "\n[none]\n")
196
- # Collapse blocks of text[2,3]data
197
- str = str.gsub(%r{\s+//-ENDPT[234]D\s+//-PT[234]D\s+}, "\n\n")
198
- # In the beginning, add [none]
199
- str = str.gsub(%r{\s+//-PT[234]D\s+}, "\n\n[none]\n")
200
- # If following with another list, ensure we readd styling
201
- str = str.gsub(%r{\s+//-ENDPT[234]D\s+\*}, "\n\n[disc]\n*")
202
- # Otherwise, clean up
203
- str = str.gsub(%r{\s+//-ENDPT[234]D\s+}, "\n\n")
204
-
205
- self.asciidoc_string = str
206
- end
207
- end
208
- end
209
- end
210
- end
211
- end
212
-
213
- Coradoc::Input::Html.config.plugins << Coradoc::Input::Html::Plugin::Plateau
@@ -1,220 +0,0 @@
1
- module Coradoc
2
- module Input
3
- module Html
4
- # Postprocessor's aim is to convert a Coradoc tree from
5
- # a mess that has been created from HTML into a tree that
6
- # is compatible with what we would get out of Coradoc, if
7
- # it parsed it directly.
8
- class Postprocessor
9
- Element = Coradoc::Element
10
-
11
- def self.process(coradoc)
12
- new(coradoc).process
13
- end
14
-
15
- def initialize(coradoc)
16
- @tree = coradoc
17
- end
18
-
19
- # Extracts titles from lists. This happens in HTML files
20
- # generated from DOCX documents by LibreOffice.
21
- #
22
- # We are interested in a particular tree:
23
- # Element::List::Ordered items:
24
- # Element::List::Ordered items: (any depth)
25
- # Element::ListItem content:
26
- # Element::Title
27
- # (any number of other titles of the same scheme)
28
- #
29
- # This tree is flattened into:
30
- # Element::Title
31
- # Element::Title (any number of titles)
32
- def extract_titles_from_lists
33
- @tree = Element::Base.visit(@tree) do |elem, dir|
34
- next elem unless dir == :pre
35
- next elem unless elem.is_a?(Element::List::Ordered)
36
- next elem if elem.items.length != 1
37
-
38
- anchors = []
39
- anchors << elem.anchor if elem.anchor
40
-
41
- # Extract ListItem from any depth of List::Ordered
42
- processed = elem
43
- while processed.is_a?(Element::List::Ordered)
44
- if processed.items.length != 1
45
- backtrack = true
46
- break
47
- end
48
- anchors << processed.anchor if processed.anchor
49
- processed = processed.items.first
50
- end
51
-
52
- # Something went wrong? Anything not matching on the way?
53
- next elem if backtrack
54
- next elem unless processed.is_a?(Element::ListItem)
55
-
56
- anchors << processed.anchor if processed.anchor
57
-
58
- # Now we must have a title (or titles).
59
- titles = processed.content.flatten
60
-
61
- # Don't bother if there's no title in there.
62
- next elem unless titles.any? { |i| i.is_a? Element::Title }
63
-
64
- # Ordered is another iteration for our cleanup.
65
- next elem unless titles.all? do |i|
66
- i.is_a?(Element::Title) || i.is_a?(Element::List::Ordered)
67
- end
68
-
69
- # We are done now.
70
- titles + anchors
71
- end
72
- end
73
-
74
- # Collapse DIVs that only have a title, or nest another DIV.
75
- def collapse_meaningless_sections
76
- @tree = Element::Base.visit(@tree) do |elem, _dir|
77
- if elem.is_a?(Element::Section) && elem.safe_to_collapse?
78
- children_classes = Array(elem.contents).map(&:class)
79
- count = children_classes.length
80
- safe_classes = [Element::Section, Element::Title]
81
-
82
- # Count > 0 because some documents use <div> as a <br>.
83
- if count.positive? && children_classes.all? do |i|
84
- safe_classes.include?(i)
85
- end
86
- contents = elem.contents.dup
87
- contents.prepend(elem.anchor) if elem.anchor
88
- next contents
89
- end
90
- end
91
- elem
92
- end
93
- end
94
-
95
- # tree should now be more cleaned up, so we can progress with
96
- # creating meaningful sections
97
- def generate_meaningful_sections
98
- @tree = Element::Base.visit(@tree) do |elem, dir|
99
- # We are searching for an array, that has a title. This
100
- # will be a candidate for our section array.
101
- if dir == :post &&
102
- elem.is_a?(Array) &&
103
- !elem.flatten.grep(Element::Title).empty?
104
-
105
- elem = elem.flatten
106
-
107
- new_array = []
108
- content_array = new_array
109
- section_arrays_by_level = [new_array] * 8
110
-
111
- # For each title element, we create a new section. Then we push
112
- # all descendant sections into those sections. Otherwise, we push
113
- # an element as content of current section.
114
- elem.each do |e|
115
- if e.is_a? Element::Title
116
- title = e
117
- content_array = []
118
- section_array = []
119
- level = title.level_int
120
- section = Element::Section.new(
121
- title, contents: content_array, sections: section_array
122
- )
123
- # Some documents may not be consistent and eg. follow H4 after
124
- # H2. Let's ensure that proceeding sections will land in a
125
- # correct place.
126
- (8 - level).times do |j|
127
- section_arrays_by_level[level + j] = section_array
128
- end
129
- section_arrays_by_level[level - 1] << section
130
- else
131
- content_array << e
132
- end
133
- end
134
- next new_array
135
- end
136
- elem
137
- end
138
- end
139
-
140
- def split_sections
141
- max_level = Coradoc::Input::Html.config.split_sections
142
-
143
- return unless max_level
144
-
145
- sections = {}
146
- parent_sections = []
147
- previous_sections = {}
148
-
149
- determine_section_id = ->(elem) do
150
- level = if elem.title.style == "appendix"
151
- "A"
152
- else
153
- 1
154
- end
155
-
156
- section = previous_sections[elem]
157
- while section
158
- level = level.succ if elem.title.style == section.title.style
159
- section = previous_sections[section]
160
- end
161
- level.is_a?(Integer) ? "%02d" % level : level
162
- end
163
-
164
- determine_style = ->(elem) do
165
- style = elem.title.style || "section"
166
- style += "-"
167
- style
168
- end
169
-
170
- @tree = Element::Base.visit(@tree) do |elem, dir|
171
- title = elem.title if elem.is_a?(Element::Section)
172
-
173
- if title && title.level_int <= max_level
174
- if dir == :pre
175
- # In the PRE pass, we build a tree of sections, so that
176
- # we can compute numbers
177
- previous_sections[elem] = parent_sections[title.level_int]
178
- parent_sections[title.level_int] = elem
179
- parent_sections[(title.level_int + 1)..nil] = nil
180
-
181
- elem
182
- else
183
- # In the POST pass, we replace the sections with their
184
- # include tag.
185
- section_file = "sections/"
186
- section_file += parent_sections[1..title.level_int].map do |parent|
187
- determine_style.(parent) + determine_section_id.(parent)
188
- end.join("/")
189
- section_file += ".adoc"
190
-
191
- sections[section_file] = elem
192
- up = "../" * (title.level_int - 1)
193
- "\ninclude::#{up}#{section_file}[]\n"
194
- end
195
- else
196
- elem
197
- end
198
- end
199
-
200
- sections[nil] = @tree
201
- @tree = sections
202
- end
203
-
204
- def process
205
- extract_titles_from_lists
206
- collapse_meaningless_sections
207
- generate_meaningful_sections
208
- # Do it again to simplify the document further.
209
- # Since the structure is changed, we may have new meaningful
210
- # sections as only children of some meaningless sections.
211
- collapse_meaningless_sections
212
-
213
- split_sections
214
-
215
- @tree
216
- end
217
- end
218
- end
219
- end
220
- end
@@ -1,61 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "digest"
4
- require "nokogiri"
5
- require "coradoc/input"
6
- require_relative "html/errors"
7
- require_relative "html/cleaner"
8
- require_relative "html/config"
9
- require_relative "html/converters"
10
- require_relative "html/converters/base"
11
- require_relative "html/html_converter"
12
- require_relative "html/plugin"
13
- require_relative "html/postprocessor"
14
-
15
- module Coradoc
16
- module Input
17
- module Html
18
- def self.convert(input, options = {})
19
- Coradoc::Input::Html::HtmlConverter.convert(input, options)
20
- end
21
-
22
- def self.to_coradoc(input, options = {})
23
- Input::Html::HtmlConverter.to_coradoc(input, options)
24
- end
25
-
26
- def self.config
27
- @config ||= Config.new
28
- yield @config if block_given?
29
- @config
30
- end
31
-
32
- def self.cleaner
33
- @cleaner ||= Cleaner.new
34
- end
35
-
36
- def self.processor_id
37
- :html
38
- end
39
-
40
- def self.processor_match?(filename)
41
- %w[.html .htm].any? { |i| filename.downcase.end_with?(i) }
42
- end
43
-
44
- def self.processor_execute(input, options = {})
45
- to_coradoc(input, options)
46
- end
47
-
48
- def self.processor_postprocess(data, options)
49
- if options[:output_processor] == :adoc
50
- data.transform_values do |v|
51
- Input::Html::HtmlConverter.cleanup_result(v, options)
52
- end
53
- else
54
- data
55
- end
56
- end
57
-
58
- Coradoc::Input.define(self)
59
- end
60
- end
61
- end
@@ -1,200 +0,0 @@
1
- require "parslet"
2
- require "parslet/convenience"
3
-
4
- module Coradoc
5
- class LegacyParser < Parslet::Parser
6
- root :document
7
-
8
- # Basic Elements
9
- rule(:space) { match('\s') }
10
- rule(:space?) { spaces.maybe }
11
- rule(:spaces) { space.repeat(1) }
12
- rule(:empty_line) { match("^\n") }
13
-
14
- rule(:endline) { newline | any.absent? }
15
- rule(:newline) { match["\r\n"].repeat(1) }
16
- rule(:line_ending) { match("[\n]") }
17
-
18
- rule(:inline_element) { text }
19
- rule(:text) { match("[^\n]").repeat(1) }
20
- rule(:digits) { match("[0-9]").repeat(1) }
21
- rule(:word) { match("[a-zA-Z0-9_-]").repeat(1) }
22
- rule(:special_character) { match("^[*_:=-]") | str("[#") }
23
-
24
- rule(:text_line) do
25
- special_character.absent? >>
26
- match("[^\n]").repeat(1).as(:text) >>
27
- line_ending.as(:break)
28
- end
29
-
30
- # Common Helpers
31
- rule(:words) { word >> (space? >> word).repeat }
32
- rule(:email) { word >> str("@") >> word >> str(".") >> word }
33
-
34
- # Document
35
- rule(:document) do
36
- (
37
- document_attributes.repeat(1).as(:document_attributes) |
38
- section.as(:section) |
39
- header.as(:header) |
40
- block_with_title.as(:block) |
41
- empty_line.repeat(1) |
42
- any.as(:unparsed)
43
- ).repeat(1).as(:document)
44
- end
45
-
46
- # Header
47
- rule(:header) do
48
- match("=") >> space? >> text.as(:title) >> newline >>
49
- author.maybe.as(:author) >> revision.maybe.as(:revision)
50
- end
51
-
52
- rule(:author) do
53
- words.as(:first_name) >> str(",") >> space? >> words.as(:last_name) >>
54
- space? >> str("<") >> email.as(:email) >> str(">") >> endline
55
- end
56
-
57
- rule(:revision) do
58
- (word >> (str(".") >> word).maybe).as(:number) >>
59
- str(",") >> space? >> word.as(:date) >>
60
- str(":") >> space? >> words.as(:remark) >> newline
61
- end
62
-
63
- # DocumentAttributes
64
- rule(:document_attributes) do
65
- str(":") >> attribute_name.as(:key) >> str(":") >>
66
- space? >> attribute_value.as(:value) >> endline
67
- end
68
-
69
- # Section
70
- rule(:section) do
71
- heading.as(:title) >>
72
- (list.as(:list) |
73
- blocks.as(:blocks) |
74
- paragraphs.as(:paragraphs)).maybe
75
- end
76
-
77
- # Heading
78
- rule(:heading) do
79
- (anchor_name >> newline).maybe >>
80
- match("=").repeat(2, 8).as(:level) >>
81
- space? >> text.as(:text) >> endline.as(:break)
82
- end
83
-
84
- rule(:anchor_name) { str("[#") >> keyword.as(:name) >> str("]") }
85
-
86
- # List
87
- rule(:list) do
88
- unordered_list.as(:unordered) |
89
- definition_list.as(:definition) | ordered_list.as(:ordered)
90
- end
91
-
92
- rule(:ordered_list) { olist_item.repeat(1) }
93
- rule(:unordered_list) { ulist_item.repeat(1) }
94
- rule(:definition_list) { dlist_item.repeat(1) }
95
-
96
- rule(:olist_item) { match("\.") >> space >> text_line }
97
- rule(:ulist_item) { match("\\*") >> space >> text_line }
98
- rule(:dlist_item) do
99
- str("term") >> space >> digits >> str("::") >> space >> text_line
100
- end
101
-
102
- # Block
103
- rule(:block) { simple_block | open_block }
104
- rule(:attribute_name) { keyword }
105
- rule(:attribute_value) { text | str("") }
106
- rule(:keyword) { match("[a-zA-Z0-9_-]").repeat(1) }
107
- rule(:blocks) { block.repeat(1) >> (newline >> block.repeat(1)).maybe }
108
-
109
- rule(:block_title) { str(".") >> text.as(:title) >> line_ending }
110
- rule(:block_type) { str("[") >> keyword.as(:type) >> str("]") >> newline }
111
-
112
- rule(:block_attribute) do
113
- str("[") >> keyword.as(:key) >>
114
- str("=") >> keyword.as(:value) >> str("]")
115
- end
116
-
117
- rule(:simple_block) do
118
- block_attribute.as(:attributes) >> newline >>
119
- text_line.repeat(1).as(:lines)
120
- end
121
-
122
- rule(:open_block) do
123
- block_title >>
124
- block_type >>
125
- str("--").as(:delimiter) >> newline >>
126
- text_line.repeat.as(:lines) >>
127
- str("--") >> line_ending
128
- end
129
-
130
- rule(:example_block) do
131
- block_title >>
132
- block_type >>
133
- str("====").as(:delimiter) >> newline >>
134
- text_line.repeat(1).as(:lines) >>
135
- str("====") >> newline
136
- end
137
-
138
- rule(:sidebar_block) do
139
- block_title >>
140
- block_type.maybe >>
141
- str("****").as(:delimiter) >> newline >>
142
- text_line.repeat(1).as(:lines) >>
143
- str("****") >> newline
144
- end
145
-
146
- rule(:source_block) do
147
- block_title >>
148
- str("----").as(:delimiter) >> newline >>
149
- text_line.repeat(1).as(:lines) >>
150
- str("----") >> newline
151
- end
152
-
153
- rule(:quote_block) do
154
- block_title >>
155
- str("____").as(:delimiter) >> newline >>
156
- text_line.repeat.as(:lines) >>
157
- str("____") >> newline
158
- end
159
-
160
- rule(:block_with_title) do
161
- example_block | quote_block |
162
- sidebar_block | source_block | open_block |
163
- (block_title >> text_line.repeat(1).as(:lines))
164
- end
165
-
166
- # Paragraph
167
- rule(:paragraphs) do
168
- paragraph >> (line_ending.repeat(1) >> paragraph).repeat.maybe
169
- end
170
-
171
- rule(:paragraph) { admonitions.repeat(1) | text_line.repeat(1) }
172
-
173
- # Admonition
174
- rule(:admonition_type) do
175
- (str("NOTE") |
176
- str("TIP") |
177
- str("EDITOR") |
178
- str("DANGER") |
179
- str("CAUTION") |
180
- str("WARNING") |
181
- str("IMPORTANT")).as(:type)
182
- end
183
-
184
- rule(:admonitions) { admonition.as(:admonition).repeat(1) }
185
- rule(:admonition) { inline_admonition | block_admonition }
186
-
187
- rule(:inline_admonition) do
188
- admonition_type >> str(":") >> space? >> text_line >> newline
189
- end
190
-
191
- rule(:block_admonition) do
192
- str("[") >> admonition_type >> str("]") >> newline >> text_line >> newline
193
- end
194
-
195
- def self.parse(filename)
196
- content = File.read(filename)
197
- new.parse_with_debug(content)
198
- end
199
- end
200
- end