coradoc-html 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/lib/coradoc/html/base.rb +157 -0
  4. data/lib/coradoc/html/config.rb +467 -0
  5. data/lib/coradoc/html/converter_base.rb +177 -0
  6. data/lib/coradoc/html/converters/admonition.rb +180 -0
  7. data/lib/coradoc/html/converters/attribute.rb +68 -0
  8. data/lib/coradoc/html/converters/attribute_reference.rb +60 -0
  9. data/lib/coradoc/html/converters/audio.rb +165 -0
  10. data/lib/coradoc/html/converters/base.rb +615 -0
  11. data/lib/coradoc/html/converters/bibliography.rb +82 -0
  12. data/lib/coradoc/html/converters/bibliography_entry.rb +108 -0
  13. data/lib/coradoc/html/converters/block_image.rb +72 -0
  14. data/lib/coradoc/html/converters/bold.rb +34 -0
  15. data/lib/coradoc/html/converters/break.rb +32 -0
  16. data/lib/coradoc/html/converters/comment_block.rb +42 -0
  17. data/lib/coradoc/html/converters/comment_line.rb +54 -0
  18. data/lib/coradoc/html/converters/cross_reference.rb +59 -0
  19. data/lib/coradoc/html/converters/document.rb +108 -0
  20. data/lib/coradoc/html/converters/example.rb +114 -0
  21. data/lib/coradoc/html/converters/highlight.rb +34 -0
  22. data/lib/coradoc/html/converters/include.rb +68 -0
  23. data/lib/coradoc/html/converters/inline_image.rb +41 -0
  24. data/lib/coradoc/html/converters/italic.rb +34 -0
  25. data/lib/coradoc/html/converters/line_break.rb +31 -0
  26. data/lib/coradoc/html/converters/link.rb +46 -0
  27. data/lib/coradoc/html/converters/list_item.rb +75 -0
  28. data/lib/coradoc/html/converters/listing.rb +99 -0
  29. data/lib/coradoc/html/converters/literal.rb +102 -0
  30. data/lib/coradoc/html/converters/monospace.rb +34 -0
  31. data/lib/coradoc/html/converters/open.rb +78 -0
  32. data/lib/coradoc/html/converters/ordered.rb +53 -0
  33. data/lib/coradoc/html/converters/paragraph.rb +46 -0
  34. data/lib/coradoc/html/converters/quote.rb +113 -0
  35. data/lib/coradoc/html/converters/reviewer_comment.rb +74 -0
  36. data/lib/coradoc/html/converters/reviewer_note.rb +134 -0
  37. data/lib/coradoc/html/converters/section.rb +90 -0
  38. data/lib/coradoc/html/converters/sidebar.rb +113 -0
  39. data/lib/coradoc/html/converters/source.rb +137 -0
  40. data/lib/coradoc/html/converters/source_code.rb +16 -0
  41. data/lib/coradoc/html/converters/span.rb +61 -0
  42. data/lib/coradoc/html/converters/strikethrough.rb +34 -0
  43. data/lib/coradoc/html/converters/subscript.rb +34 -0
  44. data/lib/coradoc/html/converters/superscript.rb +34 -0
  45. data/lib/coradoc/html/converters/table.rb +85 -0
  46. data/lib/coradoc/html/converters/table_cell.rb +203 -0
  47. data/lib/coradoc/html/converters/table_row.rb +45 -0
  48. data/lib/coradoc/html/converters/template_html_converter.rb +105 -0
  49. data/lib/coradoc/html/converters/term.rb +58 -0
  50. data/lib/coradoc/html/converters/text_element.rb +44 -0
  51. data/lib/coradoc/html/converters/underline.rb +34 -0
  52. data/lib/coradoc/html/converters/unordered.rb +47 -0
  53. data/lib/coradoc/html/converters/verse.rb +105 -0
  54. data/lib/coradoc/html/converters/video.rb +179 -0
  55. data/lib/coradoc/html/element_mapping.rb +210 -0
  56. data/lib/coradoc/html/entity.rb +137 -0
  57. data/lib/coradoc/html/input/cleaner.rb +163 -0
  58. data/lib/coradoc/html/input/config.rb +79 -0
  59. data/lib/coradoc/html/input/converters/a.rb +90 -0
  60. data/lib/coradoc/html/input/converters/aside.rb +23 -0
  61. data/lib/coradoc/html/input/converters/audio.rb +50 -0
  62. data/lib/coradoc/html/input/converters/base.rb +116 -0
  63. data/lib/coradoc/html/input/converters/blockquote.rb +25 -0
  64. data/lib/coradoc/html/input/converters/br.rb +19 -0
  65. data/lib/coradoc/html/input/converters/bypass.rb +83 -0
  66. data/lib/coradoc/html/input/converters/code.rb +25 -0
  67. data/lib/coradoc/html/input/converters/div.rb +25 -0
  68. data/lib/coradoc/html/input/converters/dl.rb +106 -0
  69. data/lib/coradoc/html/input/converters/drop.rb +28 -0
  70. data/lib/coradoc/html/input/converters/em.rb +23 -0
  71. data/lib/coradoc/html/input/converters/figure.rb +58 -0
  72. data/lib/coradoc/html/input/converters/h.rb +76 -0
  73. data/lib/coradoc/html/input/converters/head.rb +30 -0
  74. data/lib/coradoc/html/input/converters/hr.rb +20 -0
  75. data/lib/coradoc/html/input/converters/ignore.rb +22 -0
  76. data/lib/coradoc/html/input/converters/img.rb +110 -0
  77. data/lib/coradoc/html/input/converters/li.rb +35 -0
  78. data/lib/coradoc/html/input/converters/mark.rb +21 -0
  79. data/lib/coradoc/html/input/converters/markup.rb +107 -0
  80. data/lib/coradoc/html/input/converters/math.rb +46 -0
  81. data/lib/coradoc/html/input/converters/ol.rb +46 -0
  82. data/lib/coradoc/html/input/converters/p.rb +81 -0
  83. data/lib/coradoc/html/input/converters/pass_through.rb +19 -0
  84. data/lib/coradoc/html/input/converters/pre.rb +59 -0
  85. data/lib/coradoc/html/input/converters/q.rb +24 -0
  86. data/lib/coradoc/html/input/converters/strong.rb +22 -0
  87. data/lib/coradoc/html/input/converters/sub.rb +40 -0
  88. data/lib/coradoc/html/input/converters/sup.rb +40 -0
  89. data/lib/coradoc/html/input/converters/table.rb +64 -0
  90. data/lib/coradoc/html/input/converters/td.rb +70 -0
  91. data/lib/coradoc/html/input/converters/text.rb +67 -0
  92. data/lib/coradoc/html/input/converters/th.rb +20 -0
  93. data/lib/coradoc/html/input/converters/tr.rb +28 -0
  94. data/lib/coradoc/html/input/converters/video.rb +53 -0
  95. data/lib/coradoc/html/input/converters.rb +122 -0
  96. data/lib/coradoc/html/input/errors.rb +22 -0
  97. data/lib/coradoc/html/input/html_converter.rb +170 -0
  98. data/lib/coradoc/html/input/plugin.rb +169 -0
  99. data/lib/coradoc/html/input/plugins/plateau.rb +229 -0
  100. data/lib/coradoc/html/input/postprocessor.rb +31 -0
  101. data/lib/coradoc/html/input.rb +68 -0
  102. data/lib/coradoc/html/output.rb +95 -0
  103. data/lib/coradoc/html/renderer.rb +409 -0
  104. data/lib/coradoc/html/spa.rb +309 -0
  105. data/lib/coradoc/html/static.rb +293 -0
  106. data/lib/coradoc/html/template_config.rb +151 -0
  107. data/lib/coradoc/html/template_helpers.rb +58 -0
  108. data/lib/coradoc/html/template_locator.rb +114 -0
  109. data/lib/coradoc/html/theme/base.rb +231 -0
  110. data/lib/coradoc/html/theme/classic_renderer.rb +390 -0
  111. data/lib/coradoc/html/theme/modern/components/ui_components.rb +344 -0
  112. data/lib/coradoc/html/theme/modern/css_generator.rb +311 -0
  113. data/lib/coradoc/html/theme/modern/javascript_generator.rb +314 -0
  114. data/lib/coradoc/html/theme/modern/serializers/document_serializer.rb +382 -0
  115. data/lib/coradoc/html/theme/modern/tailwind_config_builder.rb +164 -0
  116. data/lib/coradoc/html/theme/modern/vue_template_generator.rb +374 -0
  117. data/lib/coradoc/html/theme/modern_renderer.rb +250 -0
  118. data/lib/coradoc/html/theme/registry.rb +153 -0
  119. data/lib/coradoc/html/theme.rb +13 -0
  120. data/lib/coradoc/html/transform/from_core_model.rb +32 -0
  121. data/lib/coradoc/html/transform/to_core_model.rb +39 -0
  122. data/lib/coradoc/html/version.rb +7 -0
  123. data/lib/coradoc/html.rb +255 -0
  124. metadata +264 -0
@@ -0,0 +1,210 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Html
5
+ # Element mapping between CoreModel and HTML elements
6
+ #
7
+ # This module provides bidirectional mapping between CoreModel types
8
+ # and HTML elements for conversion purposes.
9
module ElementMapping
  class << self
    # Look up the HTML element descriptor for a CoreModel class.
    #
    # The lookup key is the last segment of the class name, lowercased
    # (e.g. `Some::Namespace::ListItem` is looked up as :listitem).
    # Classes without a name (anonymous classes) fall through to
    # {default_element_for} instead of raising.
    #
    # @param model_class [Class] class to map
    # @return [Hash] descriptor with at least :tag and :semantic keys
    def model_to_html_element(model_class)
      model_name = model_class.name.to_s.split('::').last.to_s.downcase.to_sym
      MODEL_TO_HTML[model_name] || default_element_for(model_class)
    end

    # Look up the CoreModel class name for an HTML tag.
    #
    # @param tag_name [String, Symbol] tag name, matched case-insensitively
    # @param context [Hash] forwarded to {default_model_for}
    # @return [String] fully-qualified class name
    def html_element_to_model(tag_name, context = {})
      tag = normalize_tag(tag_name)
      HTML_TO_MODEL[tag] || default_model_for(tag, context)
    end

    # Fallback descriptor for classes missing from MODEL_TO_HTML:
    # a `span` when any ancestor's name contains "InlineElement",
    # otherwise a `div`.
    #
    # @param model_class [Class]
    # @return [Hash]
    def default_element_for(model_class)
      inline = model_class.ancestors.any? { |a| a.name&.include?('InlineElement') }
      { tag: inline ? 'span' : 'div', semantic: false }
    end

    # Fallback class name for tags missing from HTML_TO_MODEL.
    #
    # @param tag [Symbol] already-normalized tag name
    # @param _context [Hash] currently unused
    # @return [String] fully-qualified class name
    def default_model_for(tag, _context)
      case tag
      when :p, :div, :section, :article
        'Coradoc::CoreModel::Block'
      else
        # Everything else, known inline tags included, is treated as inline.
        'Coradoc::CoreModel::InlineElement'
      end
    end

    # Mapping from CoreModel types to HTML elements
    MODEL_TO_HTML = {
      # Document structure
      document: { tag: 'article', semantic: true },
      section: { tag: 'section', semantic: true },
      header: { tag: 'header', semantic: true },
      title: { tag: 'h1', semantic: true },
      structuralelement: { tag: 'section', semantic: true },

      # Block elements
      paragraph: { tag: 'p', semantic: true },
      block: { tag: 'div', semantic: true },
      example: { tag: 'div', class: 'example', semantic: false },
      annotationblock: { tag: 'aside', semantic: true },
      quote: { tag: 'blockquote', semantic: true },
      verse: { tag: 'div', class: 'verse', semantic: false },
      listing: { tag: 'pre', semantic: true },
      literal: { tag: 'pre', semantic: true },
      source: { tag: 'pre', semantic: true },
      open: { tag: 'div', semantic: false },
      pass: { tag: 'div', class: 'pass', semantic: false },

      # Inline elements
      inlineelement: { tag: 'span', semantic: false },
      bold: { tag: 'strong', semantic: true },
      italic: { tag: 'em', semantic: true },
      monospace: { tag: 'code', semantic: true },
      highlight: { tag: 'mark', semantic: true },
      superscript: { tag: 'sup', semantic: true },
      subscript: { tag: 'sub', semantic: true },
      underline: { tag: 'u', semantic: false },
      strikethrough: { tag: 'del', semantic: true },
      smallcaps: { tag: 'span', class: 'small-caps', semantic: false },
      link: { tag: 'a', semantic: true },
      anchor: { tag: 'a', semantic: true },
      xref: { tag: 'a', class: 'xref', semantic: true },
      quotation: { tag: 'q', semantic: true },

      # Lists
      listblock: { tag: 'ul', semantic: true },
      listitem: { tag: 'li', semantic: true },
      orderedlist: { tag: 'ol', semantic: true },
      unorderedlist: { tag: 'ul', semantic: true },

      # Tables
      table: { tag: 'table', semantic: true },
      tablerow: { tag: 'tr', semantic: true },
      tablecell: { tag: 'td', semantic: true },

      # Media
      image: { tag: 'img', semantic: true, self_closing: true },
      video: { tag: 'video', semantic: true },
      audio: { tag: 'audio', semantic: true },

      # Other
      break: { tag: 'hr', semantic: true, self_closing: true },
      linebreak: { tag: 'br', semantic: true, self_closing: true },

      # Text
      textelement: { tag: 'text', semantic: false }
    }.freeze

    # Mapping from HTML elements to CoreModel types
    HTML_TO_MODEL = {
      # Block elements
      p: 'Coradoc::CoreModel::Block',
      div: 'Coradoc::CoreModel::Block',
      section: 'Coradoc::CoreModel::StructuralElement',
      article: 'Coradoc::CoreModel::StructuralElement',
      header: 'Coradoc::CoreModel::StructuralElement',
      aside: 'Coradoc::CoreModel::AnnotationBlock',
      blockquote: 'Coradoc::CoreModel::Block',
      pre: 'Coradoc::CoreModel::Block',

      # Inline elements
      strong: 'Coradoc::CoreModel::InlineElement',
      b: 'Coradoc::CoreModel::InlineElement',
      em: 'Coradoc::CoreModel::InlineElement',
      i: 'Coradoc::CoreModel::InlineElement',
      code: 'Coradoc::CoreModel::InlineElement',
      mark: 'Coradoc::CoreModel::InlineElement',
      sup: 'Coradoc::CoreModel::InlineElement',
      sub: 'Coradoc::CoreModel::InlineElement',
      u: 'Coradoc::CoreModel::InlineElement',
      del: 'Coradoc::CoreModel::InlineElement',
      s: 'Coradoc::CoreModel::InlineElement',
      strike: 'Coradoc::CoreModel::InlineElement',

      # Links
      a: 'Coradoc::CoreModel::InlineElement',

      # Lists
      ul: 'Coradoc::CoreModel::ListBlock',
      ol: 'Coradoc::CoreModel::ListBlock',
      li: 'Coradoc::CoreModel::ListItem',
      dl: 'Coradoc::CoreModel::ListBlock',
      dt: 'Coradoc::CoreModel::ListItem',
      dd: 'Coradoc::CoreModel::ListItem',

      # Tables
      table: 'Coradoc::CoreModel::Table',
      tr: 'Coradoc::CoreModel::TableRow',
      td: 'Coradoc::CoreModel::TableCell',
      th: 'Coradoc::CoreModel::TableCell',

      # Media
      img: 'Coradoc::CoreModel::Image',
      video: 'Coradoc::CoreModel::Block',
      audio: 'Coradoc::CoreModel::Block',

      # Other
      hr: 'Coradoc::CoreModel::Block',
      br: 'Coradoc::CoreModel::InlineElement',

      # Headings
      h1: 'Coradoc::CoreModel::StructuralElement',
      h2: 'Coradoc::CoreModel::StructuralElement',
      h3: 'Coradoc::CoreModel::StructuralElement',
      h4: 'Coradoc::CoreModel::StructuralElement',
      h5: 'Coradoc::CoreModel::StructuralElement',
      h6: 'Coradoc::CoreModel::StructuralElement'
    }.freeze

    # Check if HTML element is block-level.
    #
    # @param tag [String, Symbol, nil] tag name, matched case-insensitively
    # @return [Boolean]
    def block_element?(tag)
      BLOCK_ELEMENTS.include?(normalize_tag(tag))
    end

    # Check if HTML element is inline-level.
    #
    # @param tag [String, Symbol, nil] tag name, matched case-insensitively
    # @return [Boolean]
    def inline_element?(tag)
      INLINE_ELEMENTS.include?(normalize_tag(tag))
    end

    # Check if HTML element is self-closing.
    #
    # @param tag [String, Symbol, nil] tag name, matched case-insensitively
    # @return [Boolean]
    def self_closing?(tag)
      SELF_CLOSING_ELEMENTS.include?(normalize_tag(tag))
    end

    # Block-level HTML elements
    BLOCK_ELEMENTS = %i[
      div p section article aside header footer main nav
      blockquote pre ul ol li dl dt dd
      table tr td th thead tbody tfoot
      h1 h2 h3 h4 h5 h6
      hr
      figure figcaption
    ].freeze

    # Inline-level HTML elements
    INLINE_ELEMENTS = %i[
      span strong em b i u s del ins mark small
      code kbd samp var
      a abbr cite dfn q
      sub sup
      time
      br wbr
    ].freeze

    # Self-closing HTML elements
    SELF_CLOSING_ELEMENTS = %i[
      area base br col embed hr img input link meta param source track wbr
    ].freeze

    private

    # Canonical lookup key for a tag name: lowercased symbol.
    # Shared by every tag-based lookup so they all agree on case handling;
    # nil becomes the empty symbol, which matches nothing.
    def normalize_tag(tag)
      tag.to_s.downcase.to_sym
    end
  end
end
209
+ end
210
+ end
@@ -0,0 +1,137 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Html
5
+ # HTML entity handling
6
module Entity
  # Named HTML entities
  NAMED_ENTITIES = {
    'nbsp' => "\u00A0",
    'lt' => '<',
    'gt' => '>',
    'amp' => '&',
    'quot' => '"',
    'apos' => "'",
    'cent' => "\u00A2",
    'pound' => "\u00A3",
    'yen' => "\u00A5",
    'euro' => "\u20AC",
    'copy' => "\u00A9",
    'reg' => "\u00AE",
    'trade' => "\u2122",
    'mdash' => "\u2014",
    'ndash' => "\u2013",
    'hellip' => "\u2026",
    'laquo' => "\u00AB",
    'raquo' => "\u00BB",
    'ldquo' => "\u201C",
    'rdquo' => "\u201D",
    'lsquo' => "\u2018",
    'rsquo' => "\u2019"
  }.freeze

  # Characters always escaped by {encode}, with their replacements.
  BASIC_ESCAPES = {
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    '"' => '&quot;'
  }.freeze

  # Matches one character reference. Exactly one capture is set per match:
  # 1 = hexadecimal digits, 2 = decimal digits, 3 = entity name.
  ENTITY_REFERENCE = /&(?:#x([0-9a-fA-F]+)|#(\d+)|([a-zA-Z]+));/

  class << self
    # Encode text to HTML entities.
    #
    # `&`, `<`, `>` and `"` are always escaped.
    #
    # @param text [String, nil, Object]
    # @param options [Hash]
    # @option options [Boolean] :encode_quotes also escape `'` as `&#39;`
    # @option options [Boolean] :encode_nbsp also escape U+00A0 as `&nbsp;`
    # @return [String, Object] '' for nil; non-String input is returned as-is
    def encode(text, options = {})
      return '' if text.nil?
      return text unless text.is_a?(String)

      encoded = text.gsub(/[&<>"]/, BASIC_ESCAPES)
      encoded = encoded.gsub("'", '&#39;') if options[:encode_quotes]
      encoded = encoded.gsub("\u00A0", '&nbsp;') if options[:encode_nbsp]
      encoded
    end

    # Decode HTML entities to text.
    #
    # Every reference is decoded exactly once, in a single pass, so the
    # output of one substitution is never re-interpreted as another
    # reference (`&amp;lt;` decodes to `&lt;`, not `<`). Unknown names and
    # numeric references that are not valid code points are left as
    # written instead of raising.
    #
    # @param text [String, nil, Object]
    # @return [String, Object] '' for nil; non-String input is returned as-is
    def decode(text)
      return '' if text.nil?
      return text unless text.is_a?(String)

      text.gsub(ENTITY_REFERENCE) do |reference|
        hex, decimal, name = ::Regexp.last_match.captures
        if name
          NAMED_ENTITIES.fetch(name, reference)
        else
          codepoint_to_char(hex ? hex.to_i(16) : decimal.to_i) || reference
        end
      end
    end

    # Convert character to named entity if available.
    #
    # @param char [String]
    # @return [String] `&name;` when NAMED_ENTITIES knows the character,
    #   otherwise the character itself
    def to_named_entity(char)
      entity_name = NAMED_ENTITIES.key(char)
      entity_name ? "&#{entity_name};" : char
    end

    # Convert character to numeric entity.
    #
    # @param char [String] uses the first character's code point
    # @param format [Symbol] :decimal, :hex or :hexadecimal
    # @return [String] the reference, or `char` for an unknown format
    def to_numeric_entity(char, format: :decimal)
      codepoint = char.ord

      case format
      when :decimal
        "&##{codepoint};"
      when :hex, :hexadecimal
        "&#x#{codepoint.to_s(16)};"
      else
        char
      end
    end

    # Check if text contains HTML entities.
    #
    # @param text [Object]
    # @return [Boolean] false for non-String input
    def has_entities?(text)
      return false unless text.is_a?(String)

      text.match?(/&[a-zA-Z]+;|&#\d+;|&#x[0-9a-fA-F]+;/)
    end

    # Normalize entities: decode everything, then re-escape markup
    # characters via {encode} and use named entities for any other
    # character NAMED_ENTITIES knows.
    #
    # @param text [String, nil, Object]
    # @return [String, Object] '' for nil; non-String input is returned as-is
    def normalize(text)
      return '' if text.nil?
      return text unless text.is_a?(String)

      decode(text).each_char.map do |char|
        case char
        when '&', '<', '>', '"', "'"
          # Goes through encode with default options, so `'` stays literal.
          encode(char)
        else
          to_named_entity(char)
        end
      end.join
    end

    private

    # UTF-8 character for a code point, or nil when the value is not a
    # valid code point (out of range, surrogate).
    def codepoint_to_char(codepoint)
      codepoint.chr(Encoding::UTF_8)
    rescue RangeError
      nil
    end
  end
end
136
+ end
137
+ end
@@ -0,0 +1,163 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Input
5
+ module Html
6
# Text clean-up passes for the HTML input pipeline: whitespace and
# newline normalisation, tag-border tidying, and pre-processing of
# word-processor HTML.
class Cleaner
  # Pre-compiled regexes for performance
  INNER_WHITESPACE_REGEX_1 = /\n stem:\[/
  INNER_WHITESPACE_REGEX_2 = /(stem:\[([^\]]|\\\])*\])\n(?=\S)/
  INNER_WHITESPACE_REGEX_3 = /(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/
  NEWLINES_REGEX = /\n{3,}/
  LEADING_NEWLINE_REGEX = /\A\n+/
  WHITESPACE_REGEX = /[ \t\r\n]+/
  LEADING_WHITESPACE_REGEX = /\A[ \t\r\n]+/
  TRAILING_WHITESPACE_REGEX = /[ \t\r\n]+\z/
  MULTIPLE_WHITESPACE_REGEX = /[ \t]{2,}/
  TAG_BORDER_REGEXES = {
    asterisk: /\s?\*{2,}/,
    underscore: /\s?_{2,}/,
    tilde: /\s?~{2,}/,
    bracket: /\s?\[.*?\]\s?/
  }.freeze

  # Run every clean-up pass over a string, or over each value of a Hash.
  #
  # @param string [String, Hash]
  # @return [String, Hash] cleaned copy; the input itself is not modified
  def tidy(string)
    return string.transform_values { |i| tidy(i) } if string.is_a? Hash

    result = HtmlConverter.track_time 'Removing inner whitespace' do
      remove_inner_whitespaces(String.new(string))
    end
    result = HtmlConverter.track_time 'Removing newlines' do
      remove_newlines(result)
    end
    result = HtmlConverter.track_time 'Removing leading newlines' do
      remove_leading_newlines(result)
    end
    result = HtmlConverter.track_time 'Cleaning tag borders' do
      clean_tag_borders(result)
    end
    result = HtmlConverter.track_time 'Cleaning punctuation characters' do
      clean_punctuation_characters(result)
    end
    result = remove_block_leading_newlines(result)
    remove_section_attribute_newlines(result)
  end

  # Drop the blank line that directly follows a `****` delimiter which
  # itself follows a line ending in `]`.
  def remove_block_leading_newlines(string)
    string.gsub("]\n****\n\n", "]\n****\n")
  end

  # Drop the blank line between a line ending in `]` and a `==` line.
  def remove_section_attribute_newlines(string)
    string.gsub("]\n\n==", "]\n==")
  end

  # Collapse runs of three or more newlines into a single blank line.
  def remove_newlines(string)
    string.gsub(NEWLINES_REGEX, "\n\n")
  end

  # Strip newlines from the very start of the text.
  def remove_leading_newlines(string)
    string.gsub(LEADING_NEWLINE_REGEX, '')
  end

  # Normalise whitespace around `stem:[...]` macros and inside every line.
  #
  # The three stem passes are:
  # 1. remove the single space between a newline and `stem:[`;
  # 2. replace a newline directly after a stem macro with a space when
  #    more text follows;
  # 3. remove whitespace between a stem macro and a following `^` or `-`.
  # After that each line has runs of spaces/tabs collapsed while its
  # outer whitespace is kept as-is.
  #
  # @param string [String, nil]
  # @return [String] a new string ('' for nil); the argument is not mutated
  def remove_inner_whitespaces(string)
    return +'' if string.nil?

    text = string
           .gsub(INNER_WHITESPACE_REGEX_1, "\nstem:[")
           .gsub(INNER_WHITESPACE_REGEX_2, '\\1 ')
           .gsub(INNER_WHITESPACE_REGEX_3, '\\1')

    text.each_line.with_object(+'') do |line, result|
      result << preserve_border_whitespaces(line) do
        # Use ASCII-only strip to preserve CJK fullwidth spaces
        line.sub(LEADING_WHITESPACE_REGEX, '')
            .sub(TRAILING_WHITESPACE_REGEX, '')
            .gsub(MULTIPLE_WHITESPACE_REGEX, ' ')
      end
    end
  end

  # Tidy the whitespace just inside `~~...~~` spans and `[...]` groups,
  # keeping (or defaulting) the whitespace around them.
  #
  # Only tildes and brackets are processed here; no equivalent pass is
  # run for asterisk or underscore delimiters.
  def clean_tag_borders(string)
    result = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |match|
      preserve_border_whitespaces(
        match,
        default_border: Coradoc::Html::Input.config.tag_border
      ) do
        match.strip.sub('~~ ', '~~').sub(' ~~', '~~')
      end
    end

    result.gsub(/\s?\[.*?\]\s?/) do |match|
      preserve_border_whitespaces(match) do
        match.strip.sub('[ ', '[').sub(' ]', ']')
      end
    end
  end

  # Remove the single whitespace between a closing `**`, `~~` or `__`
  # and a following punctuation character.
  def clean_punctuation_characters(string)
    string.gsub(/(\*\*|~~|__)\s([.!?'"])/, '\\1\\2')
  end

  # preprocesses HTML, rather than postprocessing it
  def preprocess_word_html(string)
    clean_headings(scrub_whitespace(string.dup))
  end

  # Normalise whitespace in raw HTML. Mutates and returns working copies;
  # callers that need the input untouched should pass a duplicate.
  def scrub_whitespace(string)
    string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, '&#xA0;') # HTML encoded spaces
    string = Coradoc.strip_unicode(string) # Strip document-level leading and trailing whitespace
    string.gsub!(/( +)$/, ' ') # line trailing whitespace
    string.gsub!("\n\n\n\n", "\n\n") # Quadruple line breaks
    string
  end

  # Repair heading markup: empty headings become a single space, and
  # headings styled `vertical-align: super` become `<sup>` elements.
  # The original comments attribute both cases to LibreOffice output.
  # Mutates and returns the given string.
  def clean_headings(string)
    string.gsub!(%r{<h([1-9])[^>]*></h\1>}, ' ')
    string.gsub!(
      %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>},
      '<sup>\\2</sup>'
    )
    string
  end

  private

  # Re-attach the leading and trailing whitespace of `string` around the
  # value returned by the block. Where `string` has no whitespace on a
  # side, options[:default_border] is used instead, unless the string
  # contains any of `[`, `]`, `(`, `)`. Whitespace-only strings are
  # returned unchanged without calling the block.
  def preserve_border_whitespaces(string, options = {})
    return string if /\A\s*\Z/.match?(string)

    default_border = options.fetch(:default_border, '')
    # If the string contains part of a link so the characters [,],(,)
    # then don't add any extra spaces
    default_border = '' if /[\[(\])]/.match?(string)
    string_start = present_or_default(string[/\A\s*/], default_border)
    string_end = present_or_default(string[/\s*\Z/], default_border)
    result = yield
    string_start + result + string_end
  end

  # `string` unless it is nil or empty, in which case `default`.
  def present_or_default(string, default)
    return default if string.nil? || string.empty?

    string
  end
end
161
+ end
162
+ end
163
+ end
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'tmpdir'
4
+
5
+ module Coradoc
6
+ module Input
7
+ module Html
8
# Settings for the HTML input converter. Each option declared with
# {declare_option} has a stored value plus an optional temporary
# override installed by {#with}.
class Config
  def initialize
    @unknown_tags = :pass_through
    @input_format = :html
    @mathml2asciimath = false
    @external_images = false

    # Destination to save file and images
    @destination = nil

    # Source of HTML
    @sourcedir = nil

    # Image counter, assuming there are max 999 images
    @image_counter = 1
    # pad with 0s
    @image_counter_pattern = '%03d'

    @em_delimiter = '_'
    @strong_delimiter = '*'
    @inline_options = {}
    @tag_border = ' '

    @split_sections = nil

    # Document width - used to compute table sizes.
    # This is an assumption for screen size in input document.
    # If column widths are specified in absolute values, then we
    # have to convert them to relative values for better portability
    # across output formats.
    @doc_width = 1000

    # Plugin system
    @plugins = []

    # Debugging options
    @track_time = false
  end

  # Run the block with `options` as temporary overrides for declared
  # options. The given hash replaces (does not merge with) any overrides
  # already active, and the previous overrides are restored afterwards,
  # even when the block raises.
  #
  # @param options [Hash{Symbol=>Object}]
  # @return [Object] the block's return value
  def with(options = {})
    previous = @inline_options
    @inline_options = options
    yield
  ensure
    @inline_options = previous
  end

  # Define a reader and writer for `option`. The reader returns the
  # active override when one is set (including `false`), otherwise the
  # stored value; a nil override falls back to the stored value.
  #
  # @param option [Symbol]
  def self.declare_option(option)
    ivar = :"@#{option}"

    define_method(option) do
      override = @inline_options[option]
      override.nil? ? instance_variable_get(ivar) : override
    end

    attr_writer option
  end

  declare_option :unknown_tags
  declare_option :tag_border
  declare_option :mathml2asciimath
  declare_option :external_images
  declare_option :destination
  declare_option :sourcedir
  declare_option :image_counter
  declare_option :image_counter_pattern
  declare_option :input_format
  declare_option :split_sections
  declare_option :doc_width
  declare_option :plugins
  declare_option :track_time
end
77
+ end
78
+ end
79
+ end
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'coradoc'
4
+
5
+ module Coradoc
6
+ module Input
7
+ module Html
8
+ module Converters
9
# Converter for `<a>` elements.
class A < Base
  # Turn an `<a>` node into CoreModel inline element(s).
  #
  # Outcomes, checked in this order:
  # * nil when the cleaned id/name looks like `_Toc<digits>` or `_GoBack`;
  # * an 'anchor' element when the node has a non-empty id or name;
  # * an 'xref' element when href starts with `#`;
  # * nil when href is empty;
  # * otherwise a 'link' element, preceded by a single-space 'text'
  #   element when the preceding text node ends with an ambiguous
  #   character (the two are then returned as an Array).
  #
  # @param node [Object] element node; read via `node['attr']`
  # @param state [Hash] forwarded to treat_children_coradoc
  # @return [Coradoc::CoreModel::InlineElement, Array, nil]
  def to_coradoc(node, state = {})
    # Children are converted first, before any early return.
    children = treat_children_coradoc(node, state)

    href = node['href']
    title = extract_title(node)

    # Anchor name: whitespace removed, underscore runs collapsed.
    anchor_id = (node['id'] || node['name'])&.gsub(/\s/, '')&.gsub(/__+/, '_')
    anchor_id = nil if anchor_id&.empty?

    return nil if /^_Toc\d+$|^_GoBack$/.match?(anchor_id)

    # Inline anchor
    if anchor_id
      return Coradoc::CoreModel::InlineElement.new(
        format_type: 'anchor',
        target: anchor_id
      )
    end

    # Cross-reference to an anchor in the same document
    if href.to_s.start_with?('#')
      target = href.sub(/^#/, '').gsub(/\s/, '').gsub(/__+/, '_')
      label =
        case children
        when Array
          children.map { |c| c.is_a?(Coradoc::CoreModel::Base) ? c.content : c.to_s }.join
        else
          children.to_s
        end
      stripped_label = label.strip
      return Coradoc::CoreModel::InlineElement.new(
        format_type: 'xref',
        target: target,
        content: (stripped_label unless stripped_label.empty?)
      )
    end

    return nil if href.to_s.empty?

    # Plain link
    boundary_chars = /[\w.?&#=%;\[\u{ff}-\u{10ffff}]/
    right_constrain = textnode_after_start_with?(node, boundary_chars)

    # Unlike the xref branch, elements without content fall back to to_s.
    label =
      case children
      when Array
        children.map { |c| c.is_a?(Coradoc::CoreModel::Base) && c.content ? c.content : c.to_s }.join
      else
        children.to_s
      end

    # Leading space element, only when the text before needs separating.
    spacer =
      if textnode_before_end_with?(node, boundary_chars)
        Coradoc::CoreModel::InlineElement.new(
          format_type: 'text',
          content: ' '
        )
      end

    link_title = title.to_s.strip.empty? ? nil : title.strip
    link = Coradoc::CoreModel::InlineElement.new(
      format_type: 'link',
      target: href,
      content: label.strip,
      metadata: {
        title: link_title,
        right_constrain: right_constrain
      }.compact
    )

    # Single element, or [spacer, link] when a spacer was needed.
    spacer ? [spacer, link] : link
  end
end
85
+
86
+ register :a, A.new
87
+ end
88
+ end
89
+ end
90
+ end