coradoc-html 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/lib/coradoc/html/base.rb +157 -0
- data/lib/coradoc/html/config.rb +467 -0
- data/lib/coradoc/html/converter_base.rb +177 -0
- data/lib/coradoc/html/converters/admonition.rb +180 -0
- data/lib/coradoc/html/converters/attribute.rb +68 -0
- data/lib/coradoc/html/converters/attribute_reference.rb +60 -0
- data/lib/coradoc/html/converters/audio.rb +165 -0
- data/lib/coradoc/html/converters/base.rb +615 -0
- data/lib/coradoc/html/converters/bibliography.rb +82 -0
- data/lib/coradoc/html/converters/bibliography_entry.rb +108 -0
- data/lib/coradoc/html/converters/block_image.rb +72 -0
- data/lib/coradoc/html/converters/bold.rb +34 -0
- data/lib/coradoc/html/converters/break.rb +32 -0
- data/lib/coradoc/html/converters/comment_block.rb +42 -0
- data/lib/coradoc/html/converters/comment_line.rb +54 -0
- data/lib/coradoc/html/converters/cross_reference.rb +59 -0
- data/lib/coradoc/html/converters/document.rb +108 -0
- data/lib/coradoc/html/converters/example.rb +114 -0
- data/lib/coradoc/html/converters/highlight.rb +34 -0
- data/lib/coradoc/html/converters/include.rb +68 -0
- data/lib/coradoc/html/converters/inline_image.rb +41 -0
- data/lib/coradoc/html/converters/italic.rb +34 -0
- data/lib/coradoc/html/converters/line_break.rb +31 -0
- data/lib/coradoc/html/converters/link.rb +46 -0
- data/lib/coradoc/html/converters/list_item.rb +75 -0
- data/lib/coradoc/html/converters/listing.rb +99 -0
- data/lib/coradoc/html/converters/literal.rb +102 -0
- data/lib/coradoc/html/converters/monospace.rb +34 -0
- data/lib/coradoc/html/converters/open.rb +78 -0
- data/lib/coradoc/html/converters/ordered.rb +53 -0
- data/lib/coradoc/html/converters/paragraph.rb +46 -0
- data/lib/coradoc/html/converters/quote.rb +113 -0
- data/lib/coradoc/html/converters/reviewer_comment.rb +74 -0
- data/lib/coradoc/html/converters/reviewer_note.rb +134 -0
- data/lib/coradoc/html/converters/section.rb +90 -0
- data/lib/coradoc/html/converters/sidebar.rb +113 -0
- data/lib/coradoc/html/converters/source.rb +137 -0
- data/lib/coradoc/html/converters/source_code.rb +16 -0
- data/lib/coradoc/html/converters/span.rb +61 -0
- data/lib/coradoc/html/converters/strikethrough.rb +34 -0
- data/lib/coradoc/html/converters/subscript.rb +34 -0
- data/lib/coradoc/html/converters/superscript.rb +34 -0
- data/lib/coradoc/html/converters/table.rb +85 -0
- data/lib/coradoc/html/converters/table_cell.rb +203 -0
- data/lib/coradoc/html/converters/table_row.rb +45 -0
- data/lib/coradoc/html/converters/template_html_converter.rb +105 -0
- data/lib/coradoc/html/converters/term.rb +58 -0
- data/lib/coradoc/html/converters/text_element.rb +44 -0
- data/lib/coradoc/html/converters/underline.rb +34 -0
- data/lib/coradoc/html/converters/unordered.rb +47 -0
- data/lib/coradoc/html/converters/verse.rb +105 -0
- data/lib/coradoc/html/converters/video.rb +179 -0
- data/lib/coradoc/html/element_mapping.rb +210 -0
- data/lib/coradoc/html/entity.rb +137 -0
- data/lib/coradoc/html/input/cleaner.rb +163 -0
- data/lib/coradoc/html/input/config.rb +79 -0
- data/lib/coradoc/html/input/converters/a.rb +90 -0
- data/lib/coradoc/html/input/converters/aside.rb +23 -0
- data/lib/coradoc/html/input/converters/audio.rb +50 -0
- data/lib/coradoc/html/input/converters/base.rb +116 -0
- data/lib/coradoc/html/input/converters/blockquote.rb +25 -0
- data/lib/coradoc/html/input/converters/br.rb +19 -0
- data/lib/coradoc/html/input/converters/bypass.rb +83 -0
- data/lib/coradoc/html/input/converters/code.rb +25 -0
- data/lib/coradoc/html/input/converters/div.rb +25 -0
- data/lib/coradoc/html/input/converters/dl.rb +106 -0
- data/lib/coradoc/html/input/converters/drop.rb +28 -0
- data/lib/coradoc/html/input/converters/em.rb +23 -0
- data/lib/coradoc/html/input/converters/figure.rb +58 -0
- data/lib/coradoc/html/input/converters/h.rb +76 -0
- data/lib/coradoc/html/input/converters/head.rb +30 -0
- data/lib/coradoc/html/input/converters/hr.rb +20 -0
- data/lib/coradoc/html/input/converters/ignore.rb +22 -0
- data/lib/coradoc/html/input/converters/img.rb +110 -0
- data/lib/coradoc/html/input/converters/li.rb +35 -0
- data/lib/coradoc/html/input/converters/mark.rb +21 -0
- data/lib/coradoc/html/input/converters/markup.rb +107 -0
- data/lib/coradoc/html/input/converters/math.rb +46 -0
- data/lib/coradoc/html/input/converters/ol.rb +46 -0
- data/lib/coradoc/html/input/converters/p.rb +81 -0
- data/lib/coradoc/html/input/converters/pass_through.rb +19 -0
- data/lib/coradoc/html/input/converters/pre.rb +59 -0
- data/lib/coradoc/html/input/converters/q.rb +24 -0
- data/lib/coradoc/html/input/converters/strong.rb +22 -0
- data/lib/coradoc/html/input/converters/sub.rb +40 -0
- data/lib/coradoc/html/input/converters/sup.rb +40 -0
- data/lib/coradoc/html/input/converters/table.rb +64 -0
- data/lib/coradoc/html/input/converters/td.rb +70 -0
- data/lib/coradoc/html/input/converters/text.rb +67 -0
- data/lib/coradoc/html/input/converters/th.rb +20 -0
- data/lib/coradoc/html/input/converters/tr.rb +28 -0
- data/lib/coradoc/html/input/converters/video.rb +53 -0
- data/lib/coradoc/html/input/converters.rb +122 -0
- data/lib/coradoc/html/input/errors.rb +22 -0
- data/lib/coradoc/html/input/html_converter.rb +170 -0
- data/lib/coradoc/html/input/plugin.rb +169 -0
- data/lib/coradoc/html/input/plugins/plateau.rb +229 -0
- data/lib/coradoc/html/input/postprocessor.rb +31 -0
- data/lib/coradoc/html/input.rb +68 -0
- data/lib/coradoc/html/output.rb +95 -0
- data/lib/coradoc/html/renderer.rb +409 -0
- data/lib/coradoc/html/spa.rb +309 -0
- data/lib/coradoc/html/static.rb +293 -0
- data/lib/coradoc/html/template_config.rb +151 -0
- data/lib/coradoc/html/template_helpers.rb +58 -0
- data/lib/coradoc/html/template_locator.rb +114 -0
- data/lib/coradoc/html/theme/base.rb +231 -0
- data/lib/coradoc/html/theme/classic_renderer.rb +390 -0
- data/lib/coradoc/html/theme/modern/components/ui_components.rb +344 -0
- data/lib/coradoc/html/theme/modern/css_generator.rb +311 -0
- data/lib/coradoc/html/theme/modern/javascript_generator.rb +314 -0
- data/lib/coradoc/html/theme/modern/serializers/document_serializer.rb +382 -0
- data/lib/coradoc/html/theme/modern/tailwind_config_builder.rb +164 -0
- data/lib/coradoc/html/theme/modern/vue_template_generator.rb +374 -0
- data/lib/coradoc/html/theme/modern_renderer.rb +250 -0
- data/lib/coradoc/html/theme/registry.rb +153 -0
- data/lib/coradoc/html/theme.rb +13 -0
- data/lib/coradoc/html/transform/from_core_model.rb +32 -0
- data/lib/coradoc/html/transform/to_core_model.rb +39 -0
- data/lib/coradoc/html/version.rb +7 -0
- data/lib/coradoc/html.rb +255 -0
- metadata +264 -0
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Html
|
|
5
|
+
# Element mapping between CoreModel and HTML elements
|
|
6
|
+
#
|
|
7
|
+
# This module provides bidirectional mapping between CoreModel types
|
|
8
|
+
# and HTML elements for conversion purposes.
|
|
9
|
+
module ElementMapping
  class << self
    # Map a CoreModel class to its HTML element descriptor.
    #
    # The lookup key is the last "::"-separated segment of the class name,
    # downcased (e.g. "Coradoc::CoreModel::Bold" => :bold). Classes that are
    # not listed in MODEL_TO_HTML, and classes without a name (anonymous
    # classes), fall back to {default_element_for}.
    #
    # @param model_class [Class] the model class to look up
    # @return [Hash] descriptor containing at least :tag and :semantic
    def model_to_html_element(model_class)
      class_name = model_class.name
      # Anonymous classes report a nil name; there is nothing to look up.
      return default_element_for(model_class) if class_name.nil? || class_name.empty?

      key = class_name.split('::').last.downcase.to_sym
      MODEL_TO_HTML[key] || default_element_for(model_class)
    end

    # Map an HTML tag name to the name of the CoreModel class representing it.
    #
    # @param tag_name [String, Symbol] tag name, matched case-insensitively
    # @param context [Hash] forwarded to {default_model_for} (currently unused there)
    # @return [String] fully qualified CoreModel class name
    def html_element_to_model(tag_name, context = {})
      tag = normalize_tag(tag_name)
      HTML_TO_MODEL[tag] || default_model_for(tag, context)
    end

    # Fallback element descriptor for a model class without an explicit mapping.
    #
    # @param model_class [Class]
    # @return [Hash] a non-semantic <span> when any ancestor's name contains
    #   "InlineElement", otherwise a non-semantic <div>
    def default_element_for(model_class)
      inline = model_class.ancestors.any? { |ancestor| ancestor.name&.include?('InlineElement') }
      { tag: inline ? 'span' : 'div', semantic: false }
    end

    # Fallback model for an HTML tag without an explicit mapping.
    #
    # Tags listed in BLOCK_ELEMENTS (e.g. footer, nav, thead, figure) map to
    # Block; every other tag maps to InlineElement.
    #
    # @param tag [String, Symbol] tag name, matched case-insensitively
    # @param _context [Hash, nil] unused
    # @return [String] fully qualified CoreModel class name
    def default_model_for(tag, _context = nil)
      if block_element?(tag)
        'Coradoc::CoreModel::Block'
      else
        'Coradoc::CoreModel::InlineElement'
      end
    end

    # NOTE: the constants below are defined inside `class << self`, so they
    # live on the singleton class and are resolved lexically by the methods
    # of this block.

    # Mapping from CoreModel types to HTML elements
    MODEL_TO_HTML = {
      # Document structure
      document: { tag: 'article', semantic: true },
      section: { tag: 'section', semantic: true },
      header: { tag: 'header', semantic: true },
      title: { tag: 'h1', semantic: true },
      structuralelement: { tag: 'section', semantic: true },

      # Block elements
      paragraph: { tag: 'p', semantic: true },
      block: { tag: 'div', semantic: true },
      example: { tag: 'div', class: 'example', semantic: false },
      annotationblock: { tag: 'aside', semantic: true },
      quote: { tag: 'blockquote', semantic: true },
      verse: { tag: 'div', class: 'verse', semantic: false },
      listing: { tag: 'pre', semantic: true },
      literal: { tag: 'pre', semantic: true },
      source: { tag: 'pre', semantic: true },
      open: { tag: 'div', semantic: false },
      pass: { tag: 'div', class: 'pass', semantic: false },

      # Inline elements
      inlineelement: { tag: 'span', semantic: false },
      bold: { tag: 'strong', semantic: true },
      italic: { tag: 'em', semantic: true },
      monospace: { tag: 'code', semantic: true },
      highlight: { tag: 'mark', semantic: true },
      superscript: { tag: 'sup', semantic: true },
      subscript: { tag: 'sub', semantic: true },
      underline: { tag: 'u', semantic: false },
      strikethrough: { tag: 'del', semantic: true },
      smallcaps: { tag: 'span', class: 'small-caps', semantic: false },
      link: { tag: 'a', semantic: true },
      anchor: { tag: 'a', semantic: true },
      xref: { tag: 'a', class: 'xref', semantic: true },
      quotation: { tag: 'q', semantic: true },

      # Lists
      listblock: { tag: 'ul', semantic: true },
      listitem: { tag: 'li', semantic: true },
      orderedlist: { tag: 'ol', semantic: true },
      unorderedlist: { tag: 'ul', semantic: true },

      # Tables
      table: { tag: 'table', semantic: true },
      tablerow: { tag: 'tr', semantic: true },
      tablecell: { tag: 'td', semantic: true },

      # Media
      image: { tag: 'img', semantic: true, self_closing: true },
      video: { tag: 'video', semantic: true },
      audio: { tag: 'audio', semantic: true },

      # Other
      break: { tag: 'hr', semantic: true, self_closing: true },
      linebreak: { tag: 'br', semantic: true, self_closing: true },

      # Text
      textelement: { tag: 'text', semantic: false }
    }.freeze

    # Mapping from HTML elements to CoreModel types
    HTML_TO_MODEL = {
      # Block elements
      p: 'Coradoc::CoreModel::Block',
      div: 'Coradoc::CoreModel::Block',
      section: 'Coradoc::CoreModel::StructuralElement',
      article: 'Coradoc::CoreModel::StructuralElement',
      header: 'Coradoc::CoreModel::StructuralElement',
      aside: 'Coradoc::CoreModel::AnnotationBlock',
      blockquote: 'Coradoc::CoreModel::Block',
      pre: 'Coradoc::CoreModel::Block',

      # Inline elements
      strong: 'Coradoc::CoreModel::InlineElement',
      b: 'Coradoc::CoreModel::InlineElement',
      em: 'Coradoc::CoreModel::InlineElement',
      i: 'Coradoc::CoreModel::InlineElement',
      code: 'Coradoc::CoreModel::InlineElement',
      mark: 'Coradoc::CoreModel::InlineElement',
      sup: 'Coradoc::CoreModel::InlineElement',
      sub: 'Coradoc::CoreModel::InlineElement',
      u: 'Coradoc::CoreModel::InlineElement',
      del: 'Coradoc::CoreModel::InlineElement',
      s: 'Coradoc::CoreModel::InlineElement',
      strike: 'Coradoc::CoreModel::InlineElement',

      # Links
      a: 'Coradoc::CoreModel::InlineElement',

      # Lists
      ul: 'Coradoc::CoreModel::ListBlock',
      ol: 'Coradoc::CoreModel::ListBlock',
      li: 'Coradoc::CoreModel::ListItem',
      dl: 'Coradoc::CoreModel::ListBlock',
      dt: 'Coradoc::CoreModel::ListItem',
      dd: 'Coradoc::CoreModel::ListItem',

      # Tables
      table: 'Coradoc::CoreModel::Table',
      tr: 'Coradoc::CoreModel::TableRow',
      td: 'Coradoc::CoreModel::TableCell',
      th: 'Coradoc::CoreModel::TableCell',

      # Media
      img: 'Coradoc::CoreModel::Image',
      video: 'Coradoc::CoreModel::Block',
      audio: 'Coradoc::CoreModel::Block',

      # Other
      hr: 'Coradoc::CoreModel::Block',
      br: 'Coradoc::CoreModel::InlineElement',

      # Headings
      h1: 'Coradoc::CoreModel::StructuralElement',
      h2: 'Coradoc::CoreModel::StructuralElement',
      h3: 'Coradoc::CoreModel::StructuralElement',
      h4: 'Coradoc::CoreModel::StructuralElement',
      h5: 'Coradoc::CoreModel::StructuralElement',
      h6: 'Coradoc::CoreModel::StructuralElement'
    }.freeze

    # Check if HTML element is block-level (listed in BLOCK_ELEMENTS).
    #
    # @param tag [String, Symbol, nil] tag name, matched case-insensitively
    # @return [Boolean]
    def block_element?(tag)
      BLOCK_ELEMENTS.include?(normalize_tag(tag))
    end

    # Check if HTML element is inline-level (listed in INLINE_ELEMENTS).
    #
    # @param tag [String, Symbol, nil] tag name, matched case-insensitively
    # @return [Boolean]
    def inline_element?(tag)
      INLINE_ELEMENTS.include?(normalize_tag(tag))
    end

    # Check if HTML element is self-closing (listed in SELF_CLOSING_ELEMENTS).
    #
    # @param tag [String, Symbol, nil] tag name, matched case-insensitively
    # @return [Boolean]
    def self_closing?(tag)
      SELF_CLOSING_ELEMENTS.include?(normalize_tag(tag))
    end

    # Block-level HTML elements
    BLOCK_ELEMENTS = %i[
      div p section article aside header footer main nav
      blockquote pre ul ol li dl dt dd
      table tr td th thead tbody tfoot
      h1 h2 h3 h4 h5 h6
      hr
      figure figcaption
    ].freeze

    # Inline-level HTML elements
    INLINE_ELEMENTS = %i[
      span strong em b i u s del ins mark small
      code kbd samp var
      a abbr cite dfn q
      sub sup
      time
      br wbr
    ].freeze

    # Self-closing HTML elements
    SELF_CLOSING_ELEMENTS = %i[
      area base br col embed hr img input link meta param source track wbr
    ].freeze

    private

    # Canonical lookup key for a tag name: a downcased Symbol.
    # nil becomes the empty symbol, which is in none of the tables.
    def normalize_tag(tag)
      tag.to_s.downcase.to_sym
    end
  end
end
|
|
209
|
+
end
|
|
210
|
+
end
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Html
|
|
5
|
+
# HTML entity handling
|
|
6
|
+
# HTML entity encoding and decoding helpers.
module Entity
  # Named HTML entities understood by this module, keyed by entity name
  # (without the leading "&" and trailing ";").
  NAMED_ENTITIES = {
    'nbsp' => "\u00A0",
    'lt' => '<',
    'gt' => '>',
    'amp' => '&',
    'quot' => '"',
    'apos' => "'",
    'cent' => "\u00A2",
    'pound' => "\u00A3",
    'yen' => "\u00A5",
    'euro' => "\u20AC",
    'copy' => "\u00A9",
    'reg' => "\u00AE",
    'trade' => "\u2122",
    'mdash' => "\u2014",
    'ndash' => "\u2013",
    'hellip' => "\u2026",
    'laquo' => "\u00AB",
    'raquo' => "\u00BB",
    'ldquo' => "\u201C",
    'rdquo' => "\u201D",
    'lsquo' => "\u2018",
    'rsquo' => "\u2019"
  }.freeze

  # Reverse lookup of NAMED_ENTITIES: character => entity name.
  ENTITY_NAMES_BY_CHAR = NAMED_ENTITIES.invert.freeze

  # Characters that {encode} always escapes, with their replacements.
  BASIC_ESCAPES = {
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    '"' => '&quot;'
  }.freeze

  # One character reference: decimal (capture 1), hexadecimal (capture 2)
  # or named (capture 3).
  ENTITY_PATTERN = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));/

  # Largest Unicode code point.
  MAX_CODEPOINT = 0x10FFFF

  # UTF-16 surrogate range; these code points are not characters.
  SURROGATE_CODEPOINTS = (0xD800..0xDFFF).freeze

  class << self
    # Encode text to HTML entities.
    #
    # Always escapes &, <, > and ". The replacement is done in one pass, so
    # the "&" of a freshly produced entity is never escaped again.
    #
    # @param text [String, nil, Object] text to encode
    # @param options [Hash]
    # @option options [Boolean] :encode_quotes also escape ' as &#39;
    # @option options [Boolean] :encode_nbsp also escape U+00A0 as &nbsp;
    # @return [String, Object] '' for nil, the argument itself when it is
    #   not a String, otherwise a new encoded String
    def encode(text, options = {})
      return '' if text.nil?
      return text unless text.is_a?(String)

      encoded = text.gsub(/[&<>"]/, BASIC_ESCAPES)
      encoded = encoded.gsub("'", '&#39;') if options[:encode_quotes]
      encoded = encoded.gsub("\u00A0", '&nbsp;') if options[:encode_nbsp]
      encoded
    end

    # Decode HTML entities to text.
    #
    # Every reference is decoded exactly once, in a single left-to-right
    # pass: "&amp;lt;" becomes "&lt;", not "<". Unknown names and numeric
    # references that are not valid code points are left untouched.
    #
    # @param text [String, nil, Object] text to decode
    # @return [String, Object] '' for nil, the argument itself when it is
    #   not a String, otherwise a new decoded String
    def decode(text)
      return '' if text.nil?
      return text unless text.is_a?(String)

      text.gsub(ENTITY_PATTERN) do |entity|
        decimal, hex, name = ::Regexp.last_match.captures
        decoded =
          if name
            NAMED_ENTITIES[name]
          else
            codepoint_to_char(decimal ? decimal.to_i : hex.to_i(16))
          end
        decoded || entity
      end
    end

    # Convert character to named entity if available.
    #
    # @param char [String] a single character
    # @return [String] "&name;" when NAMED_ENTITIES knows the character,
    #   otherwise the character unchanged
    def to_named_entity(char)
      entity_name = ENTITY_NAMES_BY_CHAR[char]
      entity_name ? "&#{entity_name};" : char
    end

    # Convert character to numeric entity.
    #
    # @param char [String] the character whose first code point is used
    # @param format [Symbol] :decimal, :hex or :hexadecimal
    # @return [String] the numeric reference, or char unchanged for any
    #   other format
    def to_numeric_entity(char, format: :decimal)
      codepoint = char.ord

      case format
      when :decimal
        "&##{codepoint};"
      when :hex, :hexadecimal
        "&#x#{codepoint.to_s(16)};"
      else
        char
      end
    end

    # Check if text contains HTML entities (named, decimal or hexadecimal).
    #
    # @param text [Object]
    # @return [Boolean] false for anything that is not a String
    def has_entities?(text)
      return false unless text.is_a?(String)

      text.match?(ENTITY_PATTERN)
    end

    # Normalize entities: decode everything, then re-encode each character
    # that has a name in NAMED_ENTITIES as its named entity. The apostrophe
    # is kept literal, and characters without a named entity are emitted
    # as-is (they are not converted to numeric references).
    #
    # @param text [String, nil, Object]
    # @return [String, Object] '' for nil, the argument itself when it is
    #   not a String, otherwise the normalized String
    def normalize(text)
      return '' if text.nil?
      return text unless text.is_a?(String)

      decode(text).each_char.map do |char|
        char == "'" ? char : to_named_entity(char)
      end.join
    end

    private

    # Character for a code point, or nil when the value is outside the
    # Unicode range or is a surrogate.
    def codepoint_to_char(codepoint)
      return nil if codepoint > MAX_CODEPOINT || SURROGATE_CODEPOINTS.cover?(codepoint)

      [codepoint].pack('U')
    end
  end
end
|
|
136
|
+
end
|
|
137
|
+
end
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Input
|
|
5
|
+
module Html
|
|
6
|
+
# Whitespace and markup clean-up applied to converted text (see #tidy) and
# to Word/LibreOffice HTML before conversion (see #preprocess_word_html).
class Cleaner
  # Pre-compiled regexes for performance
  INNER_WHITESPACE_REGEX_1 = /\n stem:\[/
  INNER_WHITESPACE_REGEX_2 = /(stem:\[([^\]]|\\\])*\])\n(?=\S)/
  INNER_WHITESPACE_REGEX_3 = /(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/
  NEWLINES_REGEX = /\n{3,}/
  LEADING_NEWLINE_REGEX = /\A\n+/
  WHITESPACE_REGEX = /[ \t\r\n]+/
  LEADING_WHITESPACE_REGEX = /\A[ \t\r\n]+/
  TRAILING_WHITESPACE_REGEX = /[ \t\r\n]+\z/
  MULTIPLE_WHITESPACE_REGEX = /[ \t]{2,}/
  TAG_BORDER_REGEXES = {
    asterisk: /\s?\*{2,}/,
    underscore: /\s?_{2,}/,
    tilde: /\s?~{2,}/,
    bracket: /\s?\[.*?\]\s?/
  }.freeze

  # Run every clean-up step over a converted string.
  #
  # @param string [String, Hash] text to clean; for a Hash every value is
  #   tidied recursively and a new Hash is returned
  # @return [String, Hash] the cleaned text
  def tidy(string)
    return string.transform_values { |value| tidy(value) } if string.is_a?(Hash)

    result = HtmlConverter.track_time('Removing inner whitespace') do
      remove_inner_whitespaces(String.new(string))
    end
    result = HtmlConverter.track_time('Removing newlines') { remove_newlines(result) }
    result = HtmlConverter.track_time('Removing leading newlines') { remove_leading_newlines(result) }
    result = HtmlConverter.track_time('Cleaning tag borders') { clean_tag_borders(result) }
    result = HtmlConverter.track_time('Cleaning punctuation characters') do
      clean_punctuation_characters(result)
    end
    result = remove_block_leading_newlines(result)
    remove_section_attribute_newlines(result)
  end

  # Drop the blank line that follows a "]\n****" block opening.
  def remove_block_leading_newlines(string)
    string.gsub("]\n****\n\n", "]\n****\n")
  end

  # Drop the blank line between a "]" line and a following "==" heading.
  def remove_section_attribute_newlines(string)
    string.gsub("]\n\n==", "]\n==")
  end

  # Collapse runs of three or more newlines into exactly two.
  def remove_newlines(string)
    string.gsub(NEWLINES_REGEX, "\n\n")
  end

  # Strip newlines at the very start of the string.
  def remove_leading_newlines(string)
    string.gsub(LEADING_NEWLINE_REGEX, '')
  end

  # Normalise whitespace around stem:[...] macros and inside every line.
  #
  # Each INNER_WHITESPACE_REGEX_n is paired with its own replacement:
  # 1. a stem macro indented by one space at the start of a line is unindented;
  # 2. a newline directly after a stem macro, followed by text, becomes a space;
  # 3. whitespace between a stem macro and a following "^" or "-" is removed.
  # Afterwards runs of spaces/tabs inside each line collapse to one space
  # while the line's leading and trailing whitespace is kept.
  #
  # The argument is not modified.
  #
  # @param string [String, nil]
  # @return [String] a new string; empty when string is nil
  def remove_inner_whitespaces(string)
    return +'' if string.nil?

    normalized = string
                 .gsub(INNER_WHITESPACE_REGEX_1, "\nstem:[")
                 .gsub(INNER_WHITESPACE_REGEX_2, '\\1 ')
                 .gsub(INNER_WHITESPACE_REGEX_3, '\\1')

    normalized.each_line.map do |line|
      preserve_border_whitespaces(line) do
        # Use ASCII-only strip to preserve CJK fullwidth spaces
        line.sub(LEADING_WHITESPACE_REGEX, '')
            .sub(TRAILING_WHITESPACE_REGEX, '')
            .gsub(MULTIPLE_WHITESPACE_REGEX, ' ')
      end
    end.join
  end

  # Tidy the whitespace just inside "~~...~~" spans and "[...]" brackets so
  # that the delimiters hug their content, keeping (or defaulting) the
  # whitespace around the span.
  #
  # The equivalent handling for "**" and "__" spans is currently disabled.
  def clean_tag_borders(string)
    result = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |match|
      preserve_border_whitespaces(
        match,
        default_border: Coradoc::Html::Input.config.tag_border
      ) do
        match.strip.sub('~~ ', '~~').sub(' ~~', '~~')
      end
    end

    result.gsub(/\s?\[.*?\]\s?/) do |match|
      preserve_border_whitespaces(match) do
        match.strip.sub('[ ', '[').sub(' ]', ']')
      end
    end
  end

  # Remove the single whitespace character between a closing "**", "~~" or
  # "__" and a following punctuation or quote character.
  def clean_punctuation_characters(string)
    string.gsub(/(\*\*|~~|__)\s([.!?'"])/, '\\1\\2')
  end

  # preprocesses HTML, rather than postprocessing it
  #
  # Works on a copy; the argument is not modified.
  def preprocess_word_html(string)
    clean_headings(scrub_whitespace(string.dup))
  end

  # Normalise whitespace in raw HTML. The first substitution modifies the
  # argument in place.
  def scrub_whitespace(string)
    # Canonicalise every spelling of a non-breaking space to one numeric entity.
    string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, '&#160;') # HTML encoded spaces
    string = Coradoc.strip_unicode(string) # Strip document-level leading and trailing whitespace
    string.gsub!(/( +)$/, ' ') # line trailing whitespace
    string.gsub!("\n\n\n\n", "\n\n") # Quadruple line breaks
    # string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs
    string
  end

  # Remove empty headings and turn headings styled as superscript into
  # <sup> elements. Modifies the argument in place and returns it.
  def clean_headings(string)
    string.gsub!(%r{<h([1-9])[^>]*></h\1>}, ' ')
    # I don't know why Libre Office is inserting them, but they need to go
    string.gsub!(
      %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>},
      '<sup>\\2</sup>'
    )
    # I absolutely don't know why Libre Office is rendering superscripts as h1
    string
  end

  private

  # Yield to compute a replacement for string and re-attach the original
  # leading and trailing whitespace of string around the block's result.
  #
  # @param string [String]
  # @param options [Hash]
  # @option options [String] :default_border border used on a side that has
  #   no whitespace; ignored when string contains any of [ ] ( )
  # @return [String] string itself when it is blank, otherwise
  #   leading border + block result + trailing border
  def preserve_border_whitespaces(string, options = {})
    return string if /\A\s*\Z/.match?(string)

    default_border = options.fetch(:default_border, '')
    # If the string contains part of a link so the characters [,],(,)
    # then don't add any extra spaces
    default_border = '' if /[\[(\])]/.match?(string)
    string_start = present_or_default(string[/\A\s*/], default_border)
    string_end = present_or_default(string[/\s*\Z/], default_border)
    result = yield
    string_start + result + string_end
  end

  # Return string unless it is nil or empty, in which case return default.
  def present_or_default(string, default)
    return default if string.nil? || string.empty?

    string
  end
end
|
|
161
|
+
end
|
|
162
|
+
end
|
|
163
|
+
end
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'tmpdir'
|
|
4
|
+
|
|
5
|
+
module Coradoc
|
|
6
|
+
module Input
|
|
7
|
+
module Html
|
|
8
|
+
# Settings for HTML input conversion.
#
# Every option declared with {declare_option} has a writer and a reader.
# The reader returns the temporary value installed by {#with} when one is
# present for that option, otherwise the configured value.
class Config
  def initialize
    @unknown_tags = :pass_through
    @input_format = :html
    @mathml2asciimath = false
    @external_images = false

    # Destination to save file and images
    @destination = nil

    # Source of HTML
    @sourcedir = nil

    # Image counter, assuming there are max 999 images
    @image_counter = 1
    # pad with 0s
    @image_counter_pattern = '%03d'

    @em_delimiter = '_'
    @strong_delimiter = '*'
    @inline_options = {}
    @tag_border = ' '

    @split_sections = nil

    # Document width - used to compute table sizes.
    # This is an assumption for screen size in input document.
    # If column widths are specified in absolute values, then we
    # have to convert them to relative values for better portability
    # across output formats.
    @doc_width = 1000

    # Plugin system
    @plugins = []

    # Debugging options
    @track_time = false
  end

  # Run the given block with temporary option overrides.
  #
  # The overrides replace (they are not merged with) any overrides that are
  # already active, and the previous overrides are restored when the block
  # finishes, including when it raises.
  #
  # @param options [Hash{Symbol => Object}] option name => temporary value
  # @yield the code to run with the overrides active
  # @return [Object] the block's return value
  def with(options = {})
    previous_options = @inline_options
    @inline_options = options
    yield
  ensure
    @inline_options = previous_options
  end

  # Define a reader and a writer for a configuration option.
  #
  # The reader prefers a non-nil override from {#with}; an override of
  # false is honoured, a missing or nil override falls back to the
  # instance variable of the same name.
  #
  # @param option [Symbol] option name
  def self.declare_option(option)
    define_method(option) do
      override = @inline_options[option]
      override.nil? ? instance_variable_get(:"@#{option}") : override
    end

    attr_writer option
  end

  declare_option :unknown_tags
  declare_option :tag_border
  declare_option :mathml2asciimath
  declare_option :external_images
  declare_option :destination
  declare_option :sourcedir
  declare_option :image_counter
  declare_option :image_counter_pattern
  declare_option :input_format
  declare_option :split_sections
  declare_option :doc_width
  declare_option :plugins
  declare_option :track_time
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'coradoc'
|
|
4
|
+
|
|
5
|
+
module Coradoc
|
|
6
|
+
module Input
|
|
7
|
+
module Html
|
|
8
|
+
module Converters
|
|
9
|
+
# Converter for <a> elements.
class A < Base
  # Convert an <a> node into CoreModel inline element(s).
  #
  # Outcomes, checked in this order:
  # * id/name (whitespace removed, runs of underscores collapsed) matching
  #   _Toc<digits> or _GoBack => nil
  # * a non-empty id/name => an 'anchor' element targeting that id
  # * an href starting with "#" => an 'xref' element targeting the
  #   normalised fragment, with the stripped child text as content (nil
  #   when blank)
  # * an empty or missing href => nil
  # * anything else => a 'link' element, preceded by a one-space 'text'
  #   element when textnode_before_end_with? reports a match for the
  #   ambiguous-character class
  #
  # @param node the <a> node; attributes are read with node[...]
  # @param state [Hash] passed through to treat_children_coradoc
  # @return [Coradoc::CoreModel::InlineElement, Array, nil]
  def to_coradoc(node, state = {})
    # Use treat_children_coradoc to get CoreModel elements
    children = treat_children_coradoc(node, state)

    href = node['href']
    title = extract_title(node)

    anchor_id = (node['id'] || node['name'])&.gsub(/\s/, '')&.gsub(/__+/, '_')
    anchor_id = nil if anchor_id&.empty?

    return nil if /^_Toc\d+$|^_GoBack$/.match?(anchor_id)

    if anchor_id
      # Inline anchor
      Coradoc::CoreModel::InlineElement.new(format_type: 'anchor', target: anchor_id)
    elsif href.to_s.start_with?('#')
      # Cross-reference
      fragment = href.sub(/^#/, '').gsub(/\s/, '').gsub(/__+/, '_')
      label =
        if children.is_a?(Array)
          children.map do |child|
            child.is_a?(Coradoc::CoreModel::Base) ? child.content : child.to_s
          end.join
        else
          children.to_s
        end
      label = label.strip

      Coradoc::CoreModel::InlineElement.new(
        format_type: 'xref',
        target: fragment,
        content: label.empty? ? nil : label
      )
    elsif href.to_s.empty?
      nil
    else
      # Plain link
      boundary_characters = /[\w.?&#=%;\[\u{ff}-\u{10ffff}]/
      right_constrain = textnode_after_start_with?(node, boundary_characters)

      label =
        if children.is_a?(Array)
          children.map do |child|
            if child.is_a?(Coradoc::CoreModel::Base) && child.content
              child.content
            else
              child.to_s
            end
          end.join
        else
          children.to_s
        end

      elements = []
      if textnode_before_end_with?(node, boundary_characters)
        # Leading space separates the link from the preceding text
        elements << Coradoc::CoreModel::InlineElement.new(format_type: 'text', content: ' ')
      end

      metadata = {
        title: title.to_s.strip.empty? ? nil : title.strip,
        right_constrain: right_constrain
      }.compact

      elements << Coradoc::CoreModel::InlineElement.new(
        format_type: 'link',
        target: href,
        content: label.strip,
        metadata: metadata
      )

      # Return single element or array
      elements.length == 1 ? elements.first : elements
    end
  end
end
|
|
85
|
+
|
|
86
|
+
register :a, A.new
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
end
|