coradoc-html 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/lib/coradoc/html/base.rb +157 -0
- data/lib/coradoc/html/config.rb +467 -0
- data/lib/coradoc/html/converter_base.rb +177 -0
- data/lib/coradoc/html/converters/admonition.rb +180 -0
- data/lib/coradoc/html/converters/attribute.rb +68 -0
- data/lib/coradoc/html/converters/attribute_reference.rb +60 -0
- data/lib/coradoc/html/converters/audio.rb +165 -0
- data/lib/coradoc/html/converters/base.rb +615 -0
- data/lib/coradoc/html/converters/bibliography.rb +82 -0
- data/lib/coradoc/html/converters/bibliography_entry.rb +108 -0
- data/lib/coradoc/html/converters/block_image.rb +72 -0
- data/lib/coradoc/html/converters/bold.rb +34 -0
- data/lib/coradoc/html/converters/break.rb +32 -0
- data/lib/coradoc/html/converters/comment_block.rb +42 -0
- data/lib/coradoc/html/converters/comment_line.rb +54 -0
- data/lib/coradoc/html/converters/cross_reference.rb +59 -0
- data/lib/coradoc/html/converters/document.rb +108 -0
- data/lib/coradoc/html/converters/example.rb +114 -0
- data/lib/coradoc/html/converters/highlight.rb +34 -0
- data/lib/coradoc/html/converters/include.rb +68 -0
- data/lib/coradoc/html/converters/inline_image.rb +41 -0
- data/lib/coradoc/html/converters/italic.rb +34 -0
- data/lib/coradoc/html/converters/line_break.rb +31 -0
- data/lib/coradoc/html/converters/link.rb +46 -0
- data/lib/coradoc/html/converters/list_item.rb +75 -0
- data/lib/coradoc/html/converters/listing.rb +99 -0
- data/lib/coradoc/html/converters/literal.rb +102 -0
- data/lib/coradoc/html/converters/monospace.rb +34 -0
- data/lib/coradoc/html/converters/open.rb +78 -0
- data/lib/coradoc/html/converters/ordered.rb +53 -0
- data/lib/coradoc/html/converters/paragraph.rb +46 -0
- data/lib/coradoc/html/converters/quote.rb +113 -0
- data/lib/coradoc/html/converters/reviewer_comment.rb +74 -0
- data/lib/coradoc/html/converters/reviewer_note.rb +134 -0
- data/lib/coradoc/html/converters/section.rb +90 -0
- data/lib/coradoc/html/converters/sidebar.rb +113 -0
- data/lib/coradoc/html/converters/source.rb +137 -0
- data/lib/coradoc/html/converters/source_code.rb +16 -0
- data/lib/coradoc/html/converters/span.rb +61 -0
- data/lib/coradoc/html/converters/strikethrough.rb +34 -0
- data/lib/coradoc/html/converters/subscript.rb +34 -0
- data/lib/coradoc/html/converters/superscript.rb +34 -0
- data/lib/coradoc/html/converters/table.rb +85 -0
- data/lib/coradoc/html/converters/table_cell.rb +203 -0
- data/lib/coradoc/html/converters/table_row.rb +45 -0
- data/lib/coradoc/html/converters/template_html_converter.rb +105 -0
- data/lib/coradoc/html/converters/term.rb +58 -0
- data/lib/coradoc/html/converters/text_element.rb +44 -0
- data/lib/coradoc/html/converters/underline.rb +34 -0
- data/lib/coradoc/html/converters/unordered.rb +47 -0
- data/lib/coradoc/html/converters/verse.rb +105 -0
- data/lib/coradoc/html/converters/video.rb +179 -0
- data/lib/coradoc/html/element_mapping.rb +210 -0
- data/lib/coradoc/html/entity.rb +137 -0
- data/lib/coradoc/html/input/cleaner.rb +163 -0
- data/lib/coradoc/html/input/config.rb +79 -0
- data/lib/coradoc/html/input/converters/a.rb +90 -0
- data/lib/coradoc/html/input/converters/aside.rb +23 -0
- data/lib/coradoc/html/input/converters/audio.rb +50 -0
- data/lib/coradoc/html/input/converters/base.rb +116 -0
- data/lib/coradoc/html/input/converters/blockquote.rb +25 -0
- data/lib/coradoc/html/input/converters/br.rb +19 -0
- data/lib/coradoc/html/input/converters/bypass.rb +83 -0
- data/lib/coradoc/html/input/converters/code.rb +25 -0
- data/lib/coradoc/html/input/converters/div.rb +25 -0
- data/lib/coradoc/html/input/converters/dl.rb +106 -0
- data/lib/coradoc/html/input/converters/drop.rb +28 -0
- data/lib/coradoc/html/input/converters/em.rb +23 -0
- data/lib/coradoc/html/input/converters/figure.rb +58 -0
- data/lib/coradoc/html/input/converters/h.rb +76 -0
- data/lib/coradoc/html/input/converters/head.rb +30 -0
- data/lib/coradoc/html/input/converters/hr.rb +20 -0
- data/lib/coradoc/html/input/converters/ignore.rb +22 -0
- data/lib/coradoc/html/input/converters/img.rb +110 -0
- data/lib/coradoc/html/input/converters/li.rb +35 -0
- data/lib/coradoc/html/input/converters/mark.rb +21 -0
- data/lib/coradoc/html/input/converters/markup.rb +107 -0
- data/lib/coradoc/html/input/converters/math.rb +46 -0
- data/lib/coradoc/html/input/converters/ol.rb +46 -0
- data/lib/coradoc/html/input/converters/p.rb +81 -0
- data/lib/coradoc/html/input/converters/pass_through.rb +19 -0
- data/lib/coradoc/html/input/converters/pre.rb +59 -0
- data/lib/coradoc/html/input/converters/q.rb +24 -0
- data/lib/coradoc/html/input/converters/strong.rb +22 -0
- data/lib/coradoc/html/input/converters/sub.rb +40 -0
- data/lib/coradoc/html/input/converters/sup.rb +40 -0
- data/lib/coradoc/html/input/converters/table.rb +64 -0
- data/lib/coradoc/html/input/converters/td.rb +70 -0
- data/lib/coradoc/html/input/converters/text.rb +67 -0
- data/lib/coradoc/html/input/converters/th.rb +20 -0
- data/lib/coradoc/html/input/converters/tr.rb +28 -0
- data/lib/coradoc/html/input/converters/video.rb +53 -0
- data/lib/coradoc/html/input/converters.rb +122 -0
- data/lib/coradoc/html/input/errors.rb +22 -0
- data/lib/coradoc/html/input/html_converter.rb +170 -0
- data/lib/coradoc/html/input/plugin.rb +169 -0
- data/lib/coradoc/html/input/plugins/plateau.rb +229 -0
- data/lib/coradoc/html/input/postprocessor.rb +31 -0
- data/lib/coradoc/html/input.rb +68 -0
- data/lib/coradoc/html/output.rb +95 -0
- data/lib/coradoc/html/renderer.rb +409 -0
- data/lib/coradoc/html/spa.rb +309 -0
- data/lib/coradoc/html/static.rb +293 -0
- data/lib/coradoc/html/template_config.rb +151 -0
- data/lib/coradoc/html/template_helpers.rb +58 -0
- data/lib/coradoc/html/template_locator.rb +114 -0
- data/lib/coradoc/html/theme/base.rb +231 -0
- data/lib/coradoc/html/theme/classic_renderer.rb +390 -0
- data/lib/coradoc/html/theme/modern/components/ui_components.rb +344 -0
- data/lib/coradoc/html/theme/modern/css_generator.rb +311 -0
- data/lib/coradoc/html/theme/modern/javascript_generator.rb +314 -0
- data/lib/coradoc/html/theme/modern/serializers/document_serializer.rb +382 -0
- data/lib/coradoc/html/theme/modern/tailwind_config_builder.rb +164 -0
- data/lib/coradoc/html/theme/modern/vue_template_generator.rb +374 -0
- data/lib/coradoc/html/theme/modern_renderer.rb +250 -0
- data/lib/coradoc/html/theme/registry.rb +153 -0
- data/lib/coradoc/html/theme.rb +13 -0
- data/lib/coradoc/html/transform/from_core_model.rb +32 -0
- data/lib/coradoc/html/transform/to_core_model.rb +39 -0
- data/lib/coradoc/html/version.rb +7 -0
- data/lib/coradoc/html.rb +255 -0
- metadata +264 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Input
|
|
5
|
+
module Html
|
|
6
|
+
module Converters
|
|
7
|
+
class Sup < Base
|
|
8
|
+
# Convert a +<sup>+ node into a superscript inline element.
#
# @param node [Object] HTML node to convert
# @param state [Hash] conversion state forwarded to the child converters
# @return [nil, Object, Array] nil when the converted children are empty;
#   otherwise the inline element alone, or an array holding it together
#   with whichever surrounding whitespace values are non-nil
def to_coradoc(node, state = {})
  leading, trailing = extract_leading_trailing_whitespace(node)
  children = treat_children_coradoc(node, state)

  # Nothing to wrap: skip the element entirely.
  return nil if content_empty?(children)

  superscript = Coradoc::CoreModel::InlineElement.new(
    format_type: 'superscript',
    content: children
  )

  parts = [leading, superscript, trailing].compact
  return parts.first if parts.length == 1

  parts
end
|
|
24
|
+
|
|
25
|
+
private
|
|
26
|
+
|
|
27
|
+
# Tell whether converted content carries nothing worth emitting.
#
# @param content [nil, String, Array, Object] converted children
# @return [Boolean] true for nil, whitespace-only strings and empty arrays;
#   false for anything else
def content_empty?(content)
  case content
  when nil then true
  when String then content.strip.empty?
  when Array then content.empty?
  else false
  end
end
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
register :sup, Sup.new
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Input
|
|
5
|
+
module Html
|
|
6
|
+
module Converters
|
|
7
|
+
class Table < Base
|
|
8
|
+
# Convert a +<table>+ node into a CoreModel table.
#
# @param node [Object] HTML table node
# @param state [Hash] conversion state forwarded to the child converters
# @return [Coradoc::CoreModel::Table] table carrying the caption text as title,
#   the converted children as rows, the node id, and the mapped frame/grid values
def to_coradoc(node, state = {})
  caption = extract_title(node)
  rows = treat_children_coradoc(node, state)

  Coradoc::CoreModel::Table.new(
    title: caption,
    rows: rows,
    id: node['id'],
    frame: frame(node),  # HTML `frame` attribute, translated
    grid: rules(node)    # HTML `rules` attribute, filtered
  )
end
|
|
25
|
+
|
|
26
|
+
# Read the table caption, if there is one.
#
# @param node [Object] HTML table node responding to +at+
# @return [String, nil] stripped text of the direct +caption+ child, or nil
#   when no such child exists
def extract_title(node)
  caption = node.at('./caption')
  caption.text.strip unless caption.nil?
end
|
|
32
|
+
|
|
33
|
+
# Translate the HTML +frame+ attribute into the frame value stored on the table.
#
# @param node [Object] HTML table node (anything answering +['frame']+)
# @return [String, nil] 'none', 'topbot', 'sides' or 'all'; nil when the
#   attribute is absent or holds any other value
def frame(node)
  translation = {
    'void' => 'none',
    'hsides' => 'topbot',
    'vsides' => 'sides',
    'box' => 'all',
    'border' => 'all'
  }
  translation[node['frame']]
end
|
|
45
|
+
|
|
46
|
+
# Filter the HTML +rules+ attribute down to the grid values that are passed on.
#
# @param node [Object] HTML table node (anything answering +['rules']+)
# @return [String, nil] 'all', 'rows', 'cols' or 'none'; nil when the
#   attribute is absent or holds any other value
def rules(node)
  value = node['rules']
  value if %w[all rows cols none].include?(value)
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
register :table, Table.new
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Input
|
|
5
|
+
module Html
|
|
6
|
+
module Converters
|
|
7
|
+
class Td < Base
|
|
8
|
+
# Convert a +<td>+ / +<th>+ node into a CoreModel table cell.
#
# @param node [Object] HTML cell node
# @param state [Hash] conversion state; it is NOT modified by this method
# @return [Coradoc::CoreModel::TableCell] cell with flattened text content,
#   horizontal alignment, spans (only when greater than 1) and a header flag
#   that is true when the node is named 'th'
def to_coradoc(node, state = {})
  colspan = node['colspan']&.to_i
  rowspan = node['rowspan']&.to_i
  alignment = extract_alignment(node)

  singlepara = node.elements.size == 1 && node.elements.first.name == 'p'

  # Hand the flag to this cell's descendants through a copy of the state.
  # Writing it into the shared hash would leave it set for every node
  # converted after this cell (siblings, later rows, the rest of the document).
  child_state = singlepara ? state.merge(tdsinglepara: true) : state

  content = treat_children_coradoc(node, child_state)

  Coradoc::CoreModel::TableCell.new(
    content: extract_text_from_content(content),
    alignment: alignment,
    colspan: colspan && colspan > 1 ? colspan : nil,
    rowspan: rowspan && rowspan > 1 ? rowspan : nil,
    header: node.name == 'th'
  )
end
|
|
28
|
+
|
|
29
|
+
# Read the horizontal alignment of a cell from its +align+ attribute.
#
# The value is matched case-insensitively and with surrounding whitespace
# ignored. Vertical alignment (+valign+) is not taken into account.
#
# @param node [Object] HTML cell node (anything answering +['align']+)
# @return [String, nil] 'left', 'center' or 'right'; nil for a missing or
#   unsupported value
def extract_alignment(node)
  align = node['align'].to_s.strip.downcase
  align if %w[left center right].include?(align)
end
|
|
40
|
+
|
|
41
|
+
# Flatten converted cell content into one plain-text string.
#
# Strings are used as they are, nested arrays are flattened recursively, and
# CoreModel objects contribute the text of their +content+ (also extracted
# recursively, so array-valued content is joined instead of being inspected).
# Anything else is converted with +to_s+.
#
# @param content [String, Array, Object, nil] converted children of the cell
# @return [String] concatenated text; '' when content is nil
def extract_text_from_content(content)
  return content if content.is_a?(String)
  return '' if content.nil?

  # A lone element (not wrapped in an array) is handled like a one-item list.
  items = content.is_a?(Array) ? content : [content]

  items.map do |item|
    case item
    when String
      item
    when Array
      extract_text_from_content(item)
    when Coradoc::CoreModel::InlineElement, Coradoc::CoreModel::Base
      extract_text_from_content(item.content)
    else
      item.to_s
    end
  end.join
end
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
register :td, Td.new
|
|
66
|
+
register :th, Td.new
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Input
|
|
5
|
+
module Html
|
|
6
|
+
module Converters
|
|
7
|
+
class Text < Base
|
|
8
|
+
# Convert an HTML text node into a plain-text inline element.
#
# @param node [Object] HTML text node
# @param state [Hash] conversion state, consulted for whitespace-only nodes
# @return [Coradoc::CoreModel::InlineElement, String, nil] an element with
#   format_type 'text' holding the cleaned-up text; for whitespace-only
#   nodes, whatever +treat_empty+ decides (' ' or nil)
def to_coradoc(node, state = {})
  raw = node.text
  return treat_empty(node, state) if raw.strip.empty?

  # HTML cleanup is performed in the converter layer
  Coradoc::CoreModel::InlineElement.new(
    format_type: 'text',
    content: cleanup_html_text(raw)
  )
end
|
|
20
|
+
|
|
21
|
+
private
|
|
22
|
+
|
|
23
|
+
# Decide what a whitespace-only text node turns into.
#
# @param node [Object] text node answering +parent+ and +text+
# @param state [Hash] conversion state; +:tdsinglepara+ suppresses the node
# @return [String, nil] ' ' when the node is exactly one space and is neither
#   a direct child of a list nor inside a single-paragraph cell; nil otherwise
def treat_empty(node, state)
  # Whitespace directly inside a list would break the indentation.
  return nil if %i[ol ul].include?(node.parent.name.to_sym)
  return nil if state[:tdsinglepara]

  # Only a regular single-space text node survives.
  node.text == ' ' ? ' ' : nil
end
|
|
35
|
+
|
|
36
|
+
# Run the HTML-to-CoreModel text cleanup steps in order: non-breaking space
# handling, border newline removal, inner whitespace normalisation and
# finally escaping of +<<...>>+ sequences.
#
# @param text [String] raw text of an HTML text node
# @return [String] cleaned text
def cleanup_html_text(text)
  without_nbsp = preserve_nbsp(text)
  trimmed = remove_border_newlines(without_nbsp)
  escape_links(remove_inner_newlines(trimmed))
end
|
|
43
|
+
|
|
44
|
+
# Substitute every U+00A0 (non-breaking space) character in the text.
#
# NOTE(review): the method name suggests the non-breaking space should
# survive, yet the replacement as it appears here is a plain space —
# confirm the intended replacement literal against the released file.
#
# @param text [String] text to process
# @return [String] copy of the text with each U+00A0 replaced
def preserve_nbsp(text)
  nbsp = "\u00A0"
  text.gsub(nbsp, ' ')
end
|
|
47
|
+
|
|
48
|
+
# Prefix +<<...>>+ sequences with a backslash.
#
# A sequence is only matched when the character right after +<<+ is not a
# space.
#
# @param text [String] text to process
# @return [String] copy of the text with each matched sequence escaped
def escape_links(text)
  double_angle = /<<([^ ][^>]*)>>/
  text.gsub(double_angle, '\\<<\\1>>')
end
|
|
51
|
+
|
|
52
|
+
# Drop the newline runs at the very start and very end of the text.
#
# @param text [String] text to process
# @return [String] copy of the text without leading/trailing newlines;
#   other whitespace at the borders is kept
def remove_border_newlines(text)
  # Each pattern is anchored, so it can match at most once.
  without_leading = text.sub(/\A\n+/, '')
  without_leading.sub(/\n+\z/, '')
end
|
|
55
|
+
|
|
56
|
+
# Turn newlines and tabs into spaces, then collapse runs of spaces into one.
# A single leading or trailing space is left in place for inline contexts.
#
# @param text [String] text to process
# @return [String] normalised copy of the text
def remove_inner_newlines(text)
  spaced = text.gsub(/[\n\t]/, ' ')
  spaced.squeeze(' ')
end
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
register :text, Text.new
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Input
|
|
5
|
+
module Html
|
|
6
|
+
module Converters
|
|
7
|
+
class Th < Td
|
|
8
|
+
# Style marker for a +<th>+ cell.
#
# @param node [Object] cell node answering +parent+
# @return [String] '' when the parent row has no preceding element (that row
#   is the header row), 'h' otherwise
def cellstyle(node)
  node.parent.previous_element.nil? ? '' : 'h'
end
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
register :th, Th.new
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Input
|
|
5
|
+
module Html
|
|
6
|
+
module Converters
|
|
7
|
+
class Tr < Base
|
|
8
|
+
# Convert a +<tr>+ node into a CoreModel table row.
#
# @param node [Object] HTML row node
# @param state [Hash] conversion state forwarded to the child converters
# @return [Coradoc::CoreModel::TableRow] row whose cells (not columns) are the
#   converted children, flagged as header when it is the first row
def to_coradoc(node, state = {})
  cells = treat_children_coradoc(node, state)

  Coradoc::CoreModel::TableRow.new(
    cells: cells,
    header: table_header_row?(node)
  )
end
|
|
17
|
+
|
|
18
|
+
# Tell whether a row counts as the header row, i.e. nothing precedes it
# within its parent.
#
# NOTE(review): the first row of any row group has no preceding element
# either, so it is reported as a header too — confirm that is intended.
#
# @param node [Object] row node answering +previous_element+
# @return [Boolean] true when the row has no preceding sibling element
def table_header_row?(node)
  # Alternative kept from the original:
  # node.element_children.all? {|child| child.name.to_sym == :th}
  preceding = node.previous_element
  preceding.nil?
end
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
register :tr, Tr.new
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Input
|
|
5
|
+
module Html
|
|
6
|
+
module Converters
|
|
7
|
+
class Video < Base
|
|
8
|
+
# Convert a +<video>+ node into a CoreModel block.
#
# CoreModel has no dedicated video type, so a Block is used and the
# video-specific settings travel in +element_attributes+.
#
# @param node [Object] HTML video node
# @param _state [Hash] conversion state (unused)
# @return [Coradoc::CoreModel::Block] block of element_type 'video' whose
#   content is the media URL
def to_coradoc(node, _state = {})
  # The URL may sit on the element itself or on a nested <source> child.
  src = node['src'] || node.at('.//source')&.[]('src')

  playback_attributes = {
    autoplay: node['autoplay'],
    loop: node['loop'],
    controls: node['controls'],
    poster: node['poster']
  }.compact # drop attributes the element does not carry

  Coradoc::CoreModel::Block.new(
    element_type: 'video',
    block_semantic_type: :video,
    content: src,
    title: extract_title(node),
    id: node['id'],
    width: node['width'],
    height: node['height'],
    element_attributes: playback_attributes
  )
end
|
|
33
|
+
|
|
34
|
+
# Derive a title for the video from a nested element.
#
# The direct +track+ child is preferred; otherwise the first +source+
# descendant is used. The title is that element's +label+, falling back to
# its +srclang+.
#
# @param node [Object] HTML video node answering +at+
# @return [String] the title, or '' when no element or attribute is found
def extract_title(node)
  candidate = node.at('./track') || node.at('.//source')
  return '' if candidate.nil?

  %w[label srclang].each do |attribute|
    value = candidate[attribute]
    return value if value
  end
  ''
end
|
|
40
|
+
|
|
41
|
+
# Collect the values of the playback attributes present on the node.
#
# @param node [Object] HTML video node (anything answering +[]+)
# @return [Array] values of +autoplay+, +loop+ and +controls+, in that order,
#   with missing (nil) attributes left out
def options(node)
  %w[autoplay loop controls].map { |attribute| node[attribute] }.compact
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
register :video, Video.new
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Input
|
|
5
|
+
module Html
|
|
6
|
+
module Converters
|
|
7
|
+
# Autoload converter classes - they will register themselves when first accessed
|
|
8
|
+
autoload :Base, 'coradoc/html/input/converters/base'
|
|
9
|
+
autoload :Markup, 'coradoc/html/input/converters/markup'
|
|
10
|
+
autoload :A, 'coradoc/html/input/converters/a'
|
|
11
|
+
autoload :Aside, 'coradoc/html/input/converters/aside'
|
|
12
|
+
autoload :Audio, 'coradoc/html/input/converters/audio'
|
|
13
|
+
autoload :Blockquote, 'coradoc/html/input/converters/blockquote'
|
|
14
|
+
autoload :Br, 'coradoc/html/input/converters/br'
|
|
15
|
+
autoload :Bypass, 'coradoc/html/input/converters/bypass'
|
|
16
|
+
autoload :Code, 'coradoc/html/input/converters/code'
|
|
17
|
+
autoload :Div, 'coradoc/html/input/converters/div'
|
|
18
|
+
autoload :Dl, 'coradoc/html/input/converters/dl'
|
|
19
|
+
autoload :Drop, 'coradoc/html/input/converters/drop'
|
|
20
|
+
autoload :Em, 'coradoc/html/input/converters/em'
|
|
21
|
+
autoload :Figure, 'coradoc/html/input/converters/figure'
|
|
22
|
+
autoload :H, 'coradoc/html/input/converters/h'
|
|
23
|
+
autoload :Head, 'coradoc/html/input/converters/head'
|
|
24
|
+
autoload :Hr, 'coradoc/html/input/converters/hr'
|
|
25
|
+
autoload :Ignore, 'coradoc/html/input/converters/ignore'
|
|
26
|
+
autoload :Img, 'coradoc/html/input/converters/img'
|
|
27
|
+
autoload :Li, 'coradoc/html/input/converters/li'
|
|
28
|
+
autoload :Mark, 'coradoc/html/input/converters/mark'
|
|
29
|
+
autoload :Ol, 'coradoc/html/input/converters/ol'
|
|
30
|
+
autoload :P, 'coradoc/html/input/converters/p'
|
|
31
|
+
autoload :PassThrough, 'coradoc/html/input/converters/pass_through'
|
|
32
|
+
autoload :Pre, 'coradoc/html/input/converters/pre'
|
|
33
|
+
autoload :Q, 'coradoc/html/input/converters/q'
|
|
34
|
+
autoload :Strong, 'coradoc/html/input/converters/strong'
|
|
35
|
+
autoload :Sup, 'coradoc/html/input/converters/sup'
|
|
36
|
+
autoload :Sub, 'coradoc/html/input/converters/sub'
|
|
37
|
+
autoload :Table, 'coradoc/html/input/converters/table'
|
|
38
|
+
autoload :Td, 'coradoc/html/input/converters/td'
|
|
39
|
+
autoload :Text, 'coradoc/html/input/converters/text'
|
|
40
|
+
autoload :Th, 'coradoc/html/input/converters/th'
|
|
41
|
+
autoload :Tr, 'coradoc/html/input/converters/tr'
|
|
42
|
+
autoload :Video, 'coradoc/html/input/converters/video'
|
|
43
|
+
autoload :Math, 'coradoc/html/input/converters/math'
|
|
44
|
+
|
|
45
|
+
# Define class methods
|
|
46
|
+
# Register the converter responsible for an HTML tag, replacing any
# converter registered for that tag before.
#
# @param tag_name [String, Symbol] tag name; stored as a symbol
# @param converter [Object, Class] converter instance or class
# @return [Object, Class] the converter that was registered
def self.register(tag_name, converter)
  # The registry is created on first use.
  (@@converters ||= {})[tag_name.to_sym] = converter
end
|
|
50
|
+
|
|
51
|
+
# Remove the converter registered for an HTML tag.
#
# Safe to call before anything has been registered.
#
# @param tag_name [String, Symbol] tag name
# @return [Object, Class, nil] the removed converter, or nil when none was
#   registered for the tag
def self.unregister(tag_name)
  # Reading an uninitialized class variable raises NameError, so make sure
  # the registry exists, exactly as `register` does.
  @@converters ||= {}
  @@converters.delete(tag_name.to_sym)
end
|
|
54
|
+
|
|
55
|
+
# Make sure the converter files have been loaded (and have thereby
# registered their tag handlers) before the first lookup.
#
# Runs its work only once; later calls return immediately.
def self.ensure_converters_loaded
  return if @converters_loaded

  # The flag is raised before loading starts.
  @converters_loaded = true

  # Referencing an autoloaded constant is what triggers the file load and
  # with it the registration. Only converters that register HTML tag
  # handlers are listed. Some converters may have gem dependencies
  # (e.g., Img requires marcel), so only the essential ones are loaded here.
  # NOTE(review): Audio, Img, Math and Video are autoloadable but not
  # referenced here — confirm they get loaded from somewhere else.
  eager_converters = [
    Base, Markup, A, Aside, Blockquote, Br, Bypass, Code, Div, Dl,
    Drop, Em, Figure, H, Head, Hr, Ignore, Li, Mark, Ol, P,
    PassThrough, Pre, Q, Strong, Sup, Sub, Table, Td, Text, Th, Tr
  ]
  eager_converters.each { |_converter| }
end
|
|
73
|
+
|
|
74
|
+
# Find the converter for an HTML tag.
#
# @param tag_name [String, Symbol] tag name
# @return [Object] the registered converter — instantiated first when a class
#   was registered — or the default converter for unknown tags
def self.lookup(tag_name)
  ensure_converters_loaded

  found = @@converters[tag_name.to_sym] || default_converter(tag_name)
  return found.new if found.is_a?(Class)

  found
end
|
|
79
|
+
|
|
80
|
+
# Convert a node, or a collection of nodes, through the converters'
# +convert+ method.
#
# NOTE: process won't run plugin hooks
#
# @param node [Nokogiri::XML::NodeSet, Array, Object] node(s) to convert
# @param state [Hash] conversion state
# @return [Object, String] the converter's result for a single node; for a
#   collection, the results of all members joined into one string
def self.process(node, state)
  node = node.to_a if node.is_a?(Nokogiri::XML::NodeSet)
  return node.map { |child| process(child, state) }.join if node.is_a?(Array)

  lookup(node.name).convert(node, state)
end
|
|
90
|
+
|
|
91
|
+
# Convert a node, or a collection of nodes, through the converters'
# +to_coradoc+ method, letting every plugin in +state[:plugin_instances]+
# wrap the conversion with its hooks.
#
# @param node [Nokogiri::XML::NodeSet, Array, Object] node(s) to convert
# @param state [Hash] conversion state; +:plugin_instances+ lists the plugins
# @return [Object, Array] the conversion result for a single node, or an
#   array of results for a collection
def self.process_coradoc(node, state)
  node = node.to_a if node.is_a?(Nokogiri::XML::NodeSet)
  return node.map { |child| process_coradoc(child, state) } if node.is_a?(Array)

  plain_conversion = proc { lookup(node.name).to_coradoc(node, state) }

  # Each plugin receives the chain built so far as its block, so the plugin
  # listed last ends up outermost.
  chain = (state[:plugin_instances] || []).reduce(plain_conversion) do |inner, plugin|
    proc { plugin.html_tree_run_hooks(node, state, &inner) }
  end

  chain.call
end
|
|
103
|
+
|
|
104
|
+
# Build the converter used for a tag without a registered converter, as
# selected by +Html.config.unknown_tags+.
#
# @param tag_name [String, Symbol] the unknown tag
# @return [PassThrough, Drop, Bypass] a new converter for the configured policy
# @raise [Errors::UnknownTagError] when the policy is +:raise+
# @raise [Errors::InvalidConfigurationError] when the policy is none of
#   +:pass_through+, +:drop+, +:bypass+ or +:raise+
def self.default_converter(tag_name)
  policy = Html.config.unknown_tags

  case policy.to_sym
  when :pass_through then PassThrough.new
  when :drop then Drop.new
  when :bypass then Bypass.new
  when :raise
    raise Errors::UnknownTagError, "unknown tag: #{tag_name}"
  else
    raise Errors::InvalidConfigurationError,
          "unknown value #{policy.inspect} for Coradoc::Input::Html.config.unknown_tags"
  end
end
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Input
|
|
5
|
+
module Html
|
|
6
|
+
module Errors
  # Root of the HTML input error hierarchy. Subclasses Coradoc::Error so the
  # errors can be handled together with the other Coradoc errors.
  class Error < Coradoc::Error; end

  # Raised when an unknown HTML tag is encountered.
  class UnknownTagError < Error; end

  # Raised when the HTML input configuration is invalid.
  class InvalidConfigurationError < Error; end
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Input
|
|
5
|
+
module Html
|
|
6
|
+
# HTML to CoreModel converter
|
|
7
|
+
#
|
|
8
|
+
# This class handles the conversion of HTML documents to CoreModel.
|
|
9
|
+
# It does NOT handle serialization to any specific output format.
|
|
10
|
+
# For serialization, use Coradoc.serialize(coremodel, to: :format)
|
|
11
|
+
#
|
|
12
|
+
# @example Basic usage - get CoreModel
|
|
13
|
+
# coremodel = HtmlConverter.to_core_model(html_string)
|
|
14
|
+
#
|
|
15
|
+
# @example Serialize to AsciiDoc
|
|
16
|
+
# coremodel = HtmlConverter.to_core_model(html_string)
|
|
17
|
+
# adoc_text = Coradoc.serialize(coremodel, to: :asciidoc)
|
|
18
|
+
#
|
|
19
|
+
class HtmlConverter
|
|
20
|
+
# Convert HTML to CoreModel.
#
# Steps: load the input into a node tree, let each plugin preprocess the
# tree, convert the tree, post-process the result, then let each plugin
# post-process it as well. The plugin instances that were used are written
# back into +options[:plugin_instances]+ (unless the hash is frozen).
#
# @param input [String, Nokogiri::XML::Document, Nokogiri::XML::Node] HTML input
# @param options [Hash] Conversion options, applied to the configuration for
#   the duration of the call
# @return [Coradoc::CoreModel::Base, nil] CoreModel document; nil when no
#   root node could be obtained from the input (including unsupported input types)
def self.to_core_model(input, options = {})
  Input::Html.config.with(options) do
    plugin_instances = prepare_plugin_instances(options)

    root = track_time 'Loading input HTML document' do
      case input
      when String then Nokogiri::HTML(input).root
      when Nokogiri::XML::Document then input.root
      when Nokogiri::XML::Node then input
      end
    end

    return nil unless root

    plugin_instances.each do |plugin|
      plugin.html_tree = root
      # The hook is optional; the tree is read back either way.
      if plugin.respond_to?(:preprocess_html_tree)
        track_time "Preprocessing document with #{plugin.name} plugin" do
          plugin.preprocess_html_tree
        end
      end
      root = plugin.html_tree
    end

    coremodel = track_time 'Converting input document tree to CoreModel' do
      Converters.process_coradoc(root, plugin_instances: plugin_instances)
    end

    coremodel = track_time 'Post-process CoreModel tree' do
      Postprocessor.process(coremodel)
    end

    plugin_instances.each do |plugin|
      next unless plugin.respond_to?(:postprocess_coremodel_tree)

      plugin.coremodel_tree = coremodel
      track_time "Postprocessing CoreModel tree with #{plugin.name} plugin" do
        plugin.postprocess_coremodel_tree
      end
      coremodel = plugin.coremodel_tree
    end

    # Expose the instances so a later step can reuse them.
    options[:plugin_instances] = plugin_instances unless options.frozen?

    coremodel
  end
end
|
|
78
|
+
|
|
79
|
+
# Legacy alias that returns the CoreModel document.
#
# @deprecated Use {.to_core_model} instead
# @param input [String, Nokogiri::XML::Document, Nokogiri::XML::Node] HTML input
# @param options [Hash] Conversion options
# @return [Object] whatever {.to_core_model} returns for the same arguments
def self.to_coradoc(input, options = {})
  to_core_model(input, options)
end
|
|
84
|
+
|
|
85
|
+
# Legacy method for backward compatibility.
# Converts HTML to CoreModel, then serializes it to the target format.
#
# The +:output_format+ entry is read from +options+ without being removed,
# so the same (possibly frozen) hash can be passed to several calls.
#
# @deprecated Use {.to_core_model} + Coradoc.serialize instead
# @param input [String] HTML input
# @param options [Hash] Conversion options
# @option options [Symbol] :output_format Target format (default: :asciidoc)
# @return [String, Hash] Serialized document in the target format; when the
#   conversion yields a Hash, a Hash with every value serialized
def self.convert(input, options = {})
  output_format = options[:output_format] || :asciidoc

  # Work on a copy without :output_format, leaving the caller's hash untouched.
  conversion_options = options.reject { |key, _value| key == :output_format }

  coremodel = to_core_model(input, conversion_options)

  # Keep exposing the plugin instances on the caller's hash, as before.
  if conversion_options.key?(:plugin_instances) && !options.frozen?
    options[:plugin_instances] = conversion_options[:plugin_instances]
  end

  if coremodel.is_a?(Hash)
    coremodel.to_h do |file, tree|
      track_time "Serializing file #{file || 'main'}" do
        [file, serialize_core_model(tree, output_format, conversion_options)]
      end
    end
  else
    serialize_core_model(coremodel, output_format, conversion_options)
  end
end
|
|
108
|
+
|
|
109
|
+
# Serialize a CoreModel document with Coradoc.serialize and clean up the
# serialized text.
#
# @param coremodel [Coradoc::CoreModel::Base] CoreModel document
# @param format [Symbol] Target format
# @param options [Hash] Options handed to the cleanup step
# @return [String] Serialized, cleaned-up document
def self.serialize_core_model(coremodel, format, options = {})
  cleanup_result(Coradoc.serialize(coremodel, to: format), options)
end
|
|
119
|
+
|
|
120
|
+
# Clean up the serialized result: tidy it with the configured cleaner, then
# let every plugin that implements +postprocess_output_string+ rewrite it.
#
# @param result [String] Serialized result
# @param options [Hash] Cleanup options, applied to the configuration for the
#   duration of the call; +:plugin_instances+ is reused when present
# @return [String] Cleaned result
def self.cleanup_result(result, options = {})
  Input::Html.config.with(options) do
    plugin_instances = prepare_plugin_instances(options)

    output = track_time 'Cleaning up the result' do
      Input::Html.cleaner.tidy(result)
    end

    plugin_instances.each do |plugin|
      # The hook is optional.
      next unless plugin.respond_to?(:postprocess_output_string)

      plugin.output_string = output
      track_time "Postprocessing output string with #{plugin.name} plugin" do
        plugin.postprocess_output_string
      end
      output = plugin.output_string
    end

    output
  end
end
|
|
146
|
+
|
|
147
|
+
# Pick the plugin instances for a conversion.
#
# @param options [Hash] conversion options
# @return [Array] +options[:plugin_instances]+ when supplied; otherwise a new
#   instance of every plugin class listed in the configuration
def self.prepare_plugin_instances(options)
  supplied = options[:plugin_instances]
  return supplied if supplied

  Html.config.plugins.map(&:new)
end
|
|
150
|
+
|
|
151
|
+
# Current nesting depth of tracked tasks; used to indent the log lines.
@track_time_indentation = 0

# Run the block and, when time tracking is enabled in the configuration,
# report on stderr when the task starts and how long it took.
#
# Nested tasks are indented by one space per level. The nesting depth is
# restored even when the block raises or returns non-locally.
#
# @param task [String] human-readable task description
# @yield the work to measure
# @return [Object] the block's result
def self.track_time(task)
  return yield unless Input::Html.config.track_time

  warn "#{' ' * @track_time_indentation}* #{task} is starting..."
  @track_time_indentation += 1
  # A monotonic clock is unaffected by wall-clock adjustments.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  begin
    result = yield
  ensure
    @track_time_indentation -= 1
  end

  time_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  warn "#{' ' * @track_time_indentation}* #{task} took #{time_elapsed.round(3)} seconds"
  result
end
|
|
167
|
+
end
|
|
168
|
+
end
|
|
169
|
+
end
|
|
170
|
+
end
|