coradoc-html 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/lib/coradoc/html/base.rb +157 -0
  4. data/lib/coradoc/html/config.rb +467 -0
  5. data/lib/coradoc/html/converter_base.rb +177 -0
  6. data/lib/coradoc/html/converters/admonition.rb +180 -0
  7. data/lib/coradoc/html/converters/attribute.rb +68 -0
  8. data/lib/coradoc/html/converters/attribute_reference.rb +60 -0
  9. data/lib/coradoc/html/converters/audio.rb +165 -0
  10. data/lib/coradoc/html/converters/base.rb +615 -0
  11. data/lib/coradoc/html/converters/bibliography.rb +82 -0
  12. data/lib/coradoc/html/converters/bibliography_entry.rb +108 -0
  13. data/lib/coradoc/html/converters/block_image.rb +72 -0
  14. data/lib/coradoc/html/converters/bold.rb +34 -0
  15. data/lib/coradoc/html/converters/break.rb +32 -0
  16. data/lib/coradoc/html/converters/comment_block.rb +42 -0
  17. data/lib/coradoc/html/converters/comment_line.rb +54 -0
  18. data/lib/coradoc/html/converters/cross_reference.rb +59 -0
  19. data/lib/coradoc/html/converters/document.rb +108 -0
  20. data/lib/coradoc/html/converters/example.rb +114 -0
  21. data/lib/coradoc/html/converters/highlight.rb +34 -0
  22. data/lib/coradoc/html/converters/include.rb +68 -0
  23. data/lib/coradoc/html/converters/inline_image.rb +41 -0
  24. data/lib/coradoc/html/converters/italic.rb +34 -0
  25. data/lib/coradoc/html/converters/line_break.rb +31 -0
  26. data/lib/coradoc/html/converters/link.rb +46 -0
  27. data/lib/coradoc/html/converters/list_item.rb +75 -0
  28. data/lib/coradoc/html/converters/listing.rb +99 -0
  29. data/lib/coradoc/html/converters/literal.rb +102 -0
  30. data/lib/coradoc/html/converters/monospace.rb +34 -0
  31. data/lib/coradoc/html/converters/open.rb +78 -0
  32. data/lib/coradoc/html/converters/ordered.rb +53 -0
  33. data/lib/coradoc/html/converters/paragraph.rb +46 -0
  34. data/lib/coradoc/html/converters/quote.rb +113 -0
  35. data/lib/coradoc/html/converters/reviewer_comment.rb +74 -0
  36. data/lib/coradoc/html/converters/reviewer_note.rb +134 -0
  37. data/lib/coradoc/html/converters/section.rb +90 -0
  38. data/lib/coradoc/html/converters/sidebar.rb +113 -0
  39. data/lib/coradoc/html/converters/source.rb +137 -0
  40. data/lib/coradoc/html/converters/source_code.rb +16 -0
  41. data/lib/coradoc/html/converters/span.rb +61 -0
  42. data/lib/coradoc/html/converters/strikethrough.rb +34 -0
  43. data/lib/coradoc/html/converters/subscript.rb +34 -0
  44. data/lib/coradoc/html/converters/superscript.rb +34 -0
  45. data/lib/coradoc/html/converters/table.rb +85 -0
  46. data/lib/coradoc/html/converters/table_cell.rb +203 -0
  47. data/lib/coradoc/html/converters/table_row.rb +45 -0
  48. data/lib/coradoc/html/converters/template_html_converter.rb +105 -0
  49. data/lib/coradoc/html/converters/term.rb +58 -0
  50. data/lib/coradoc/html/converters/text_element.rb +44 -0
  51. data/lib/coradoc/html/converters/underline.rb +34 -0
  52. data/lib/coradoc/html/converters/unordered.rb +47 -0
  53. data/lib/coradoc/html/converters/verse.rb +105 -0
  54. data/lib/coradoc/html/converters/video.rb +179 -0
  55. data/lib/coradoc/html/element_mapping.rb +210 -0
  56. data/lib/coradoc/html/entity.rb +137 -0
  57. data/lib/coradoc/html/input/cleaner.rb +163 -0
  58. data/lib/coradoc/html/input/config.rb +79 -0
  59. data/lib/coradoc/html/input/converters/a.rb +90 -0
  60. data/lib/coradoc/html/input/converters/aside.rb +23 -0
  61. data/lib/coradoc/html/input/converters/audio.rb +50 -0
  62. data/lib/coradoc/html/input/converters/base.rb +116 -0
  63. data/lib/coradoc/html/input/converters/blockquote.rb +25 -0
  64. data/lib/coradoc/html/input/converters/br.rb +19 -0
  65. data/lib/coradoc/html/input/converters/bypass.rb +83 -0
  66. data/lib/coradoc/html/input/converters/code.rb +25 -0
  67. data/lib/coradoc/html/input/converters/div.rb +25 -0
  68. data/lib/coradoc/html/input/converters/dl.rb +106 -0
  69. data/lib/coradoc/html/input/converters/drop.rb +28 -0
  70. data/lib/coradoc/html/input/converters/em.rb +23 -0
  71. data/lib/coradoc/html/input/converters/figure.rb +58 -0
  72. data/lib/coradoc/html/input/converters/h.rb +76 -0
  73. data/lib/coradoc/html/input/converters/head.rb +30 -0
  74. data/lib/coradoc/html/input/converters/hr.rb +20 -0
  75. data/lib/coradoc/html/input/converters/ignore.rb +22 -0
  76. data/lib/coradoc/html/input/converters/img.rb +110 -0
  77. data/lib/coradoc/html/input/converters/li.rb +35 -0
  78. data/lib/coradoc/html/input/converters/mark.rb +21 -0
  79. data/lib/coradoc/html/input/converters/markup.rb +107 -0
  80. data/lib/coradoc/html/input/converters/math.rb +46 -0
  81. data/lib/coradoc/html/input/converters/ol.rb +46 -0
  82. data/lib/coradoc/html/input/converters/p.rb +81 -0
  83. data/lib/coradoc/html/input/converters/pass_through.rb +19 -0
  84. data/lib/coradoc/html/input/converters/pre.rb +59 -0
  85. data/lib/coradoc/html/input/converters/q.rb +24 -0
  86. data/lib/coradoc/html/input/converters/strong.rb +22 -0
  87. data/lib/coradoc/html/input/converters/sub.rb +40 -0
  88. data/lib/coradoc/html/input/converters/sup.rb +40 -0
  89. data/lib/coradoc/html/input/converters/table.rb +64 -0
  90. data/lib/coradoc/html/input/converters/td.rb +70 -0
  91. data/lib/coradoc/html/input/converters/text.rb +67 -0
  92. data/lib/coradoc/html/input/converters/th.rb +20 -0
  93. data/lib/coradoc/html/input/converters/tr.rb +28 -0
  94. data/lib/coradoc/html/input/converters/video.rb +53 -0
  95. data/lib/coradoc/html/input/converters.rb +122 -0
  96. data/lib/coradoc/html/input/errors.rb +22 -0
  97. data/lib/coradoc/html/input/html_converter.rb +170 -0
  98. data/lib/coradoc/html/input/plugin.rb +169 -0
  99. data/lib/coradoc/html/input/plugins/plateau.rb +229 -0
  100. data/lib/coradoc/html/input/postprocessor.rb +31 -0
  101. data/lib/coradoc/html/input.rb +68 -0
  102. data/lib/coradoc/html/output.rb +95 -0
  103. data/lib/coradoc/html/renderer.rb +409 -0
  104. data/lib/coradoc/html/spa.rb +309 -0
  105. data/lib/coradoc/html/static.rb +293 -0
  106. data/lib/coradoc/html/template_config.rb +151 -0
  107. data/lib/coradoc/html/template_helpers.rb +58 -0
  108. data/lib/coradoc/html/template_locator.rb +114 -0
  109. data/lib/coradoc/html/theme/base.rb +231 -0
  110. data/lib/coradoc/html/theme/classic_renderer.rb +390 -0
  111. data/lib/coradoc/html/theme/modern/components/ui_components.rb +344 -0
  112. data/lib/coradoc/html/theme/modern/css_generator.rb +311 -0
  113. data/lib/coradoc/html/theme/modern/javascript_generator.rb +314 -0
  114. data/lib/coradoc/html/theme/modern/serializers/document_serializer.rb +382 -0
  115. data/lib/coradoc/html/theme/modern/tailwind_config_builder.rb +164 -0
  116. data/lib/coradoc/html/theme/modern/vue_template_generator.rb +374 -0
  117. data/lib/coradoc/html/theme/modern_renderer.rb +250 -0
  118. data/lib/coradoc/html/theme/registry.rb +153 -0
  119. data/lib/coradoc/html/theme.rb +13 -0
  120. data/lib/coradoc/html/transform/from_core_model.rb +32 -0
  121. data/lib/coradoc/html/transform/to_core_model.rb +39 -0
  122. data/lib/coradoc/html/version.rb +7 -0
  123. data/lib/coradoc/html.rb +255 -0
  124. metadata +264 -0
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Input
5
+ module Html
6
+ module Converters
7
+ class Sup < Base
8
# Convert a `<sup>` node into a superscript inline element.
#
# Whitespace reported by `extract_leading_trailing_whitespace` stays outside
# the element and is returned next to it.
#
# @param node [Object] HTML node being converted
# @param state [Hash] conversion state forwarded to the child converters
# @return [nil] when the converted children are considered empty
# @return [Object, Array] the inline element on its own, or an array that
#   also holds whichever of the leading/trailing whitespace values is non-nil
def to_coradoc(node, state = {})
  before, after = extract_leading_trailing_whitespace(node)
  children = treat_children_coradoc(node, state)
  return nil if content_empty?(children)

  superscript = Coradoc::CoreModel::InlineElement.new(
    format_type: 'superscript',
    content: children
  )

  parts = [before, superscript, after].compact
  return parts.first if parts.length == 1

  parts
end
24
+
25
+ private
26
+
27
# Tell whether converted child content carries nothing worth emitting.
#
# @param content [nil, String, Array, Object] converted children
# @return [Boolean] true for nil, a whitespace-only String or an empty
#   Array; false for any other value
def content_empty?(content)
  case content
  when nil then true
  when String then content.strip.empty?
  when Array then content.empty?
  else false
  end
end
34
+ end
35
+
36
+ register :sup, Sup.new
37
+ end
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Input
5
+ module Html
6
+ module Converters
7
+ class Table < Base
8
# Convert a `<table>` node into a CoreModel table.
#
# @param node [Object] HTML table node
# @param state [Hash] conversion state forwarded to the child converters
# @return [Coradoc::CoreModel::Table] table carrying the caption text, the
#   converted children as rows, the node id and the mapped frame/grid values
def to_coradoc(node, state = {})
  table_id = node['id']
  caption = extract_title(node)
  converted_rows = treat_children_coradoc(node, state)

  Coradoc::CoreModel::Table.new(
    title: caption,
    rows: converted_rows,
    id: table_id,
    # HTML `frame` / `rules` attributes, translated by the helpers below.
    frame: frame(node),
    grid: rules(node)
  )
end
25
+
26
# Read the table title from its direct `<caption>` child.
#
# @param node [Object] HTML table node responding to `at`
# @return [String, nil] stripped caption text, or nil without a caption
def extract_title(node)
  caption = node.at('./caption')
  caption.nil? ? nil : caption.text.strip
end
32
+
33
# Translate the HTML `frame` attribute into the frame value stored on the
# table model.
#
# @param node [Object] node responding to `[]` with attribute names
# @return [String, nil] 'none', 'topbot', 'sides' or 'all'; nil when the
#   attribute is absent or holds any other value
def frame(node)
  {
    'void' => 'none',
    'hsides' => 'topbot',
    'vsides' => 'sides',
    'box' => 'all',
    'border' => 'all'
  }[node['frame']]
end
45
+
46
# Translate the HTML `rules` attribute into the grid value stored on the
# table model. The recognised values map onto themselves.
#
# @param node [Object] node responding to `[]` with attribute names
# @return [String, nil] 'all', 'rows', 'cols' or 'none'; nil when the
#   attribute is absent or holds any other value
def rules(node)
  requested = node['rules']
  ['all', 'rows', 'cols', 'none'].find { |known| known == requested }
end
58
+ end
59
+
60
+ register :table, Table.new
61
+ end
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Input
5
+ module Html
6
+ module Converters
7
+ class Td < Base
8
# Convert a `<td>` / `<th>` node into a CoreModel table cell.
#
# When the cell's only child element is a `<p>`, `state[:tdsinglepara]` is
# set to true while the children are converted. The flag is restored to its
# previous value afterwards, so it no longer leaks through the shared state
# hash into sibling cells and everything converted after them.
#
# @param node [Object] HTML cell node
# @param state [Hash] conversion state shared with the child converters
# @return [Coradoc::CoreModel::TableCell] cell with flattened text content;
#   `colspan` / `rowspan` are nil unless greater than 1, and `header` is
#   true for `th` nodes
def to_coradoc(node, state = {})
  colspan = node['colspan']&.to_i
  rowspan = node['rowspan']&.to_i
  alignment = extract_alignment(node)

  single_paragraph = node.elements.size == 1 && node.elements.first.name == 'p'

  # Remember what the caller had so the flag can be put back exactly.
  flag_was_present = state.key?(:tdsinglepara)
  previous_flag = state[:tdsinglepara]
  state[:tdsinglepara] = true if single_paragraph

  begin
    content = treat_children_coradoc(node, state)
  ensure
    if single_paragraph
      if flag_was_present
        state[:tdsinglepara] = previous_flag
      else
        state.delete(:tdsinglepara)
      end
    end
  end

  Coradoc::CoreModel::TableCell.new(
    content: extract_text_from_content(content),
    alignment: alignment,
    colspan: colspan && colspan > 1 ? colspan : nil,
    rowspan: rowspan && rowspan > 1 ? rowspan : nil,
    header: node.name == 'th'
  )
end
28
+
29
# Read the horizontal alignment from the cell's `align` attribute.
#
# Only 'left', 'center' and 'right' are recognised. The `valign` attribute
# is not represented in the result.
#
# @param node [Object] node responding to `[]` with attribute names
# @return [String, nil] the recognised alignment, or nil otherwise
def extract_alignment(node)
  align = node['align']
  align if %w[left center right].include?(align)
end
40
+
41
# Flatten converted cell content into a single string.
#
# Arrays are walked recursively, and the `content` of CoreModel elements is
# flattened the same way. Nested arrays and elements whose content is itself
# an array therefore contribute their text instead of an `Array#to_s` dump.
#
# @param content [String, Array, Object, nil] converted children of a cell
# @return [String] the same String when given one; '' for nil/false; the
#   concatenated text for arrays and CoreModel elements; `to_s` for any
#   other value
def extract_text_from_content(content)
  case content
  when nil, false
    ''
  when String
    content
  when Array
    content.map { |item| extract_text_from_content(item) }.join
  when Coradoc::CoreModel::InlineElement, Coradoc::CoreModel::Base
    extract_text_from_content(content.content)
  else
    content.to_s
  end
end
63
+ end
64
+
65
+ register :td, Td.new
66
+ register :th, Td.new
67
+ end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Input
5
+ module Html
6
+ module Converters
7
+ class Text < Base
8
# Convert an HTML text node.
#
# Whitespace-only nodes are delegated to `treat_empty`. Any other text is
# cleaned up and wrapped in an inline element whose format_type is 'text'.
#
# @param node [Object] HTML text node
# @param state [Hash] conversion state
# @return [Coradoc::CoreModel::InlineElement, String, nil]
def to_coradoc(node, state = {})
  if node.text.strip.empty?
    treat_empty(node, state)
  else
    Coradoc::CoreModel::InlineElement.new(
      format_type: 'text',
      content: cleanup_html_text(node.text)
    )
  end
end
20
+
21
+ private
22
+
23
# Decide what a whitespace-only text node turns into.
#
# @param node [Object] text node responding to `parent` and `text`
# @param state [Hash] conversion state; `:tdsinglepara` suppresses output
# @return [String, nil] ' ' when the node is exactly one space outside of
#   lists and single-paragraph cells, nil in every other case
def treat_empty(node, state)
  parent_tag = node.parent.name.to_sym

  # Whitespace directly inside a list would break the list indentation.
  return nil if parent_tag == :ol || parent_tag == :ul
  return nil if state[:tdsinglepara]

  # Only a regular single-space node is kept.
  node.text == ' ' ? ' ' : nil
end
35
+
36
# Run the HTML text clean-up steps in order: keep non-breaking spaces as
# entities, drop border newlines, collapse inner whitespace, then escape
# `<<...>>` sequences.
#
# @param text [String] raw text of an HTML text node
# @return [String] cleaned text
def cleanup_html_text(text)
  steps = %i[preserve_nbsp remove_border_newlines remove_inner_newlines escape_links]
  steps.reduce(text) { |current, step| send(step, current) }
end
43
+
44
# Replace every U+00A0 character with the literal `&nbsp;` entity.
#
# @param text [String]
# @return [String] new string with the replacements applied
def preserve_nbsp(text)
  text.gsub(/\u00A0/, '&nbsp;')
end
47
+
48
# Prefix `<<...>>` sequences with a backslash. A sequence only matches when
# the character right after `<<` is not a space.
#
# @param text [String]
# @return [String] new string with matching sequences escaped
def escape_links(text)
  text.gsub(/<<([^ ][^>]*)>>/) { "\\<<#{Regexp.last_match(1)}>>" }
end
51
+
52
# Strip newline characters from the very start and the very end of the
# text; other whitespace and inner newlines are left alone.
#
# @param text [String]
# @return [String]
def remove_border_newlines(text)
  text.gsub(/\A\n+|\n+\z/, '')
end
55
+
56
# Turn newlines and tabs into spaces, then collapse each run of spaces into
# a single space. A leading or trailing space therefore survives as one
# space, which inline contexts rely on.
#
# @param text [String]
# @return [String]
def remove_inner_newlines(text)
  text.gsub(/[\n\t]/, ' ').gsub(/ {2,}/, ' ')
end
61
+ end
62
+
63
+ register :text, Text.new
64
+ end
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Input
5
+ module Html
6
+ module Converters
7
+ class Th < Td
8
# Cell style marker for a `<th>` cell.
#
# @param node [Object] cell node whose parent responds to `previous_element`
# @return [String] '' when the parent has no previous element (the cell
#   sits in the header row), 'h' otherwise
def cellstyle(node)
  node.parent.previous_element.nil? ? '' : 'h'
end
14
+ end
15
+
16
+ register :th, Th.new
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Input
5
+ module Html
6
+ module Converters
7
+ class Tr < Base
8
# Convert a `<tr>` node into a CoreModel table row.
#
# @param node [Object] HTML row node
# @param state [Hash] conversion state forwarded to the child converters
# @return [Coradoc::CoreModel::TableRow] row whose `cells` are the converted
#   children and whose `header` flag comes from `table_header_row?`
def to_coradoc(node, state = {})
  cells = treat_children_coradoc(node, state)
  is_header = table_header_row?(node)

  # The model attribute is `cells` (not `columns`).
  Coradoc::CoreModel::TableRow.new(cells: cells, header: is_header)
end
17
+
18
# Tell whether the row counts as the header row.
#
# The check is purely positional: a row is a header when
# `node.previous_element` is nil. Whether the row's cells are `th` elements
# is not inspected.
#
# @param node [Object] row node responding to `previous_element`
# @return [Boolean]
def table_header_row?(node)
  preceding = node.previous_element
  preceding.nil?
end
22
+ end
23
+
24
+ register :tr, Tr.new
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Input
5
+ module Html
6
+ module Converters
7
+ class Video < Base
8
# Convert a `<video>` node into a CoreModel block.
#
# CoreModel has no dedicated video type, so a Block with
# `element_type: 'video'` is used and the playback attributes travel in
# `element_attributes` (nil values are dropped).
#
# The media URL comes from the `src` attribute. When that attribute is
# missing, the `src` of the first direct `<source>` child is used instead,
# so videos declared through `<source>` children do not end up with nil
# content.
#
# @param node [Object] HTML video node
# @param _state [Hash] conversion state (unused)
# @return [Coradoc::CoreModel::Block]
def to_coradoc(node, _state = {})
  src = node['src'] || node.at('./source')&.[]('src')

  playback_attributes = {
    autoplay: node['autoplay'],
    loop: node['loop'],
    controls: node['controls'],
    poster: node['poster']
  }.compact

  Coradoc::CoreModel::Block.new(
    element_type: 'video',
    block_semantic_type: :video,
    content: src,
    title: extract_title(node),
    id: node['id'],
    width: node['width'],
    height: node['height'],
    element_attributes: playback_attributes
  )
end
33
+
34
# Derive a title for the video from a child element.
#
# The direct `<track>` child is preferred; otherwise the first `<source>`
# descendant is used. The title is that element's `label`, falling back to
# its `srclang`.
#
# @param node [Object] video node responding to `at`
# @return [String] the title, or '' when no element or attribute is found
def extract_title(node)
  carrier = node.at('./track') || node.at('.//source')
  if carrier.nil?
    ''
  else
    %w[label srclang].each do |attribute|
      value = carrier[attribute]
      return value if value
    end
    ''
  end
end
40
+
41
# Collect the values of the `autoplay`, `loop` and `controls` attributes,
# in that order, skipping the ones that are nil.
#
# @param node [Object] node responding to `[]` with attribute names
# @return [Array] the non-nil attribute values
def options(node)
  %w[autoplay loop controls].map { |attribute| node[attribute] }.compact
end
47
+ end
48
+
49
+ register :video, Video.new
50
+ end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,122 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Input
5
+ module Html
6
module Converters
  # Autoload converter classes - they will register themselves when first accessed
  autoload :Base, 'coradoc/html/input/converters/base'
  autoload :Markup, 'coradoc/html/input/converters/markup'
  autoload :A, 'coradoc/html/input/converters/a'
  autoload :Aside, 'coradoc/html/input/converters/aside'
  autoload :Audio, 'coradoc/html/input/converters/audio'
  autoload :Blockquote, 'coradoc/html/input/converters/blockquote'
  autoload :Br, 'coradoc/html/input/converters/br'
  autoload :Bypass, 'coradoc/html/input/converters/bypass'
  autoload :Code, 'coradoc/html/input/converters/code'
  autoload :Div, 'coradoc/html/input/converters/div'
  autoload :Dl, 'coradoc/html/input/converters/dl'
  autoload :Drop, 'coradoc/html/input/converters/drop'
  autoload :Em, 'coradoc/html/input/converters/em'
  autoload :Figure, 'coradoc/html/input/converters/figure'
  autoload :H, 'coradoc/html/input/converters/h'
  autoload :Head, 'coradoc/html/input/converters/head'
  autoload :Hr, 'coradoc/html/input/converters/hr'
  autoload :Ignore, 'coradoc/html/input/converters/ignore'
  autoload :Img, 'coradoc/html/input/converters/img'
  autoload :Li, 'coradoc/html/input/converters/li'
  autoload :Mark, 'coradoc/html/input/converters/mark'
  autoload :Ol, 'coradoc/html/input/converters/ol'
  autoload :P, 'coradoc/html/input/converters/p'
  autoload :PassThrough, 'coradoc/html/input/converters/pass_through'
  autoload :Pre, 'coradoc/html/input/converters/pre'
  autoload :Q, 'coradoc/html/input/converters/q'
  autoload :Strong, 'coradoc/html/input/converters/strong'
  autoload :Sup, 'coradoc/html/input/converters/sup'
  autoload :Sub, 'coradoc/html/input/converters/sub'
  autoload :Table, 'coradoc/html/input/converters/table'
  autoload :Td, 'coradoc/html/input/converters/td'
  autoload :Text, 'coradoc/html/input/converters/text'
  autoload :Th, 'coradoc/html/input/converters/th'
  autoload :Tr, 'coradoc/html/input/converters/tr'
  autoload :Video, 'coradoc/html/input/converters/video'
  autoload :Math, 'coradoc/html/input/converters/math'

  # Register the converter responsible for a tag, replacing any converter
  # registered for that tag before.
  #
  # @param tag_name [String, Symbol] HTML tag name
  # @param converter [Object, Class] converter instance, or a class that
  #   `lookup` instantiates on demand
  # @return [Object, Class] the registered converter
  def self.register(tag_name, converter)
    @@converters ||= {}
    @@converters[tag_name.to_sym] = converter
  end

  # Remove the converter registered for a tag.
  #
  # Safe to call before anything has been registered.
  #
  # @param tag_name [String, Symbol] HTML tag name
  # @return [Object, Class, nil] the removed converter, or nil when the tag
  #   had none
  def self.unregister(tag_name)
    @@converters ||= {}
    @@converters.delete(tag_name.to_sym)
  end

  # Ensure the converters are loaded and registered before first use.
  #
  # Resolving each autoloaded constant loads its file, which performs the
  # registration. Audio, Img, Video and Math are declared above but are not
  # preloaded here: some converters may have gem dependencies (e.g. Img
  # requires marcel), so only the essential ones are loaded.
  #
  # @return [void]
  def self.ensure_converters_loaded
    return if @converters_loaded

    # Set before loading, exactly once per process.
    @converters_loaded = true

    %i[
      Base Markup A Aside Blockquote Br Bypass Code Div Dl
      Drop Em Figure H Head Hr Ignore Li Mark Ol P
      PassThrough Pre Q Strong Sup Sub Table Td Text Th Tr
    ].each { |name| const_get(name) }
  end

  # Find the converter for a tag, falling back to the configured handling of
  # unknown tags.
  #
  # @param tag_name [String, Symbol] HTML tag name
  # @return [Object] converter instance (registered classes are instantiated)
  # @raise [Errors::UnknownTagError, Errors::InvalidConfigurationError]
  #   see `default_converter`
  def self.lookup(tag_name)
    ensure_converters_loaded
    @@converters ||= {}
    converter = @@converters[tag_name.to_sym] || default_converter(tag_name)
    converter.is_a?(Class) ? converter.new : converter
  end

  # Convert a node, or every node of a collection, through `convert` and
  # return the result; collections are joined into one string.
  #
  # NOTE: process won't run plugin hooks
  #
  # @param node [Nokogiri::XML::NodeSet, Array, Object] node(s) to convert
  # @param state [Hash] conversion state
  # @return [Object] result of `convert`, or the joined string for collections
  def self.process(node, state)
    node = node.to_a if node.is_a? Nokogiri::XML::NodeSet
    return node.map { |i| process(i, state) }.join if node.is_a? Array

    lookup(node.name).convert(node, state)
  end

  # Convert a node, or every node of a collection, through `to_coradoc`,
  # running the `html_tree_run_hooks` of each plugin in
  # `state[:plugin_instances]` around the conversion.
  #
  # Hooks are wrapped in iteration order, so the last plugin's hook is the
  # outermost one and receives the previously built chain as its block.
  #
  # @param node [Nokogiri::XML::NodeSet, Array, Object] node(s) to convert
  # @param state [Hash] conversion state
  # @return [Object, Array] conversion result; an array for collections
  def self.process_coradoc(node, state)
    node = node.to_a if node.is_a? Nokogiri::XML::NodeSet
    return node.map { |i| process_coradoc(i, state) } if node.is_a? Array

    plugins = state[:plugin_instances] || []
    process = proc { lookup(node.name).to_coradoc(node, state) }
    plugins.each do |plugin|
      prev_process = process
      process = proc { plugin.html_tree_run_hooks(node, state, &prev_process) }
    end
    process.call(node, state)
  end

  # Build the converter used for tags without a registered converter,
  # according to `Html.config.unknown_tags`.
  #
  # @param tag_name [String, Symbol] the unknown tag
  # @return [PassThrough, Drop, Bypass]
  # @raise [Errors::UnknownTagError] when the setting is :raise
  # @raise [Errors::InvalidConfigurationError] for any other setting
  def self.default_converter(tag_name)
    case Html.config.unknown_tags.to_sym
    when :pass_through
      PassThrough.new
    when :drop
      Drop.new
    when :bypass
      Bypass.new
    when :raise
      raise Errors::UnknownTagError, "unknown tag: #{tag_name}"
    else
      raise Errors::InvalidConfigurationError,
            "unknown value #{Html.config.unknown_tags.inspect} for Coradoc::Input::Html.config.unknown_tags"
    end
  end
end
120
+ end
121
+ end
122
+ end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Input
5
+ module Html
6
module Errors
  # Root of the HTML input error hierarchy. It descends from Coradoc::Error
  # so HTML input failures can be rescued together with other Coradoc errors.
  class Error < Coradoc::Error; end

  # Signals that an unknown HTML tag was encountered.
  class UnknownTagError < Error; end

  # Signals that the HTML input configuration is invalid.
  class InvalidConfigurationError < Error; end
end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,170 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Input
5
+ module Html
6
+ # HTML to CoreModel converter
7
+ #
8
+ # This class handles the conversion of HTML documents to CoreModel.
9
+ # It does NOT handle serialization to any specific output format.
10
+ # For serialization, use Coradoc.serialize(coremodel, to: :format)
11
+ #
12
+ # @example Basic usage - get CoreModel
13
+ # coremodel = HtmlConverter.to_core_model(html_string)
14
+ #
15
+ # @example Serialize to AsciiDoc
16
+ # coremodel = HtmlConverter.to_core_model(html_string)
17
+ # adoc_text = Coradoc.serialize(coremodel, to: :asciidoc)
18
+ #
19
class HtmlConverter
  # Convert HTML to CoreModel.
  #
  # Runs inside `Input::Html.config.with(options)`. Plugins get a chance to
  # preprocess the HTML tree and to postprocess the CoreModel tree when they
  # respond to `preprocess_html_tree` / `postprocess_coremodel_tree`. Unless
  # `options` is frozen, the plugin instances that were used are stored back
  # into `options[:plugin_instances]`.
  #
  # @param input [String, Nokogiri::XML::Document, Nokogiri::XML::Node] HTML input
  # @param options [Hash] Conversion options
  # @return [Coradoc::CoreModel::Base, nil] CoreModel document; nil when the
  #   input is of another type or yields no root node
  def self.to_core_model(input, options = {})
    Input::Html.config.with(options) do
      plugin_instances = prepare_plugin_instances(options)

      root = track_time 'Loading input HTML document' do
        case input
        when String
          Nokogiri::HTML(input).root
        when Nokogiri::XML::Document
          input.root
        when Nokogiri::XML::Node
          input
        end
      end

      return nil unless root

      plugin_instances.each do |plugin|
        plugin.html_tree = root
        if plugin.respond_to?(:preprocess_html_tree)
          track_time "Preprocessing document with #{plugin.name} plugin" do
            plugin.preprocess_html_tree
          end
        end
        # A plugin may replace the tree, so read it back.
        root = plugin.html_tree
      end

      coremodel = track_time 'Converting input document tree to CoreModel' do
        Converters.process_coradoc(
          root,
          plugin_instances: plugin_instances
        )
      end

      coremodel = track_time 'Post-process CoreModel tree' do
        Postprocessor.process(coremodel)
      end

      plugin_instances.each do |plugin|
        next unless plugin.respond_to?(:postprocess_coremodel_tree)

        plugin.coremodel_tree = coremodel
        track_time "Postprocessing CoreModel tree with #{plugin.name} plugin" do
          plugin.postprocess_coremodel_tree
        end
        coremodel = plugin.coremodel_tree
      end

      # Lets later stages (see cleanup_result) reuse the same instances.
      options[:plugin_instances] = plugin_instances unless options.frozen?

      coremodel
    end
  end

  # Legacy method - returns CoreModel
  #
  # @deprecated Use {.to_core_model} instead
  # @param input [String, Nokogiri::XML::Document, Nokogiri::XML::Node] HTML input
  # @param options [Hash] Conversion options
  # @return [Coradoc::CoreModel::Base, nil]
  def self.to_coradoc(input, options = {})
    to_core_model(input, options)
  end

  # Legacy method for backward compatibility.
  # Converts HTML to CoreModel, then serializes to the target format.
  #
  # A non-frozen `options` hash is modified in place as before
  # (`:output_format` is removed, `:plugin_instances` is added); a frozen
  # hash is copied first instead of raising FrozenError.
  #
  # @deprecated Use {.to_core_model} + Coradoc.serialize instead
  # @param input [String] HTML input
  # @param options [Hash] Conversion options
  # @option options [Symbol] :output_format Target format (default: :asciidoc)
  # @return [String, Hash] Serialized document in target format; when the
  #   CoreModel result is a Hash, a Hash with each value serialized
  def self.convert(input, options = {})
    options = options.dup if options.frozen?
    output_format = options.delete(:output_format) || :asciidoc

    coremodel = to_core_model(input, options)

    if coremodel.is_a?(Hash)
      coremodel.to_h do |file, tree|
        track_time "Serializing file #{file || 'main'}" do
          [file, serialize_core_model(tree, output_format, options)]
        end
      end
    else
      serialize_core_model(coremodel, output_format, options)
    end
  end

  # Serialize CoreModel to target format using the appropriate gem
  #
  # @param coremodel [Coradoc::CoreModel::Base] CoreModel document
  # @param format [Symbol] Target format
  # @param options [Hash] Serialization options
  # @return [String] Serialized document
  def self.serialize_core_model(coremodel, format, options = {})
    result = Coradoc.serialize(coremodel, to: format)
    cleanup_result(result, options)
  end

  # Clean up the serialized result: tidy it with `Input::Html.cleaner`, then
  # let every plugin responding to `postprocess_output_string` rewrite it.
  #
  # @param result [String] Serialized result
  # @param options [Hash] Cleanup options
  # @return [String] Cleaned result
  def self.cleanup_result(result, options = {})
    Input::Html.config.with(options) do
      plugin_instances = prepare_plugin_instances(options)

      result = track_time 'Cleaning up the result' do
        Input::Html.cleaner.tidy(result)
      end

      plugin_instances.each do |plugin|
        next unless plugin.respond_to?(:postprocess_output_string)

        plugin.output_string = result
        track_time "Postprocessing output string with #{plugin.name} plugin" do
          plugin.postprocess_output_string
        end
        result = plugin.output_string
      end

      result
    end
  end

  # Plugin instances to use for a conversion: the ones supplied in
  # `options[:plugin_instances]`, otherwise a fresh instance of every
  # configured plugin class.
  #
  # @param options [Hash] Conversion options
  # @return [Array] plugin instances
  def self.prepare_plugin_instances(options)
    options[:plugin_instances] || Html.config.plugins.map(&:new)
  end

  @track_time_indentation = 0

  # Run the block, reporting its duration through `warn` when
  # `Input::Html.config.track_time` is enabled. Nested calls are indented by
  # one space per level.
  #
  # The indentation level is restored even when the block raises; the
  # "took ..." line is only printed when the block completes.
  #
  # @param task [String] description used in the progress messages
  # @yield the work to measure
  # @return [Object] the block's return value
  def self.track_time(task)
    return yield unless Input::Html.config.track_time

    warn "#{' ' * @track_time_indentation}* #{task} is starting..."
    @track_time_indentation += 1
    # Monotonic clock: unaffected by system clock adjustments.
    started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

    result = begin
      yield
    ensure
      @track_time_indentation -= 1
    end

    time_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
    warn "#{' ' * @track_time_indentation}* #{task} took #{time_elapsed.round(3)} seconds"
    result
  end
end
168
+ end
169
+ end
170
+ end