coradoc-html 1.1.18 → 1.1.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/coradoc/html/cleaner.rb +128 -0
- data/lib/coradoc/html/converters/a.rb +77 -0
- data/lib/coradoc/html/converters/aside.rb +20 -0
- data/lib/coradoc/html/converters/audio.rb +19 -0
- data/lib/coradoc/html/converters/base.rb +98 -0
- data/lib/coradoc/html/converters/blockquote.rb +25 -0
- data/lib/coradoc/html/converters/br.rb +17 -0
- data/lib/coradoc/html/converters/bypass.rb +82 -0
- data/lib/coradoc/html/converters/code.rb +25 -0
- data/lib/coradoc/html/converters/div.rb +23 -0
- data/lib/coradoc/html/converters/dl.rb +82 -0
- data/lib/coradoc/html/converters/drop.rb +26 -0
- data/lib/coradoc/html/converters/em.rb +23 -0
- data/lib/coradoc/html/converters/figure.rb +33 -0
- data/lib/coradoc/html/converters/h.rb +58 -0
- data/lib/coradoc/html/converters/head.rb +29 -0
- data/lib/coradoc/html/converters/hr.rb +17 -0
- data/lib/coradoc/html/converters/img.rb +103 -0
- data/lib/coradoc/html/converters/li.rb +35 -0
- data/lib/coradoc/html/converters/mark.rb +21 -0
- data/lib/coradoc/html/converters/markup.rb +93 -0
- data/lib/coradoc/html/converters/math.rb +37 -0
- data/lib/coradoc/html/converters/media_base.rb +48 -0
- data/lib/coradoc/html/converters/ol.rb +42 -0
- data/lib/coradoc/html/converters/p.rb +64 -0
- data/lib/coradoc/html/converters/pass_through.rb +15 -0
- data/lib/coradoc/html/converters/positional_formatting.rb +35 -0
- data/lib/coradoc/html/converters/pre.rb +57 -0
- data/lib/coradoc/html/converters/q.rb +25 -0
- data/lib/coradoc/html/converters/strong.rb +22 -0
- data/lib/coradoc/html/converters/sub.rb +20 -0
- data/lib/coradoc/html/converters/sup.rb +20 -0
- data/lib/coradoc/html/converters/table.rb +64 -0
- data/lib/coradoc/html/converters/td.rb +42 -0
- data/lib/coradoc/html/converters/text.rb +66 -0
- data/lib/coradoc/html/converters/tr.rb +27 -0
- data/lib/coradoc/html/converters/video.rb +27 -0
- data/lib/coradoc/html/converters.rb +104 -0
- data/lib/coradoc/html/drop/drop_factory.rb +14 -22
- data/lib/coradoc/html/drop/inline_element_drop.rb +3 -5
- data/lib/coradoc/html/drop/raw_inline_element_drop.rb +30 -0
- data/lib/coradoc/html/drop.rb +30 -8
- data/lib/coradoc/html/errors.rb +11 -0
- data/lib/coradoc/html/html_converter.rb +78 -0
- data/lib/coradoc/html/input_config.rb +66 -0
- data/lib/coradoc/html/plugin.rb +90 -0
- data/lib/coradoc/html/plugins/plateau.rb +212 -0
- data/lib/coradoc/html/postprocessor.rb +19 -0
- data/lib/coradoc/html/spa.rb +0 -2
- data/lib/coradoc/html/static.rb +0 -2
- data/lib/coradoc/html/tag_mapping.rb +3 -1
- data/lib/coradoc/html/transform/from_core_model.rb +2 -2
- data/lib/coradoc/html/transform/to_core_model.rb +3 -3
- data/lib/coradoc/html/version.rb +1 -1
- data/lib/coradoc/html.rb +30 -5
- metadata +46 -47
- data/lib/coradoc/html/input/cleaner.rb +0 -134
- data/lib/coradoc/html/input/config.rb +0 -80
- data/lib/coradoc/html/input/converters/a.rb +0 -79
- data/lib/coradoc/html/input/converters/aside.rb +0 -22
- data/lib/coradoc/html/input/converters/audio.rb +0 -21
- data/lib/coradoc/html/input/converters/base.rb +0 -118
- data/lib/coradoc/html/input/converters/blockquote.rb +0 -27
- data/lib/coradoc/html/input/converters/br.rb +0 -19
- data/lib/coradoc/html/input/converters/bypass.rb +0 -84
- data/lib/coradoc/html/input/converters/code.rb +0 -27
- data/lib/coradoc/html/input/converters/div.rb +0 -25
- data/lib/coradoc/html/input/converters/dl.rb +0 -84
- data/lib/coradoc/html/input/converters/drop.rb +0 -28
- data/lib/coradoc/html/input/converters/em.rb +0 -25
- data/lib/coradoc/html/input/converters/figure.rb +0 -35
- data/lib/coradoc/html/input/converters/h.rb +0 -74
- data/lib/coradoc/html/input/converters/head.rb +0 -31
- data/lib/coradoc/html/input/converters/hr.rb +0 -19
- data/lib/coradoc/html/input/converters/img.rb +0 -105
- data/lib/coradoc/html/input/converters/li.rb +0 -37
- data/lib/coradoc/html/input/converters/mark.rb +0 -23
- data/lib/coradoc/html/input/converters/markup.rb +0 -103
- data/lib/coradoc/html/input/converters/math.rb +0 -39
- data/lib/coradoc/html/input/converters/media_base.rb +0 -50
- data/lib/coradoc/html/input/converters/ol.rb +0 -44
- data/lib/coradoc/html/input/converters/p.rb +0 -90
- data/lib/coradoc/html/input/converters/pass_through.rb +0 -17
- data/lib/coradoc/html/input/converters/positional_formatting.rb +0 -37
- data/lib/coradoc/html/input/converters/pre.rb +0 -59
- data/lib/coradoc/html/input/converters/q.rb +0 -27
- data/lib/coradoc/html/input/converters/strong.rb +0 -24
- data/lib/coradoc/html/input/converters/sub.rb +0 -22
- data/lib/coradoc/html/input/converters/sup.rb +0 -22
- data/lib/coradoc/html/input/converters/table.rb +0 -66
- data/lib/coradoc/html/input/converters/td.rb +0 -44
- data/lib/coradoc/html/input/converters/text.rb +0 -68
- data/lib/coradoc/html/input/converters/tr.rb +0 -29
- data/lib/coradoc/html/input/converters/video.rb +0 -29
- data/lib/coradoc/html/input/converters.rb +0 -107
- data/lib/coradoc/html/input/errors.rb +0 -22
- data/lib/coradoc/html/input/html_converter.rb +0 -98
- data/lib/coradoc/html/input/plugin.rb +0 -120
- data/lib/coradoc/html/input/plugins/plateau.rb +0 -214
- data/lib/coradoc/html/input/postprocessor.rb +0 -25
- data/lib/coradoc/html/input.rb +0 -86
- data/lib/coradoc/html/output.rb +0 -89
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Coradoc
|
|
4
|
-
module Input
|
|
5
|
-
module Html
|
|
6
|
-
module Errors
|
|
7
|
-
# Base error class for HTML input errors
|
|
8
|
-
# Inherits from Coradoc::Error for unified error handling
|
|
9
|
-
class Error < Coradoc::Error
|
|
10
|
-
end
|
|
11
|
-
|
|
12
|
-
# Raised when an unknown HTML tag is encountered
|
|
13
|
-
class UnknownTagError < Error
|
|
14
|
-
end
|
|
15
|
-
|
|
16
|
-
# Raised when HTML input configuration is invalid
|
|
17
|
-
class InvalidConfigurationError < Error
|
|
18
|
-
end
|
|
19
|
-
end
|
|
20
|
-
end
|
|
21
|
-
end
|
|
22
|
-
end
|
|
@@ -1,98 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Coradoc
|
|
4
|
-
module Input
|
|
5
|
-
module Html
|
|
6
|
-
# HTML to CoreModel converter
|
|
7
|
-
#
|
|
8
|
-
# This class handles the conversion of HTML documents to CoreModel.
|
|
9
|
-
# It does NOT handle serialization to any specific output format.
|
|
10
|
-
# For serialization, use Coradoc.serialize(coremodel, to: :format)
|
|
11
|
-
#
|
|
12
|
-
# @example Basic usage - get CoreModel
|
|
13
|
-
# coremodel = HtmlConverter.to_core_model(html_string)
|
|
14
|
-
#
|
|
15
|
-
# @example Serialize to AsciiDoc
|
|
16
|
-
# coremodel = HtmlConverter.to_core_model(html_string)
|
|
17
|
-
# adoc_text = Coradoc.serialize(coremodel, to: :asciidoc)
|
|
18
|
-
#
|
|
19
|
-
class HtmlConverter
|
|
20
|
-
# Convert HTML to CoreModel
|
|
21
|
-
#
|
|
22
|
-
# @param input [String, Nokogiri::XML::Document, Nokogiri::XML::Node] HTML input
|
|
23
|
-
# @param options [Hash] Conversion options
|
|
24
|
-
# @return [Coradoc::CoreModel::Base] CoreModel document
|
|
25
|
-
def self.to_core_model(input, options = {})
|
|
26
|
-
Input::Html.config.with(options) do
|
|
27
|
-
plugin_instances = prepare_plugin_instances(options)
|
|
28
|
-
|
|
29
|
-
root = track_time 'Loading input HTML document' do
|
|
30
|
-
case input
|
|
31
|
-
when String
|
|
32
|
-
Nokogiri::HTML(input).root
|
|
33
|
-
when Nokogiri::XML::Document
|
|
34
|
-
input.root
|
|
35
|
-
when Nokogiri::XML::Node
|
|
36
|
-
input
|
|
37
|
-
end
|
|
38
|
-
end
|
|
39
|
-
|
|
40
|
-
return nil unless root
|
|
41
|
-
|
|
42
|
-
plugin_instances.each do |plugin|
|
|
43
|
-
plugin.html_tree = root
|
|
44
|
-
track_time "Preprocessing document with #{plugin.name} plugin" do
|
|
45
|
-
plugin.preprocess_html_tree
|
|
46
|
-
end
|
|
47
|
-
root = plugin.html_tree
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
coremodel = track_time 'Converting input document tree to CoreModel' do
|
|
51
|
-
Converters.process_coradoc(
|
|
52
|
-
root,
|
|
53
|
-
plugin_instances: plugin_instances
|
|
54
|
-
)
|
|
55
|
-
end
|
|
56
|
-
|
|
57
|
-
coremodel = track_time 'Post-process CoreModel tree' do
|
|
58
|
-
Postprocessor.process(coremodel)
|
|
59
|
-
end
|
|
60
|
-
|
|
61
|
-
plugin_instances.each do |plugin|
|
|
62
|
-
plugin.coremodel_tree = coremodel
|
|
63
|
-
track_time "Postprocessing CoreModel tree with #{plugin.name} plugin" do
|
|
64
|
-
plugin.postprocess_coremodel_tree
|
|
65
|
-
end
|
|
66
|
-
coremodel = plugin.coremodel_tree
|
|
67
|
-
end
|
|
68
|
-
|
|
69
|
-
options[:plugin_instances] = plugin_instances unless options.frozen?
|
|
70
|
-
|
|
71
|
-
coremodel
|
|
72
|
-
end
|
|
73
|
-
end
|
|
74
|
-
|
|
75
|
-
def self.prepare_plugin_instances(options)
|
|
76
|
-
options[:plugin_instances] || Html.config.plugins.map(&:new)
|
|
77
|
-
end
|
|
78
|
-
|
|
79
|
-
@track_time_indentation = 0
|
|
80
|
-
def self.track_time(task)
|
|
81
|
-
if Input::Html.config.track_time
|
|
82
|
-
warn (' ' * @track_time_indentation) + "* #{task} is starting..."
|
|
83
|
-
@track_time_indentation += 1
|
|
84
|
-
t0 = Time.now
|
|
85
|
-
ret = yield
|
|
86
|
-
time_elapsed = Time.now - t0
|
|
87
|
-
@track_time_indentation -= 1
|
|
88
|
-
warn (' ' * @track_time_indentation) +
|
|
89
|
-
"* #{task} took #{time_elapsed.round(3)} seconds"
|
|
90
|
-
ret
|
|
91
|
-
else
|
|
92
|
-
yield
|
|
93
|
-
end
|
|
94
|
-
end
|
|
95
|
-
end
|
|
96
|
-
end
|
|
97
|
-
end
|
|
98
|
-
end
|
|
@@ -1,120 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Coradoc
|
|
4
|
-
module Input
|
|
5
|
-
module Html
|
|
6
|
-
class Plugin
|
|
7
|
-
#### Plugin system general
|
|
8
|
-
|
|
9
|
-
# Allow building plugins with a shorthand syntax:
|
|
10
|
-
# plugin = Coradoc::Html::Input::Plugin.new do
|
|
11
|
-
# def name = "Test"
|
|
12
|
-
# end
|
|
13
|
-
|
|
14
|
-
def self.new(&)
|
|
15
|
-
if self == Plugin
|
|
16
|
-
Class.new(Plugin, &)
|
|
17
|
-
else
|
|
18
|
-
super
|
|
19
|
-
end
|
|
20
|
-
end
|
|
21
|
-
|
|
22
|
-
def initialize
|
|
23
|
-
@html_tree_hooks_pre = {}
|
|
24
|
-
@html_tree_hooks_post = {}
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
def name
|
|
28
|
-
self.class.name
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
# Default no-op hooks. Plugins override these as needed.
|
|
32
|
-
def preprocess_html_tree; end
|
|
33
|
-
def postprocess_coremodel_tree; end
|
|
34
|
-
def postprocess_output_string; end
|
|
35
|
-
|
|
36
|
-
#### HTML Tree functionalities
|
|
37
|
-
|
|
38
|
-
attr_accessor :html_tree, :coremodel_tree, :output_string
|
|
39
|
-
|
|
40
|
-
def html_tree_change_tag_name_by_css(css, new_name)
|
|
41
|
-
html_tree.css(css).each do |e|
|
|
42
|
-
e.name = new_name
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
def html_tree_change_properties_by_css(css, properties)
|
|
47
|
-
html_tree.css(css).each do |e|
|
|
48
|
-
properties.each do |k, v|
|
|
49
|
-
e[k.to_s] = v
|
|
50
|
-
end
|
|
51
|
-
end
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
def html_tree_remove_by_css(css)
|
|
55
|
-
html_tree.css(css).each(&:remove)
|
|
56
|
-
end
|
|
57
|
-
|
|
58
|
-
def html_tree_replace_with_children_by_css(css)
|
|
59
|
-
html_tree.css(css).each do |e|
|
|
60
|
-
e.replace(e.children)
|
|
61
|
-
end
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
def html_tree_process_to_coremodel(tree, state = {})
|
|
65
|
-
Coradoc::Html::Input::Converters.process_coradoc(tree, state)
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
# define preprocess_html_tree to process HTML trees
|
|
69
|
-
|
|
70
|
-
# Creates a hook to be called instead of converting an element
|
|
71
|
-
# to a CoreModel node.
|
|
72
|
-
#
|
|
73
|
-
# proc |html_node, state|
|
|
74
|
-
# coremodel_node
|
|
75
|
-
# end
|
|
76
|
-
def html_tree_add_hook_pre(element, &block)
|
|
77
|
-
@html_tree_hooks_pre[element] = block
|
|
78
|
-
end
|
|
79
|
-
|
|
80
|
-
def html_tree_add_hook_pre_by_css(css, &block)
|
|
81
|
-
html_tree.css(css).each do |e|
|
|
82
|
-
html_tree_add_hook_pre(e, &block)
|
|
83
|
-
end
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
# Creates a hook to be called after converting an element
|
|
87
|
-
# to a CoreModel node.
|
|
88
|
-
#
|
|
89
|
-
# proc |html_node, coremodel_node, state|
|
|
90
|
-
# coremodel_node
|
|
91
|
-
# end
|
|
92
|
-
def html_tree_add_hook_post(element, &block)
|
|
93
|
-
@html_tree_hooks_post[element] = block
|
|
94
|
-
end
|
|
95
|
-
|
|
96
|
-
def html_tree_add_hook_post_by_css(css, &block)
|
|
97
|
-
html_tree.css(css).each do |e|
|
|
98
|
-
html_tree_add_hook_post(e, &block)
|
|
99
|
-
end
|
|
100
|
-
end
|
|
101
|
-
|
|
102
|
-
def html_tree_run_hooks(node, state, &)
|
|
103
|
-
hook_pre = @html_tree_hooks_pre[node]
|
|
104
|
-
hook_post = @html_tree_hooks_post[node]
|
|
105
|
-
|
|
106
|
-
coremodel = hook_pre.call(node, state) if hook_pre
|
|
107
|
-
coremodel ||= yield node, state
|
|
108
|
-
|
|
109
|
-
coremodel = hook_post.call(node, coremodel, state) if hook_post
|
|
110
|
-
|
|
111
|
-
coremodel
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
#### CoreModel tree functionalities
|
|
115
|
-
|
|
116
|
-
#### Output string functionalities
|
|
117
|
-
end
|
|
118
|
-
end
|
|
119
|
-
end
|
|
120
|
-
end
|
|
@@ -1,214 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Coradoc
|
|
4
|
-
module Input
|
|
5
|
-
module Html
|
|
6
|
-
class Plugin
|
|
7
|
-
# This plugin enhances documents from the PLATEAU project
|
|
8
|
-
# to extract more data.
|
|
9
|
-
#
|
|
10
|
-
# Usage:
|
|
11
|
-
# coradoc convert -rcoradoc/input/html/plugins/plateau
|
|
12
|
-
# --external-images -u raise --output _out/index.adoc index.html
|
|
13
|
-
class Plateau < Plugin
|
|
14
|
-
def name
|
|
15
|
-
'PLATEAU'
|
|
16
|
-
end
|
|
17
|
-
|
|
18
|
-
def preprocess_html_tree
|
|
19
|
-
# Remove side menu, so we can generate TOC ourselves
|
|
20
|
-
html_tree_remove_by_css('.sideMenu')
|
|
21
|
-
|
|
22
|
-
# Correct non-semantic classes into semantic HTML tags
|
|
23
|
-
html_tree_change_tag_name_by_css('.titledata', 'h1')
|
|
24
|
-
html_tree_change_tag_name_by_css('.subtitledata', 'h2')
|
|
25
|
-
html_tree_change_tag_name_by_css('.pitemdata', 'h3')
|
|
26
|
-
html_tree_change_tag_name_by_css('.sitemdata', 'h4')
|
|
27
|
-
html_tree_change_tag_name_by_css('td[bgcolor="#D0CECE"]', 'th')
|
|
28
|
-
html_tree_change_tag_name_by_css('td[bgcolor="#d0cece"]', 'th')
|
|
29
|
-
html_tree_change_tag_name_by_css(
|
|
30
|
-
'.framedata, .frame_container_box',
|
|
31
|
-
'aside'
|
|
32
|
-
)
|
|
33
|
-
html_tree_change_tag_name_by_css('.frame2data', 'pre')
|
|
34
|
-
# Assumption that all code snippets in those documents are XML...
|
|
35
|
-
html_tree_change_properties_by_css(
|
|
36
|
-
'.frame2data',
|
|
37
|
-
class: 'brush:xml;'
|
|
38
|
-
)
|
|
39
|
-
|
|
40
|
-
# Remove some CSS ids that are not important to us
|
|
41
|
-
html_tree_change_properties_by_css('#__nuxt', id: nil)
|
|
42
|
-
html_tree_change_properties_by_css('#__layout', id: nil)
|
|
43
|
-
html_tree_change_properties_by_css('#app', id: nil)
|
|
44
|
-
|
|
45
|
-
# Handle lists of document 02
|
|
46
|
-
html_tree_replace_with_children_by_css('.list_num-wrap')
|
|
47
|
-
|
|
48
|
-
# Convert table/img caption to become a caption
|
|
49
|
-
html_tree.css('.imagedata').each do |e|
|
|
50
|
-
table = e.parent.next&.children&.first
|
|
51
|
-
if table&.name == 'table'
|
|
52
|
-
e.name = 'caption'
|
|
53
|
-
table.prepend_child(e)
|
|
54
|
-
next
|
|
55
|
-
end
|
|
56
|
-
|
|
57
|
-
img = e.parent.previous&.children&.first
|
|
58
|
-
next unless img&.name == 'img' && img['src']
|
|
59
|
-
|
|
60
|
-
title = e.text.strip
|
|
61
|
-
img['title'] = title
|
|
62
|
-
e.remove
|
|
63
|
-
next
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
# Add hooks for H1, H2, H3, H4
|
|
67
|
-
html_tree_add_hook_post_by_css(
|
|
68
|
-
'h1, h2, h3',
|
|
69
|
-
&method(:handle_headers)
|
|
70
|
-
)
|
|
71
|
-
html_tree_add_hook_post_by_css('h4', &method(:handle_headers_h4))
|
|
72
|
-
|
|
73
|
-
# Table cells aligned to center
|
|
74
|
-
html_tree_change_properties_by_css(
|
|
75
|
-
'.tableTopCenter',
|
|
76
|
-
align: 'center'
|
|
77
|
-
)
|
|
78
|
-
|
|
79
|
-
# Handle non-semantic lists and indentation
|
|
80
|
-
html_tree_add_hook_pre_by_css '.text2data' do |node,|
|
|
81
|
-
text = html_tree_process_to_coremodel(node).strip
|
|
82
|
-
next '' if text.empty? || text == "\u3000"
|
|
83
|
-
|
|
84
|
-
if text.start_with?(/\d+\./)
|
|
85
|
-
text = text.sub(/\A\d+.\s*/, '')
|
|
86
|
-
".. #{text}\n"
|
|
87
|
-
else
|
|
88
|
-
text = text.gsub(/^/, '** ')
|
|
89
|
-
"\n\n//-PT2D\n#{text}\n//-ENDPT2D\n\n"
|
|
90
|
-
end
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
(3..4).each do |i|
|
|
94
|
-
html_tree_add_hook_pre_by_css ".text#{i}data" do |node,|
|
|
95
|
-
text = html_tree_process_to_coremodel(node).strip
|
|
96
|
-
next '' if text.empty? || text == "\u3000"
|
|
97
|
-
|
|
98
|
-
text = text.strip.gsub(/^/, "#{'*' * i} ")
|
|
99
|
-
"\n\n//-PT#{i}D\n#{text}\n//-ENDPT#{i}D\n\n"
|
|
100
|
-
end
|
|
101
|
-
end
|
|
102
|
-
|
|
103
|
-
(2..3).each do |i|
|
|
104
|
-
html_tree_add_hook_pre_by_css ".text#{i}data_point ul" do |node,|
|
|
105
|
-
text = html_tree_process_to_coremodel(node.children.first.children).strip
|
|
106
|
-
|
|
107
|
-
"#{'*' * i} #{text}\n"
|
|
108
|
-
end
|
|
109
|
-
end
|
|
110
|
-
|
|
111
|
-
(1..20).each do |i|
|
|
112
|
-
html_tree_add_hook_pre_by_css ".numtextdata_num .list_num#{i}" do |node,|
|
|
113
|
-
text = html_tree_process_to_coremodel(node).strip
|
|
114
|
-
|
|
115
|
-
"[start=#{i}]\n. #{text}\n"
|
|
116
|
-
end
|
|
117
|
-
end
|
|
118
|
-
end
|
|
119
|
-
|
|
120
|
-
IM = /[A-Z0-9]{1,3}/
|
|
121
|
-
|
|
122
|
-
def handle_headers(node, coradoc, _state)
|
|
123
|
-
content = coradoc.content.map(&:content).join
|
|
124
|
-
|
|
125
|
-
if %w[toc0 toc_0].any? { |i| coradoc.id&.start_with?(i) }
|
|
126
|
-
# Special content
|
|
127
|
-
case content.strip
|
|
128
|
-
when 'はじめに' # Introduction
|
|
129
|
-
coradoc.style = 'abstract' # The older version document has ".preface"
|
|
130
|
-
coradoc.level_int = 1
|
|
131
|
-
when '改定の概要' # Revision overview
|
|
132
|
-
coradoc.style = 'abstract' # The older version document has ".preface"
|
|
133
|
-
coradoc.level_int = 1
|
|
134
|
-
when '参考文献' # Bibliography
|
|
135
|
-
coradoc.style = 'bibliography'
|
|
136
|
-
coradoc.level_int = 1
|
|
137
|
-
when '改訂履歴' # Document history
|
|
138
|
-
coradoc.style = 'appendix'
|
|
139
|
-
coradoc.level_int = 1
|
|
140
|
-
when '0 概要' # Overview
|
|
141
|
-
coradoc.style = 'abstract' # I'm not sure this is correct
|
|
142
|
-
coradoc.level_int = 1
|
|
143
|
-
when '索引' # Index
|
|
144
|
-
coradoc.style = 'index' # I'm not sure this is correct
|
|
145
|
-
coradoc.level_int = 1
|
|
146
|
-
else
|
|
147
|
-
warn "Unknown section #{content.inspect}"
|
|
148
|
-
end
|
|
149
|
-
end
|
|
150
|
-
|
|
151
|
-
if node.name == 'h1' && content.start_with?('Annex')
|
|
152
|
-
coradoc.style = 'appendix'
|
|
153
|
-
coradoc.content.first.content.sub!(/\AAnnex [A-Z]/, '')
|
|
154
|
-
end
|
|
155
|
-
|
|
156
|
-
# Remove numbers
|
|
157
|
-
coradoc.content.first.content.sub!(
|
|
158
|
-
/\A(#{IM}\.)*#{IM}[[:space:]]/o,
|
|
159
|
-
''
|
|
160
|
-
)
|
|
161
|
-
|
|
162
|
-
coradoc
|
|
163
|
-
end
|
|
164
|
-
|
|
165
|
-
def handle_headers_h4(_node, coradoc, _state)
|
|
166
|
-
title = Coradoc.strip_unicode(coradoc.content.first.content)
|
|
167
|
-
case title
|
|
168
|
-
when /\A\(\d+\)(.*)/
|
|
169
|
-
coradoc.level_int = 4
|
|
170
|
-
coradoc.content.first.content = ::Regexp.last_match(1).strip
|
|
171
|
-
coradoc
|
|
172
|
-
when /\A\d+\)(.*)/
|
|
173
|
-
coradoc.level_int = 5
|
|
174
|
-
coradoc.content.first.content = ::Regexp.last_match(1).strip
|
|
175
|
-
coradoc
|
|
176
|
-
when /\A#{IM}\.#{IM}\.#{IM}\.#{IM}(.*)/o
|
|
177
|
-
coradoc.level_int = 4
|
|
178
|
-
coradoc.content.first.content = ::Regexp.last_match(1).strip
|
|
179
|
-
else
|
|
180
|
-
if title.empty?
|
|
181
|
-
# Strip instances of faulty empty paragraphs
|
|
182
|
-
nil
|
|
183
|
-
else
|
|
184
|
-
["// Unhandled h4 content\n", coradoc]
|
|
185
|
-
end
|
|
186
|
-
end
|
|
187
|
-
end
|
|
188
|
-
|
|
189
|
-
def postprocess_output_string
|
|
190
|
-
str = output_string
|
|
191
|
-
|
|
192
|
-
### Custom indentation handling
|
|
193
|
-
# If there's a step up, add [none]
|
|
194
|
-
str = str.gsub(%r{\s+//-ENDPT2D\s+//-PT3D\s+}, "\n[none]\n")
|
|
195
|
-
str = str.gsub(%r{\s+//-ENDPT2D\s+//-PT4D\s+}, "\n[none]\n")
|
|
196
|
-
str = str.gsub(%r{\s+//-ENDPT3D\s+//-PT4D\s+}, "\n[none]\n")
|
|
197
|
-
# Collapse blocks of text[2,3]data
|
|
198
|
-
str = str.gsub(%r{\s+//-ENDPT[234]D\s+//-PT[234]D\s+}, "\n\n")
|
|
199
|
-
# In the beginning, add [none]
|
|
200
|
-
str = str.gsub(%r{\s+//-PT[234]D\s+}, "\n\n[none]\n")
|
|
201
|
-
# If following with another list, ensure we readd styling
|
|
202
|
-
str = str.gsub(%r{\s+//-ENDPT[234]D\s+\*}, "\n\n[disc]\n*")
|
|
203
|
-
# Otherwise, clean up
|
|
204
|
-
str = str.gsub(%r{\s+//-ENDPT[234]D\s+}, "\n\n")
|
|
205
|
-
|
|
206
|
-
self.output_string = str
|
|
207
|
-
end
|
|
208
|
-
end
|
|
209
|
-
end
|
|
210
|
-
end
|
|
211
|
-
end
|
|
212
|
-
end
|
|
213
|
-
|
|
214
|
-
Coradoc::Input::Html.config.plugins << Coradoc::Input::Html::Plugin::Plateau
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Coradoc
|
|
4
|
-
module Input
|
|
5
|
-
module Html
|
|
6
|
-
# Postprocessor hook for CoreModel tree transformations after HTML parsing.
|
|
7
|
-
#
|
|
8
|
-
# Override or extend to apply post-parse cleanup. The default
|
|
9
|
-
# implementation returns the tree unchanged.
|
|
10
|
-
class Postprocessor
|
|
11
|
-
def self.process(coradoc)
|
|
12
|
-
new(coradoc).process
|
|
13
|
-
end
|
|
14
|
-
|
|
15
|
-
def initialize(coradoc)
|
|
16
|
-
@tree = coradoc
|
|
17
|
-
end
|
|
18
|
-
|
|
19
|
-
def process
|
|
20
|
-
@tree
|
|
21
|
-
end
|
|
22
|
-
end
|
|
23
|
-
end
|
|
24
|
-
end
|
|
25
|
-
end
|
data/lib/coradoc/html/input.rb
DELETED
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'digest'
|
|
4
|
-
require 'nokogiri'
|
|
5
|
-
require 'coradoc'
|
|
6
|
-
|
|
7
|
-
module Coradoc
|
|
8
|
-
module Input
|
|
9
|
-
module Html
|
|
10
|
-
# Autoload all components
|
|
11
|
-
autoload :Errors, 'coradoc/html/input/errors'
|
|
12
|
-
autoload :Cleaner, 'coradoc/html/input/cleaner'
|
|
13
|
-
autoload :Config, 'coradoc/html/input/config'
|
|
14
|
-
autoload :Plugin, 'coradoc/html/input/plugin'
|
|
15
|
-
autoload :Postprocessor, 'coradoc/html/input/postprocessor'
|
|
16
|
-
autoload :Converters, 'coradoc/html/input/converters'
|
|
17
|
-
autoload :HtmlConverter, 'coradoc/html/input/html_converter'
|
|
18
|
-
|
|
19
|
-
def self.convert(input, options = {})
|
|
20
|
-
HtmlConverter.to_core_model(input, options)
|
|
21
|
-
end
|
|
22
|
-
|
|
23
|
-
def self.to_coradoc(input, options = {})
|
|
24
|
-
HtmlConverter.to_core_model(input, options)
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
def self.config
|
|
28
|
-
@config ||= Config.new
|
|
29
|
-
yield @config if block_given?
|
|
30
|
-
@config
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
def self.cleaner
|
|
34
|
-
@cleaner ||= Cleaner.new
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
def self.processor_id
|
|
38
|
-
:html
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
extend Coradoc::Html::FormatDetection
|
|
42
|
-
|
|
43
|
-
def self.processor_match?(filename)
|
|
44
|
-
html_extension?(filename)
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
def self.processor_execute(input, options = {})
|
|
48
|
-
to_coradoc(input, options)
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
def self.processor_postprocess(data, options)
|
|
52
|
-
if options[:output_processor] == :adoc
|
|
53
|
-
data.transform_values { |v| clean_output(v, options) }
|
|
54
|
-
else
|
|
55
|
-
data
|
|
56
|
-
end
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
def self.clean_output(result, options = {})
|
|
60
|
-
config.with(options) do
|
|
61
|
-
plugin_instances = HtmlConverter.prepare_plugin_instances(options)
|
|
62
|
-
|
|
63
|
-
result = HtmlConverter.track_time('Cleaning up the result') do
|
|
64
|
-
cleaner.tidy(result)
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
plugin_instances.each do |plugin|
|
|
68
|
-
plugin.output_string = result
|
|
69
|
-
HtmlConverter.track_time("Postprocessing output string with #{plugin.name} plugin") do
|
|
70
|
-
plugin.postprocess_output_string
|
|
71
|
-
end
|
|
72
|
-
result = plugin.output_string
|
|
73
|
-
end
|
|
74
|
-
|
|
75
|
-
result
|
|
76
|
-
end
|
|
77
|
-
end
|
|
78
|
-
|
|
79
|
-
Coradoc::Input.define(self)
|
|
80
|
-
end
|
|
81
|
-
end
|
|
82
|
-
|
|
83
|
-
module Html
|
|
84
|
-
Input = Coradoc::Input::Html
|
|
85
|
-
end
|
|
86
|
-
end
|
data/lib/coradoc/html/output.rb
DELETED
|
@@ -1,89 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'coradoc'
|
|
4
|
-
|
|
5
|
-
module Coradoc
|
|
6
|
-
module Output
|
|
7
|
-
# Static HTML output processor
|
|
8
|
-
#
|
|
9
|
-
# Generates static HTML documents from CoreModel using the classic
|
|
10
|
-
# rendering approach without JavaScript frameworks.
|
|
11
|
-
#
|
|
12
|
-
# @example Using the processor directly
|
|
13
|
-
# html = Coradoc::Output::HtmlStatic.processor_execute({ "doc.html" => document }, {})
|
|
14
|
-
#
|
|
15
|
-
# @example Using through Output module
|
|
16
|
-
# result = Coradoc::Output.process(document, format: :html_static)
|
|
17
|
-
#
|
|
18
|
-
class HtmlStatic
|
|
19
|
-
extend Coradoc::Html::FormatDetection
|
|
20
|
-
|
|
21
|
-
class << self
|
|
22
|
-
def processor_id
|
|
23
|
-
:html_static
|
|
24
|
-
end
|
|
25
|
-
|
|
26
|
-
def processor_match?(filename)
|
|
27
|
-
html_extension?(filename)
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
# Process documents to static HTML
|
|
31
|
-
# @param input [Hash<String, Object>] mapping of filenames to documents
|
|
32
|
-
# @param options [Hash] processing options
|
|
33
|
-
# @return [Hash<String, String>] mapping of filenames to HTML output
|
|
34
|
-
def processor_execute(input, options = {})
|
|
35
|
-
result = {}
|
|
36
|
-
input.each do |filename, document|
|
|
37
|
-
html = Coradoc::Html::Static.convert(document, options)
|
|
38
|
-
result[filename] = html
|
|
39
|
-
end
|
|
40
|
-
result
|
|
41
|
-
end
|
|
42
|
-
end
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
# SPA (Single Page Application) HTML output processor
|
|
46
|
-
#
|
|
47
|
-
# Generates modern Vue.js + Tailwind CSS HTML documents from CoreModel.
|
|
48
|
-
#
|
|
49
|
-
# @example Using the processor directly
|
|
50
|
-
# html = Coradoc::Output::HtmlSpa.processor_execute({ "doc.html" => document }, {})
|
|
51
|
-
#
|
|
52
|
-
# @example Using through Output module
|
|
53
|
-
# result = Coradoc::Output.process(document, format: :html_spa)
|
|
54
|
-
#
|
|
55
|
-
class HtmlSpa
|
|
56
|
-
extend Coradoc::Html::FormatDetection
|
|
57
|
-
|
|
58
|
-
class << self
|
|
59
|
-
def processor_id
|
|
60
|
-
:html_spa
|
|
61
|
-
end
|
|
62
|
-
|
|
63
|
-
def processor_match?(filename)
|
|
64
|
-
html_extension?(filename)
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
# Process documents to SPA HTML
|
|
68
|
-
# @param input [Hash<String, Object>] mapping of filenames to documents
|
|
69
|
-
# @param options [Hash] processing options
|
|
70
|
-
# @return [Hash<String, String>] mapping of filenames to SPA HTML output
|
|
71
|
-
def processor_execute(input, options = {})
|
|
72
|
-
result = {}
|
|
73
|
-
input.each do |filename, document|
|
|
74
|
-
html = Coradoc::Html::Spa.convert(document, options)
|
|
75
|
-
result[filename] = html
|
|
76
|
-
end
|
|
77
|
-
result
|
|
78
|
-
end
|
|
79
|
-
end
|
|
80
|
-
end
|
|
81
|
-
|
|
82
|
-
# Alias for HtmlSpa
|
|
83
|
-
Spa = HtmlSpa
|
|
84
|
-
end
|
|
85
|
-
end
|
|
86
|
-
|
|
87
|
-
# Register processors with the Output module
|
|
88
|
-
Coradoc::Output.define(Coradoc::Output::HtmlStatic)
|
|
89
|
-
Coradoc::Output.define(Coradoc::Output::HtmlSpa)
|