coradoc-html 1.1.18 → 1.1.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/coradoc/html/cleaner.rb +128 -0
- data/lib/coradoc/html/converters/a.rb +77 -0
- data/lib/coradoc/html/converters/aside.rb +20 -0
- data/lib/coradoc/html/converters/audio.rb +19 -0
- data/lib/coradoc/html/converters/base.rb +98 -0
- data/lib/coradoc/html/converters/blockquote.rb +25 -0
- data/lib/coradoc/html/converters/br.rb +17 -0
- data/lib/coradoc/html/converters/bypass.rb +82 -0
- data/lib/coradoc/html/converters/code.rb +25 -0
- data/lib/coradoc/html/converters/div.rb +23 -0
- data/lib/coradoc/html/converters/dl.rb +82 -0
- data/lib/coradoc/html/converters/drop.rb +26 -0
- data/lib/coradoc/html/converters/em.rb +23 -0
- data/lib/coradoc/html/converters/figure.rb +33 -0
- data/lib/coradoc/html/converters/h.rb +58 -0
- data/lib/coradoc/html/converters/head.rb +29 -0
- data/lib/coradoc/html/converters/hr.rb +17 -0
- data/lib/coradoc/html/converters/img.rb +103 -0
- data/lib/coradoc/html/converters/li.rb +35 -0
- data/lib/coradoc/html/converters/mark.rb +21 -0
- data/lib/coradoc/html/converters/markup.rb +93 -0
- data/lib/coradoc/html/converters/math.rb +37 -0
- data/lib/coradoc/html/converters/media_base.rb +48 -0
- data/lib/coradoc/html/converters/ol.rb +42 -0
- data/lib/coradoc/html/converters/p.rb +64 -0
- data/lib/coradoc/html/converters/pass_through.rb +15 -0
- data/lib/coradoc/html/converters/positional_formatting.rb +35 -0
- data/lib/coradoc/html/converters/pre.rb +57 -0
- data/lib/coradoc/html/converters/q.rb +25 -0
- data/lib/coradoc/html/converters/strong.rb +22 -0
- data/lib/coradoc/html/converters/sub.rb +20 -0
- data/lib/coradoc/html/converters/sup.rb +20 -0
- data/lib/coradoc/html/converters/table.rb +64 -0
- data/lib/coradoc/html/converters/td.rb +42 -0
- data/lib/coradoc/html/converters/text.rb +66 -0
- data/lib/coradoc/html/converters/tr.rb +27 -0
- data/lib/coradoc/html/converters/video.rb +27 -0
- data/lib/coradoc/html/converters.rb +104 -0
- data/lib/coradoc/html/drop/drop_factory.rb +14 -22
- data/lib/coradoc/html/drop/inline_element_drop.rb +3 -5
- data/lib/coradoc/html/drop/raw_inline_element_drop.rb +30 -0
- data/lib/coradoc/html/drop.rb +30 -8
- data/lib/coradoc/html/errors.rb +11 -0
- data/lib/coradoc/html/html_converter.rb +78 -0
- data/lib/coradoc/html/input_config.rb +66 -0
- data/lib/coradoc/html/plugin.rb +90 -0
- data/lib/coradoc/html/plugins/plateau.rb +212 -0
- data/lib/coradoc/html/postprocessor.rb +19 -0
- data/lib/coradoc/html/spa.rb +0 -2
- data/lib/coradoc/html/static.rb +0 -2
- data/lib/coradoc/html/tag_mapping.rb +3 -1
- data/lib/coradoc/html/transform/from_core_model.rb +2 -2
- data/lib/coradoc/html/transform/to_core_model.rb +3 -3
- data/lib/coradoc/html/version.rb +1 -1
- data/lib/coradoc/html.rb +30 -5
- metadata +46 -47
- data/lib/coradoc/html/input/cleaner.rb +0 -134
- data/lib/coradoc/html/input/config.rb +0 -80
- data/lib/coradoc/html/input/converters/a.rb +0 -79
- data/lib/coradoc/html/input/converters/aside.rb +0 -22
- data/lib/coradoc/html/input/converters/audio.rb +0 -21
- data/lib/coradoc/html/input/converters/base.rb +0 -118
- data/lib/coradoc/html/input/converters/blockquote.rb +0 -27
- data/lib/coradoc/html/input/converters/br.rb +0 -19
- data/lib/coradoc/html/input/converters/bypass.rb +0 -84
- data/lib/coradoc/html/input/converters/code.rb +0 -27
- data/lib/coradoc/html/input/converters/div.rb +0 -25
- data/lib/coradoc/html/input/converters/dl.rb +0 -84
- data/lib/coradoc/html/input/converters/drop.rb +0 -28
- data/lib/coradoc/html/input/converters/em.rb +0 -25
- data/lib/coradoc/html/input/converters/figure.rb +0 -35
- data/lib/coradoc/html/input/converters/h.rb +0 -74
- data/lib/coradoc/html/input/converters/head.rb +0 -31
- data/lib/coradoc/html/input/converters/hr.rb +0 -19
- data/lib/coradoc/html/input/converters/img.rb +0 -105
- data/lib/coradoc/html/input/converters/li.rb +0 -37
- data/lib/coradoc/html/input/converters/mark.rb +0 -23
- data/lib/coradoc/html/input/converters/markup.rb +0 -103
- data/lib/coradoc/html/input/converters/math.rb +0 -39
- data/lib/coradoc/html/input/converters/media_base.rb +0 -50
- data/lib/coradoc/html/input/converters/ol.rb +0 -44
- data/lib/coradoc/html/input/converters/p.rb +0 -90
- data/lib/coradoc/html/input/converters/pass_through.rb +0 -17
- data/lib/coradoc/html/input/converters/positional_formatting.rb +0 -37
- data/lib/coradoc/html/input/converters/pre.rb +0 -59
- data/lib/coradoc/html/input/converters/q.rb +0 -27
- data/lib/coradoc/html/input/converters/strong.rb +0 -24
- data/lib/coradoc/html/input/converters/sub.rb +0 -22
- data/lib/coradoc/html/input/converters/sup.rb +0 -22
- data/lib/coradoc/html/input/converters/table.rb +0 -66
- data/lib/coradoc/html/input/converters/td.rb +0 -44
- data/lib/coradoc/html/input/converters/text.rb +0 -68
- data/lib/coradoc/html/input/converters/tr.rb +0 -29
- data/lib/coradoc/html/input/converters/video.rb +0 -29
- data/lib/coradoc/html/input/converters.rb +0 -107
- data/lib/coradoc/html/input/errors.rb +0 -22
- data/lib/coradoc/html/input/html_converter.rb +0 -98
- data/lib/coradoc/html/input/plugin.rb +0 -120
- data/lib/coradoc/html/input/plugins/plateau.rb +0 -214
- data/lib/coradoc/html/input/postprocessor.rb +0 -25
- data/lib/coradoc/html/input.rb +0 -86
- data/lib/coradoc/html/output.rb +0 -89
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 011622bc6889a0af8aadea9d1a13e4cd4e322b808f5afe95b5da4a785a4b720f
|
|
4
|
+
data.tar.gz: ffe601423e5ab805e854e150744983336c49d217858a9f4d5aa660ced64d9f9b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0225afbdd3e517cb260bad9edc7a469605918937faebcc5a425b202e61ac36a6e66d96304defbca5562aa49421fdb20d436c0d47241ab272a0c19344ae839940
|
|
7
|
+
data.tar.gz: c2847f907efb900fb4ccef551254e808cc042c7aa5d8af2c0439a493fa560b58e07ed55d643a9c915d4cd6f765b1c7d6cff498e6408fee1e8f242b4833b40ff5
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Html
|
|
5
|
+
class Cleaner
|
|
6
|
+
INNER_WHITESPACE_REGEX_1 = /\n stem:\[/
|
|
7
|
+
INNER_WHITESPACE_REGEX_2 = /(stem:\[([^\]]|\\\])*\])\n(?=\S)/
|
|
8
|
+
NEWLINES_REGEX = /\n{3,}/
|
|
9
|
+
LEADING_NEWLINE_REGEX = /\A\n+/
|
|
10
|
+
WHITESPACE_REGEX = /[ \t\r\n]+/
|
|
11
|
+
TRAILING_WHITESPACE_REGEX = /[ \t\r\n]+\z/
|
|
12
|
+
|
|
13
|
+
# Run every clean-up pass over converted text, in a fixed order.
#
# A Hash is cleaned value by value (keys are kept). Any other argument is
# copied with String.new before the first pass runs.
#
# @param string [String, Hash] converted text, or a Hash whose values are text
# @return [String, Hash] the cleaned text, or a Hash of cleaned values
def tidy(string)
  return string.transform_values { |value| tidy(value) } if string.is_a?(Hash)

  # These passes are wrapped in HtmlConverter.track_time under the given label.
  timed_passes = [
    ['Removing inner whitespace', ->(text) { remove_inner_whitespaces(String.new(text)) }],
    ['Removing newlines', ->(text) { remove_newlines(text) }],
    ['Removing leading newlines', ->(text) { remove_leading_newlines(text) }],
    ['Cleaning tag borders', ->(text) { clean_tag_borders(text) }],
    ['Cleaning punctuation characters', ->(text) { clean_punctuation_characters(text) }]
  ]
  cleaned = timed_passes.reduce(string) do |text, (label, pass)|
    HtmlConverter.track_time(label) { pass.call(text) }
  end

  # The last two passes run untimed.
  remove_section_attribute_newlines(remove_block_leading_newlines(cleaned))
end
|
|
34
|
+
|
|
35
|
+
# Drop the blank line that directly follows a "]" line and a "****" line.
#
# @param string [String] text to clean
# @return [String] a new string with those blank lines removed
def remove_block_leading_newlines(string)
  padded_opening = "]\n****\n\n"
  tight_opening = "]\n****\n"
  string.gsub(padded_opening, tight_opening)
end
|
|
38
|
+
|
|
39
|
+
# Drop the blank line between a line ending in "]" and a following line
# that starts with "==".
#
# @param string [String] text to clean
# @return [String] a new string with those blank lines removed
def remove_section_attribute_newlines(string)
  separated = "]\n\n=="
  adjacent = "]\n=="
  string.gsub(separated, adjacent)
end
|
|
42
|
+
|
|
43
|
+
# Replace every match of NEWLINES_REGEX with exactly two newlines.
#
# @param string [String] text to clean
# @return [String] a new string
def remove_newlines(string)
  string.gsub(NEWLINES_REGEX) { "\n\n" }
end
|
|
46
|
+
|
|
47
|
+
# Remove the match of LEADING_NEWLINE_REGEX from the text.
#
# The pattern is anchored with \A, so it can match at most once; a single
# substitution is therefore enough.
#
# @param string [String] text to clean
# @return [String] a new string
def remove_leading_newlines(string)
  string.sub(LEADING_NEWLINE_REGEX, '')
end
|
|
50
|
+
|
|
51
|
+
# Collapse runs of spaces/tabs inside each line while keeping the line's own
# leading and trailing whitespace, after two "stem:[" fix-ups.
#
# Fix-ups applied to the whole text first:
# * a newline, one space and "stem:[" loses the space (INNER_WHITESPACE_REGEX_1);
# * a newline directly after a "stem:[...]" span and before a non-space
#   character is removed (INNER_WHITESPACE_REGEX_2).
#   NOTE(review): no space is inserted in its place, so the span is joined
#   to the following text - confirm this is intended.
#
# The argument is never modified; nil yields an empty string.
#
# @param string [String, nil] text to clean
# @return [String] a new, mutable string
def remove_inner_whitespaces(string)
  return +'' if string.nil?

  text = string.gsub(INNER_WHITESPACE_REGEX_1, "\nstem:[")
               .gsub(INNER_WHITESPACE_REGEX_2, '\\1')

  text.each_line.with_object(+'') do |line, result|
    collapsed = preserve_border_whitespaces(line) do
      line.gsub(/\A[ \t\r\n]+/, '').gsub(/[ \t\r\n]+\z/, '').gsub(/[ \t]{2,}/, ' ')
    end
    result << collapsed
  end
end
|
|
65
|
+
|
|
66
|
+
# Tighten the padding just inside "~~...~~" spans and "[...]" spans.
#
# Each matched span (including one optional whitespace character on either
# side) is stripped, loses the first space after its opening marker and the
# first space before its closing marker, and is then re-wrapped by
# preserve_border_whitespaces. For tilde spans the configured
# Html.input_config.tag_border is offered as the default border.
#
# @param string [String] text to clean
# @return [String] a new string
def clean_tag_borders(string)
  tilde_span = /\s?~{2,}.*?~{2,}\s?/
  bracket_span = /\s?\[.*?\]\s?/

  tildes_cleaned = string.gsub(tilde_span) do |span|
    preserve_border_whitespaces(span, default_border: Html.input_config.tag_border) do
      span.strip.sub('~~ ', '~~').sub(' ~~', '~~')
    end
  end

  tildes_cleaned.gsub(bracket_span) do |span|
    preserve_border_whitespaces(span) { span.strip.sub('[ ', '[').sub(' ]', ']') }
  end
end
|
|
82
|
+
|
|
83
|
+
# Remove the single whitespace character between a "**", "~~" or "__"
# marker and a following ".", "!", "?", "'" or '"'.
#
# @param string [String] text to clean
# @return [String] a new string
def clean_punctuation_characters(string)
  string.gsub(/(\*\*|~~|__)\s([.!?'"])/) do
    marker, punctuation = Regexp.last_match.captures
    marker + punctuation
  end
end
|
|
86
|
+
|
|
87
|
+
# Prepare HTML for conversion: scrub whitespace, then clean headings.
#
# Works on a duplicate, so the argument itself is left untouched.
#
# @param string [String] raw HTML
# @return [String] the preprocessed HTML
def preprocess_word_html(string)
  working_copy = string.dup
  scrubbed = scrub_whitespace(working_copy)
  clean_headings(scrubbed)
end
|
|
90
|
+
|
|
91
|
+
# Normalise whitespace in raw HTML before it is parsed.
#
# Steps, in order:
# 1. rewrite the no-break space, in its "&nbsp;" / "&#xA0;" entity forms or
#    as the literal U+00A0 character, to the numeric entity "&#160;";
# 2. pass the text through Coradoc.strip_unicode;
# 3. reduce a run of spaces at the end of a line to a single space;
# 4. reduce four consecutive newlines to two.
#
# NOTE(review): in the published diff the pattern and replacement of step 1
# appear as bare whitespace, which would make the substitution a no-op. The
# entity forms here are written with explicit escapes so they cannot be
# decoded away again - confirm both against the repository source.
#
# The first substitution modifies the argument in place.
#
# @param string [String] raw, mutable HTML
# @return [String] the scrubbed HTML
def scrub_whitespace(string)
  string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, '&#160;')
  string = Coradoc.strip_unicode(string)
  string.gsub!(/( +)$/, ' ')
  string.gsub!("\n\n\n\n", "\n\n")
  string
end
|
|
98
|
+
|
|
99
|
+
# Clean up two heading shapes, modifying the string in place.
#
# * An <h1>..<h9> element with no content becomes a single space.
# * An <h1>..<h9> element whose attributes contain
#   ' style="vertical-align: super;' becomes <sup>content</sup>.
#
# @param string [String] mutable HTML
# @return [String] the same string object
def clean_headings(string)
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  superscript_heading = %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}

  string.gsub!(empty_heading) { ' ' }
  string.gsub!(superscript_heading) { "<sup>#{Regexp.last_match(2)}</sup>" }
  string
end
|
|
107
|
+
|
|
108
|
+
private
|
|
109
|
+
|
|
110
|
+
# Re-attach a string's own leading and trailing whitespace around the
# value produced by the block.
#
# A whitespace-only string is returned as is and the block is not called.
# Where the string has no leading (or trailing) whitespace, the
# :default_border option is used instead - unless the string contains any
# of "[", "]", "(" or ")", in which case the fallback is empty.
#
# @param string [String] the original text
# @param options [Hash] :default_border => String (defaults to '')
# @yieldreturn [String] the cleaned inner text
# @return [String]
def preserve_border_whitespaces(string, options = {})
  return string if /\A\s*\Z/.match?(string)

  configured_border = options.fetch(:default_border, '')
  fallback = /[\[(\])]/.match?(string) ? '' : configured_border

  leading = present_or_default(string[/\A\s*/], fallback)
  trailing = present_or_default(string[/\s*\Z/], fallback)
  inner = yield
  leading + inner + trailing
end
|
|
120
|
+
|
|
121
|
+
# Return the string, or the default when the string is nil or empty.
#
# @param string [String, nil]
# @param default [Object] value used for a nil or empty string
# @return [Object]
def present_or_default(string, default)
  string.nil? || string.empty? ? default : string
end
|
|
126
|
+
end
|
|
127
|
+
end
|
|
128
|
+
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'coradoc'
|
|
4
|
+
|
|
5
|
+
module Coradoc
|
|
6
|
+
module Html
|
|
7
|
+
module Converters
|
|
8
|
+
# Converter for <a> elements.
#
# Depending on its attributes the element becomes an inline anchor, a
# cross-reference or a link; in some cases nothing is produced (nil).
class A < Base
  INSTANCE = new

  # @param node [#[]] the <a> element; reads href, title, id and name
  # @param state [Hash] conversion state handed on to the child converters
  # @return [Object, Array, nil] an InlineElement (anchor), a
  #   CrossReferenceElement, a LinkElement, an Array of a single-space
  #   TextElement followed by the LinkElement, or nil
  def to_coradoc(node, state = {})
    children = treat_children_coradoc(node, state)
    href = node['href']
    title = extract_title(node)

    # The id (or name) loses all whitespace; runs of underscores collapse.
    anchor_id = (node['id'] || node['name'])&.gsub(/\s/, '')&.gsub(/__+/, '_')
    anchor_id = nil if anchor_id&.empty?

    # Ids of the form _Toc<digits> or _GoBack produce no output at all.
    return nil if /^_Toc\d+$|^_GoBack$/.match?(anchor_id)

    return Coradoc::CoreModel::InlineElement.new(format_type: 'anchor', target: anchor_id) if anchor_id
    return cross_reference_for(href, children) if href.to_s.start_with?('#')
    return nil if href.to_s.empty?

    link_for(node, href, title, children)
  end

  private

  # Build the cross-reference for an href that starts with "#".
  def cross_reference_for(href, children)
    target = href.sub(/^#/, '').gsub(/\s/, '').gsub(/__+/, '_')
    label = extract_text_from_content(children).strip
    Coradoc::CoreModel::CrossReferenceElement.new(
      target: target,
      content: label.empty? ? nil : label
    )
  end

  # Build the link, preceded by a single-space text element when the text
  # node right before the <a> ends with one of the listed characters.
  def link_for(node, href, title, children)
    ambiguous_characters = /[\w.?&#=%;\[\u{ff}-\u{10ffff}]/
    right_constrain = textnode_after_start_with?(node, ambiguous_characters)
    label = extract_text_from_content(children)

    elements = []
    if textnode_before_end_with?(node, ambiguous_characters)
      elements << Coradoc::CoreModel::TextElement.new(content: ' ')
    end

    metadata = {
      title: (title.strip unless title.to_s.strip.empty?),
      right_constrain: right_constrain
    }.compact
    elements << Coradoc::CoreModel::LinkElement.new(
      target: href,
      content: label.strip,
      metadata: metadata
    )

    elements.length == 1 ? elements.first : elements
  end
end

register :a, A::INSTANCE
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Html
|
|
5
|
+
module Converters
|
|
6
|
+
# Converter for <aside> elements: the converted children are wrapped in a
# CoreModel::SidebarBlock.
class Aside < Base
  INSTANCE = new

  # @param node [Object] the <aside> element
  # @param state [Hash] conversion state handed on to the child converters
  # @return [Coradoc::CoreModel::SidebarBlock]
  def to_coradoc(node, state = {})
    Coradoc::CoreModel::SidebarBlock.new(children: treat_children_coradoc(node, state))
  end
end

register :aside, Aside::INSTANCE
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Html
|
|
5
|
+
module Converters
|
|
6
|
+
# Converter for <audio> elements; everything except the semantic type comes
# from MediaBase.
class Audio < MediaBase
  INSTANCE = new

  # @return [Symbol] the semantic type reported for this converter
  private def semantic_type
    :audio
  end
end

register :audio, Audio::INSTANCE
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Html
|
|
5
|
+
module Converters
|
|
6
|
+
class Base
|
|
7
|
+
# Convert every child of the node and return the results as one flat list.
#
# nil results and Strings that are empty after stripping are left out.
#
# @param node [#children] the parent node
# @param state [Hash] conversion state handed on to each child
# @return [Array] the converted children
def treat_children_coradoc(node, state)
  converted = node.children.map { |child| treat_coradoc(child, state) }.flatten

  converted.reject do |item|
    next true if item.nil?

    item.is_a?(String) && item.strip.empty?
  end
end
|
|
16
|
+
|
|
17
|
+
# Convert a single node by handing it to Converters.process_coradoc.
#
# @param node [Object] the node to convert
# @param state [Hash] conversion state
# @return [Object] whatever Converters.process_coradoc returns
def treat_coradoc(node, state)
  dispatcher = Converters
  dispatcher.process_coradoc(node, state)
end
|
|
20
|
+
|
|
21
|
+
# Read the node's "title" attribute as a String.
#
# @param node [#[]] the element
# @return [String] the attribute value, or "" when it is absent
def extract_title(node)
  title_attribute = node['title']
  title_attribute.to_s
end
|
|
24
|
+
|
|
25
|
+
# Tell whether the node has an ancestor with the given name(s).
#
# @param node [#ancestors] the node to inspect
# @param name [String, Array<String>] one selector, or several of which
#   any may match
# @return [Boolean, nil] nil when name is neither a String nor an Array
def node_has_ancestor?(node, name)
  if name.is_a?(String)
    node.ancestors(name).any?
  elsif name.is_a?(Array)
    name.any? { |candidate| node.ancestors(candidate).any? }
  end
end
|
|
33
|
+
|
|
34
|
+
# Tell whether the sibling right before the node is a text node whose text
# ends with the given string or pattern.
#
# @param node [#at_xpath] the node to inspect
# @param str [String, Regexp] literal text or pattern; only exactly these
#   two classes are accepted, and an empty String never matches
# @return [Boolean]
def textnode_before_end_with?(node, str)
  return false unless [String, Regexp].include?(str.class)
  return false if str.is_a?(String) && str.empty?

  inner = str.is_a?(String) ? /#{Regexp.escape(str)}/ : str
  suffix_pattern = /(?:#{inner})\z/

  previous = node.at_xpath('preceding-sibling::node()[1]')
  previous.is_a?(Nokogiri::XML::Text) && previous.text.match?(suffix_pattern)
end
|
|
44
|
+
|
|
45
|
+
# Tell whether the sibling right after the node is a text node whose text
# starts with the given string or pattern.
#
# @param node [#at_xpath] the node to inspect
# @param str [String, Regexp] literal text or pattern; only exactly these
#   two classes are accepted, and an empty String never matches
# @return [Boolean]
def textnode_after_start_with?(node, str)
  return false unless [String, Regexp].include?(str.class)
  return false if str.is_a?(String) && str.empty?

  inner = str.is_a?(String) ? /#{Regexp.escape(str)}/ : str
  prefix_pattern = /\A(?:#{inner})/

  following = node.at_xpath('following-sibling::node()[1]')
  following.is_a?(Nokogiri::XML::Text) && following.text.match?(prefix_pattern)
end
|
|
55
|
+
|
|
56
|
+
# Strip whitespace from the outer edges of the node's text and report what
# was there.
#
# When the node's text starts with whitespace, its first direct text child
# (if any) is replaced by a left-stripped copy; when the text ends with
# whitespace, its last direct text child (if any) is replaced by a
# right-stripped copy. The end of the text is checked after the leading
# side has been handled.
#
# The checks are anchored to the whole text (\A / \z). Line anchors would
# also fire on whitespace next to a line break in the middle of the text.
#
# @param node [#text, #at_xpath] the element to trim
# @return [Array(String, String)] a pair [leading, trailing]; each entry is
#   a single space when whitespace was found on that side, otherwise nil
def extract_leading_trailing_whitespace(node)
  leading_whitespace = nil
  if node.text.match?(/\A\s/)
    first_text = node.at_xpath('./text()[1]')
    first_text&.replace(first_text.text.lstrip)
    leading_whitespace = ' '
  end

  trailing_whitespace = nil
  if node.text.match?(/\s\z/)
    last_text = node.at_xpath('./text()[last()]')
    last_text&.replace(last_text.text.rstrip)
    trailing_whitespace = ' '
  end

  [leading_whitespace, trailing_whitespace]
end
|
|
73
|
+
|
|
74
|
+
# Tell whether the node that precedes this one in document order has
# non-blank text ending in a word character.
#
# @param node [#at_xpath] the node to inspect
# @return [Boolean, nil] nil when there is no preceding node
def unconstrained_before?(node)
  previous = node.at_xpath('preceding::node()[1]')
  return previous unless previous
  return false if previous.text.strip.empty?

  previous.text[-1]&.match?(/\w/)
end
|
|
81
|
+
|
|
82
|
+
# Tell whether the node that follows this one in document order has
# non-blank text starting with a word character or one of , ; " . ? !
#
# The punctuation is tested with a character class. Written as the
# alternative \.\?! it would be the three-character sequence ".?!", which
# can never match the single character examined here.
#
# @param node [#at_xpath] the node to inspect
# @return [Boolean, nil] nil when there is no following node
def unconstrained_after?(node)
  after = node.at_xpath('following::node()[1]')

  after && !after.text.strip.empty? &&
    after.text[0]&.match?(/[\w,;".?!]/)
end
|
|
88
|
+
|
|
89
|
+
# Extract plain text from a mixed content array. Delegates to
|
|
90
|
+
# CoreModel::InlineContent.text_of — single source of truth for
|
|
91
|
+
# nil/Array/InlineElement/StructuralElement handling.
|
|
92
|
+
def extract_text_from_content(content)
  inline_content = Coradoc::CoreModel::InlineContent
  inline_content.text_of(content)
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Html
|
|
5
|
+
module Converters
|
|
6
|
+
# Converter for <blockquote> elements: the converted children become a
# CoreModel::QuoteBlock carrying the element's id and cite attributes.
class Blockquote < Base
  INSTANCE = new

  # @param node [#[]] the <blockquote> element; reads id and cite
  # @param state [Hash] conversion state handed on to the child converters
  # @return [Coradoc::CoreModel::QuoteBlock]
  def to_coradoc(node, state = {})
    attributes = { id: node['id'], attribution: node['cite'] }
    quoted = treat_children_coradoc(node, state)

    Coradoc::CoreModel::QuoteBlock.new(children: quoted, **attributes)
  end
end

register :blockquote, Blockquote::INSTANCE
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Html
|
|
5
|
+
module Converters
|
|
6
|
+
# Converter for <br> elements: always a fresh CoreModel::LineBreakElement.
class Br < Base
  INSTANCE = new

  # Neither argument is used.
  #
  # @return [Coradoc::CoreModel::LineBreakElement]
  def to_coradoc(_node, _state = {})
    line_break = Coradoc::CoreModel::LineBreakElement
    line_break.new
  end
end

register :br, Br::INSTANCE
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Html
|
|
5
|
+
module Converters
|
|
6
|
+
# Converter that emits nothing for the element itself and simply returns
# its converted children.
class Bypass < Base
  INSTANCE = new

  # @param node [Object] the element to pass through
  # @param state [Hash] conversion state handed on to the child converters
  # @return [Array] the converted children
  def to_coradoc(node, state = {})
    treat_children_coradoc(node, state)
  end
end

# Tags handled by Bypass, registered in this order.
%i[
  document html body span thead tbody tfoot
  abbr acronym address applet map area bdi bdo big button canvas
  data datalist del ins dfn dialog embed fieldset font footer form
  frame frameset header iframe input label legend main menu menulist
  meter nav noframes noscript object optgroup option output param
  picture progress ruby rt rp s select small strike
  details section summary svg template textarea track u wbr
].each { |tag| register tag, Bypass::INSTANCE }
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Html
|
|
5
|
+
module Converters
|
|
6
|
+
# Markup converter for the monospace family of tags.
class Code < Markup
  INSTANCE = new

  # @return [String] the format type reported for this converter
  def coradoc_format_type
    'monospace'
  end

  # @return [Array<String>] the tag names reported as markup ancestors
  def markup_ancestor_tag_names
    %w[code tt kbd samp var]
  end
end

%i[code tt kbd samp var].each { |tag| register tag, Code::INSTANCE }
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Html
|
|
5
|
+
module Converters
|
|
6
|
+
# Converter for <div> elements: the converted children become a
# CoreModel::OpenBlock carrying the element's id.
class Div < Base
  INSTANCE = new

  # @param node [#[]] the <div> element; reads id
  # @param state [Hash] conversion state handed on to the child converters
  # @return [Coradoc::CoreModel::OpenBlock]
  def to_coradoc(node, state = {})
    block_id = node['id']
    Coradoc::CoreModel::OpenBlock.new(
      children: treat_children_coradoc(node, state),
      id: block_id
    )
  end
end

register :div, Div::INSTANCE
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Html
|
|
5
|
+
module Converters
|
|
6
|
+
class Dl < Base
|
|
7
|
+
INSTANCE = new
|
|
8
|
+
|
|
9
|
+
# Convert a <dl> element into a CoreModel::ListBlock with marker type
# "definition".
#
# Every group found by process_dl becomes one ListItem: the plain text of
# the term(s) is its content, the converted definition content its children.
#
# @param node [Object] the <dl> element
# @param state [Hash] conversion state handed on to the child converters
# @return [Coradoc::CoreModel::ListBlock]
def to_coradoc(node, state = {})
  entries = process_dl(node, state).map do |group|
    Coradoc::CoreModel::ListItem.new(
      content: extract_text_from_content(group[:name]),
      children: group[:value]
    )
  end

  Coradoc::CoreModel::ListBlock.new(marker_type: 'definition', items: entries)
end
|
|
28
|
+
|
|
29
|
+
# Collect the term/definition groups of a <dl> element.
#
# Direct <dt>/<dd> children are read in order; a <div> child is treated as
# a transparent wrapper and its element children are read instead. Other
# elements are ignored. A group consists of the content of consecutive
# <dt> elements and of the <dd> elements that follow them; the first <dt>
# after a <dd> starts the next group.
#
# A group is added to the result as soon as it has both a term and a
# definition, and only once: further <dd> elements extend the group that
# is already in the list. Every group is kept, also when several of them
# share one <div>.
#
# @param node [#at_xpath] the <dl> element
# @param state [Hash] conversion state handed on to the child converters
# @return [Array<Hash>] groups shaped { name: [...], value: [...] }
def process_dl(node, state = {})
  groups = []
  current = { name: [], value: [] }
  seen_dd = false

  visit = lambda do |element|
    next unless %w[dt dd].include?(element.name)

    groups, current, seen_dd = process_dt_or_dd(groups, current, seen_dd, element, state)
    complete = current[:name].any? && current[:value].any?
    # The group being built can only ever be the most recently added one.
    groups << current if complete && !groups.last.equal?(current)
  end

  child = node.at_xpath('*[1]')
  until child.nil?
    if child.name == 'div'
      grandchild = child.at_xpath('*[1]')
      until grandchild.nil?
        visit.call(grandchild)
        grandchild = grandchild.at_xpath('following-sibling::*[1]')
      end
    else
      visit.call(child)
    end
    child = child.at_xpath('following-sibling::*[1]')
  end

  groups
end
|
|
63
|
+
|
|
64
|
+
# Fold one <dt> or <dd> element into the group being built.
#
# A <dt> that comes after a <dd> starts a fresh group; otherwise its
# converted content is appended to the current group's :name. A <dd>
# appends its converted content to :value. Elements with any other name
# leave everything unchanged.
#
# @param groups [Array<Hash>] groups collected so far (returned as is)
# @param current [Hash] the group being built, { name: [...], value: [...] }
# @param seen_dd [Boolean] whether the current group already has a <dd>
# @param subnode [#name] the element to process
# @param state [Hash] conversion state handed on to the child converters
# @return [Array] the triple [groups, current, seen_dd]
def process_dt_or_dd(groups, current, seen_dd, subnode, state = {})
  case subnode.name
  when 'dt'
    if seen_dd
      current = { name: [], value: [] }
      seen_dd = false
    end
    current[:name] += treat_children_coradoc(subnode, state)
  when 'dd'
    current[:value] += treat_children_coradoc(subnode, state)
    seen_dd = true
  end

  [groups, current, seen_dd]
end
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
register :dl, Dl::INSTANCE
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Html
|
|
5
|
+
module Converters
|
|
6
|
+
# Converter that discards the element: the result is always an empty string.
class Skip < Base
  INSTANCE = new

  # Neither argument is used.
  #
  # @return [String] the empty string
  def to_coradoc(_node, _state = {})
    ''
  end
end

# Tags handled by Skip, registered in this order.
%i[
  caption figcaption title link style meta script comment colgroup col
].each { |tag| register tag, Skip::INSTANCE }
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Html
|
|
5
|
+
module Converters
|
|
6
|
+
# Markup converter for the italic family of tags.
class Em < Markup
  INSTANCE = new

  # @return [String] the format type reported for this converter
  def coradoc_format_type
    'italic'
  end

  # @return [Array<String>] the tag names reported as markup ancestors
  def markup_ancestor_tag_names
    %w[em i cite]
  end
end

%i[em i cite].each { |tag| register tag, Em::INSTANCE }
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|