jekyll-l10n 1.3.15 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/jekyll-l10n/po_file/writer.rb +1 -1
- data/lib/jekyll-l10n/translation/block_text_extractor.rb +3 -3
- data/lib/jekyll-l10n/translation/html_translator.rb +42 -7
- data/lib/jekyll-l10n/utils/html_text_utils.rb +21 -3
- data/lib/jekyll-l10n/utils/translation_resolver.rb +24 -17
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1c8edcdac30ed6d89b587d914f1b8038ffd3b10a95a0e15cc43965dd915e39f2
|
|
4
|
+
data.tar.gz: 513218e1e01c3814a2f7130f60a6b35463d0836659ee13d85ae3d7c9a0aaf6c5
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 24586cc374d7ad819cbda298c98d9dc70438b60bc7afafcc6d695190bb62dc6294ddfd4bb02fb4f04c57c3231cea3fe12047dc9bdf3eb0de5390c6e4fb7a0970
|
|
7
|
+
data.tar.gz: f5f5bb5755c8354794fd81090c74c0519f06fb15c216cc1b9ba748e5837dcef7b7a05c324d5f67feef8c544884e17a936d492c76132ac26cb7f0d6f0bafa22e6
|
|
@@ -19,7 +19,6 @@ module Jekyll
|
|
|
19
19
|
# * Remove nested block elements from text
|
|
20
20
|
# * Remove empty icon tags (external link markers)
|
|
21
21
|
# * Normalize and validate extracted text
|
|
22
|
-
# * Decode HTML entities
|
|
23
22
|
#
|
|
24
23
|
# @example
|
|
25
24
|
# text = BlockTextExtractor.extract(paragraph_node)
|
|
@@ -31,7 +30,8 @@ module Jekyll
|
|
|
31
30
|
#
|
|
32
31
|
# Returns nil if element is not extractable or if extracted text fails
|
|
33
32
|
# validation. Clones the node, removes nested block elements and empty
|
|
34
|
-
# icon tags, normalizes whitespace,
|
|
33
|
+
# icon tags, normalizes whitespace, and validates. HTML entities are
|
|
34
|
+
# preserved verbatim to match the keys produced by the extraction pipeline.
|
|
35
35
|
#
|
|
36
36
|
# @param node [Nokogiri::XML::Element] DOM element to extract from
|
|
37
37
|
# @return [String, nil] Normalized text from element, or nil if not valid
|
|
@@ -39,11 +39,11 @@ module Jekyll
|
|
|
39
39
|
return nil unless extractable?(node)
|
|
40
40
|
|
|
41
41
|
clone = node.dup
|
|
42
|
+
HtmlTextUtils.remove_code_blocks(clone)
|
|
42
43
|
HtmlTextUtils.remove_block_elements(clone)
|
|
43
44
|
HtmlTextUtils.remove_empty_icon_tags(clone)
|
|
44
45
|
|
|
45
46
|
text = TextNormalizer.normalize(clone.inner_html).strip
|
|
46
|
-
text = HtmlTextUtils.decode_html_entities(text)
|
|
47
47
|
|
|
48
48
|
TextValidator.valid?(text) ? text : nil
|
|
49
49
|
end
|
|
@@ -108,7 +108,7 @@ module Jekyll
|
|
|
108
108
|
normalized_text = TextNormalizer.normalize(text)
|
|
109
109
|
translated = TranslationResolver.resolve(node, normalized_text, translations)
|
|
110
110
|
|
|
111
|
-
return if apply_block_level_translation?(node, normalized_text,
|
|
111
|
+
return if apply_block_level_translation?(node, normalized_text, translations)
|
|
112
112
|
|
|
113
113
|
if @debug_logging
|
|
114
114
|
log_translation_debug_info(text, normalized_text, translated,
|
|
@@ -147,9 +147,15 @@ module Jekyll
|
|
|
147
147
|
"[HtmlTranslator] Translation: #{translation[0..truncate_length]}..."
|
|
148
148
|
end
|
|
149
149
|
|
|
150
|
+
inject_translation_preserving_structure(parent, translation)
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
def inject_translation_preserving_structure(parent, translation)
|
|
154
|
+
preserved_blocks = parent.css('pre').map(&:dup)
|
|
150
155
|
preserved_anchors = parent.css('.heading-anchor').map(&:dup)
|
|
151
156
|
parent.children.each(&:remove)
|
|
152
157
|
parent.inner_html = translation
|
|
158
|
+
preserved_blocks.each { |b| parent.add_child(b) }
|
|
153
159
|
preserved_anchors.each { |a| parent.add_child(a) } if parent.css('.heading-anchor').empty?
|
|
154
160
|
end
|
|
155
161
|
|
|
@@ -204,18 +210,45 @@ module Jekyll
|
|
|
204
210
|
@debug_logging && text.include?('attribute')
|
|
205
211
|
end
|
|
206
212
|
|
|
207
|
-
def apply_block_level_translation?(node, normalized_text,
|
|
208
|
-
|
|
213
|
+
def apply_block_level_translation?(node, normalized_text, translations)
|
|
214
|
+
ancestor = find_content_element_ancestor(node)
|
|
215
|
+
return false unless ancestor
|
|
209
216
|
|
|
210
|
-
return false if TranslationResolver.contains_protected_elements?(
|
|
217
|
+
return false if TranslationResolver.contains_protected_elements?(ancestor)
|
|
211
218
|
|
|
212
|
-
block_text = BlockTextExtractor.extract(
|
|
219
|
+
block_text = BlockTextExtractor.extract(ancestor)
|
|
213
220
|
return false unless block_text && block_text != normalized_text
|
|
214
221
|
|
|
215
|
-
|
|
222
|
+
translation = select_block_translation(node, ancestor, normalized_text, block_text,
|
|
223
|
+
translations)
|
|
224
|
+
return false unless translation
|
|
225
|
+
|
|
226
|
+
apply_block_level_translation(ancestor, translation)
|
|
216
227
|
true
|
|
217
228
|
end
|
|
218
229
|
|
|
230
|
+
# When text is directly inside a content element, prefer its direct translation
|
|
231
|
+
# (the msgstr may itself contain HTML, e.g. a heading with an anchor).
|
|
232
|
+
# When text is nested inside inline element(s), only apply if the full block
|
|
233
|
+
# text is an explicit translation key.
|
|
234
|
+
def select_block_translation(node, ancestor, normalized_text, block_text, translations)
|
|
235
|
+
if node.parent == ancestor
|
|
236
|
+
translations[normalized_text] || translations[block_text]
|
|
237
|
+
else
|
|
238
|
+
translations[block_text]
|
|
239
|
+
end
|
|
240
|
+
end
|
|
241
|
+
|
|
242
|
+
def find_content_element_ancestor(node)
|
|
243
|
+
current = node&.parent
|
|
244
|
+
while current&.element?
|
|
245
|
+
return current if content_element?(current)
|
|
246
|
+
|
|
247
|
+
current = current.parent
|
|
248
|
+
end
|
|
249
|
+
nil
|
|
250
|
+
end
|
|
251
|
+
|
|
219
252
|
def log_translation_debug_info(text, normalized_text, translated, translations)
|
|
220
253
|
translation_data = DebugLogger::TranslationData.new(text: text,
|
|
221
254
|
normalized_text: normalized_text,
|
|
@@ -225,7 +258,9 @@ module Jekyll
|
|
|
225
258
|
end
|
|
226
259
|
|
|
227
260
|
private :log_text_node_debug, :should_skip_translation?, :should_log_text_debug?,
|
|
228
|
-
:apply_block_level_translation?, :
|
|
261
|
+
:apply_block_level_translation?, :select_block_translation,
|
|
262
|
+
:find_content_element_ancestor, :log_translation_debug_info,
|
|
263
|
+
:inject_translation_preserving_structure
|
|
229
264
|
end
|
|
230
265
|
end
|
|
231
266
|
end
|
|
@@ -48,6 +48,22 @@ module Jekyll
|
|
|
48
48
|
.gsub('&#39;', "'")
|
|
49
49
|
end
|
|
50
50
|
|
|
51
|
+
# Remove preformatted code blocks from a node.
|
|
52
|
+
#
|
|
53
|
+
# Removes all <pre> elements entirely. With highlighter: none in Jekyll config,
|
|
54
|
+
# fenced code blocks produce plain <pre><code> as direct children of content
|
|
55
|
+
# elements — no Rouge wrappers. Removing <pre> before extraction ensures raw
|
|
56
|
+
# code never appears in PO msgids.
|
|
57
|
+
#
|
|
58
|
+
# Must run before remove_block_elements_from_node so that <code> inside <pre>
|
|
59
|
+
# is gone before the general flattening pass.
|
|
60
|
+
#
|
|
61
|
+
# @param node [Nokogiri::XML::Node] Node to process (modified in place)
|
|
62
|
+
# @return [void]
|
|
63
|
+
def self.remove_code_blocks(node)
|
|
64
|
+
node.css('pre').each(&:remove)
|
|
65
|
+
end
|
|
66
|
+
|
|
51
67
|
# Remove block-level elements from a cloned node.
|
|
52
68
|
#
|
|
53
69
|
# Replaces block-level element nodes with their children (flattening structure).
|
|
@@ -89,18 +105,20 @@ module Jekyll
|
|
|
89
105
|
# Extract text with inline tags preserved.
|
|
90
106
|
#
|
|
91
107
|
# Extracts text from an element, removes block elements and empty icons,
|
|
92
|
-
# normalizes whitespace
|
|
93
|
-
#
|
|
108
|
+
# and normalizes whitespace. HTML entities (e.g. &lt;, &gt;) are preserved
|
|
109
|
+
# verbatim so that entity-encoded content inside inline elements (such as
|
|
110
|
+
# <code>&lt;p&gt;</code>) is written to PO msgids as-is and does not
|
|
111
|
+
# become a live HTML tag when the msgstr is later injected via inner_html.
|
|
94
112
|
#
|
|
95
113
|
# @param node [Nokogiri::XML::Node] Element to extract from
|
|
96
114
|
# @return [String] Extracted and normalized text
|
|
97
115
|
def self.extract_with_inline_tags(node)
|
|
98
116
|
clone = node.dup
|
|
117
|
+
remove_code_blocks(clone)
|
|
99
118
|
remove_block_elements_from_node(clone)
|
|
100
119
|
remove_empty_icon_tags(clone)
|
|
101
120
|
|
|
102
121
|
text = TextNormalizer.normalize(clone.inner_html)
|
|
103
|
-
text = decode_html_entities(text)
|
|
104
122
|
text&.then { |t| TextNormalizer.normalize(t).strip }
|
|
105
123
|
end
|
|
106
124
|
|
|
@@ -47,7 +47,7 @@ module Jekyll
|
|
|
47
47
|
# node alone doesn't have a direct translation but the entire block does.
|
|
48
48
|
#
|
|
49
49
|
# Security consideration: Returns nil if the block contains protected elements
|
|
50
|
-
# (script, style
|
|
50
|
+
# (script, style) to prevent unsafe translation application.
|
|
51
51
|
#
|
|
52
52
|
# @param node [Nokogiri::XML::Node] Text node being translated
|
|
53
53
|
# @param text [String] Normalized text of the node
|
|
@@ -59,13 +59,12 @@ module Jekyll
|
|
|
59
59
|
# # Returns nil (protected element present, prevents block translation)
|
|
60
60
|
# TranslationResolver.try_block_level_translation(node, "text", translations)
|
|
61
61
|
def self.try_block_level_translation(node, text, translations)
|
|
62
|
-
|
|
62
|
+
ancestor = find_content_element_ancestor(node)
|
|
63
|
+
return nil unless ancestor
|
|
63
64
|
|
|
64
|
-
|
|
65
|
-
# (script, style, pre). These cannot be safely applied at block level.
|
|
66
|
-
return nil if contains_protected_elements?(node.parent)
|
|
65
|
+
return nil if contains_protected_elements?(ancestor)
|
|
67
66
|
|
|
68
|
-
block_text = BlockTextExtractor.extract(
|
|
67
|
+
block_text = BlockTextExtractor.extract(ancestor)
|
|
69
68
|
return nil unless block_text && block_text != text
|
|
70
69
|
|
|
71
70
|
translations[block_text]
|
|
@@ -78,15 +77,24 @@ module Jekyll
|
|
|
78
77
|
HtmlElements::CONTENT_ELEMENTS.include?(node.name)
|
|
79
78
|
end
|
|
80
79
|
|
|
80
|
+
def self.find_content_element_ancestor(node)
|
|
81
|
+
current = node&.parent
|
|
82
|
+
while current&.element?
|
|
83
|
+
return current if content_element?(current)
|
|
84
|
+
|
|
85
|
+
current = current.parent
|
|
86
|
+
end
|
|
87
|
+
nil
|
|
88
|
+
end
|
|
89
|
+
private_class_method :find_content_element_ancestor
|
|
90
|
+
|
|
81
91
|
# Check if an element contains protected child elements that block translations.
|
|
82
92
|
#
|
|
83
|
-
# Protected elements (script, style
|
|
84
|
-
# translated at the block level
|
|
85
|
-
#
|
|
86
|
-
#
|
|
87
|
-
#
|
|
88
|
-
# This is a shared utility used by both HtmlTranslator and TranslationResolver
|
|
89
|
-
# to ensure consistent protection of sensitive content across the codebase.
|
|
93
|
+
# Protected elements (script, style) cannot have their surrounding text
|
|
94
|
+
# translated at the block level for security and functionality reasons.
|
|
95
|
+
# <pre> is not protected here — HtmlTextUtils.remove_code_blocks strips it
|
|
96
|
+
# before extraction so code content never reaches PO msgids, and
|
|
97
|
+
# HtmlTranslator preserves <pre> verbatim across translation injection.
|
|
90
98
|
#
|
|
91
99
|
# @param node [Nokogiri::XML::Node] Element to check
|
|
92
100
|
# @return [Boolean] true if node contains protected elements, false otherwise
|
|
@@ -101,13 +109,12 @@ module Jekyll
|
|
|
101
109
|
# doc = Nokogiri::HTML('<p><code>inline</code> text</p>')
|
|
102
110
|
# para = doc.xpath('//p').first
|
|
103
111
|
# TranslationResolver.contains_protected_elements?(para)
|
|
104
|
-
# # => false (code is allowed, only script/style
|
|
112
|
+
# # => false (code is allowed, only script/style are protected)
|
|
105
113
|
def self.contains_protected_elements?(node)
|
|
106
114
|
return false unless node.element?
|
|
107
115
|
|
|
108
|
-
# Block block-level translation for script
|
|
109
|
-
|
|
110
|
-
protected_elements = %w[script style pre]
|
|
116
|
+
# Block block-level translation for script and style (security/functionality).
|
|
117
|
+
protected_elements = %w[script style]
|
|
111
118
|
node.children.any? { |child| child.element? && protected_elements.include?(child.name) }
|
|
112
119
|
end
|
|
113
120
|
end
|