jekyll-l10n 1.4.1 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/jekyll-l10n/constants.rb +23 -0
- data/lib/jekyll-l10n/extraction/compendium_merger.rb +57 -10
- data/lib/jekyll-l10n/extraction/compendium_translator.rb +1 -1
- data/lib/jekyll-l10n/extraction/dom_text_extractor.rb +2 -1
- data/lib/jekyll-l10n/extraction/extractor.rb +1 -0
- data/lib/jekyll-l10n/extraction/logger.rb +3 -1
- data/lib/jekyll-l10n/extraction/result_saver.rb +13 -2
- data/lib/jekyll-l10n/jekyll/generator.rb +2 -1
- data/lib/jekyll-l10n/jekyll/url_filter.rb +5 -3
- data/lib/jekyll-l10n/po_file/fuzzy_matcher.rb +117 -0
- data/lib/jekyll-l10n/po_file/loader.rb +3 -2
- data/lib/jekyll-l10n/po_file/manager.rb +2 -1
- data/lib/jekyll-l10n/po_file/merger.rb +2 -1
- data/lib/jekyll-l10n/po_file/reader.rb +71 -21
- data/lib/jekyll-l10n/po_file/writer.rb +50 -13
- data/lib/jekyll-l10n/translation/block_text_extractor.rb +2 -8
- data/lib/jekyll-l10n/translation/html_translator.rb +88 -19
- data/lib/jekyll-l10n/translation/libre_translator.rb +10 -10
- data/lib/jekyll-l10n/translation/translator.rb +2 -1
- data/lib/jekyll-l10n/utils/error_handler.rb +12 -0
- data/lib/jekyll-l10n/utils/external_link_icon_preserver.rb +3 -2
- data/lib/jekyll-l10n/utils/html_elements.rb +4 -0
- data/lib/jekyll-l10n/utils/html_text_utils.rb +77 -4
- data/lib/jekyll-l10n/utils/page_locales_config.rb +14 -14
- data/lib/jekyll-l10n/utils/po_entry_converter.rb +22 -17
- data/lib/jekyll-l10n/utils/site_config_accessor.rb +3 -1
- data/lib/jekyll-l10n/utils/translation_resolver.rb +1 -4
- data/lib/jekyll-l10n/utils/url_transformer.rb +4 -5
- data/lib/jekyll-l10n.rb +1 -0
- metadata +4 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 21d081d43cd7503f98fb38d64526397eaf5ff3164e0eb3d6525aa8d88290b9d7
|
|
4
|
+
data.tar.gz: 382bb065f52fed1d762c2cd5110afc349de73a4780d564f813e8a7be4929373b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cab0e3382f953d8f88acc764be019ccdb02bf382e65a4f227f99206e7e6c4370e4a6b158da960cf7a762e28d84765f75917ee0f351cfcd2b434aecd8d8bb8969
|
|
7
|
+
data.tar.gz: f5dc58255fd89efe2014d63e9a916d9ff3ff04d20ebf67a0d863469fd082bee7d3b399586a1f32392f78b2e4f1e6cb7c26254986faacaf0b1208a6fe5ed0c3f6
|
|
@@ -24,6 +24,13 @@ module Jekyll
|
|
|
24
24
|
# @return [Regexp]
|
|
25
25
|
LOCALE_PATTERN = /^[a-z]{2}(_[A-Z]{2})?$/.freeze
|
|
26
26
|
|
|
27
|
+
# Regex fragment for matching locale codes in URL path segments.
|
|
28
|
+
# Accepts both underscore ('pt_BR') and hyphen ('zh-CN') subtag separators,
|
|
29
|
+
# unlike LOCALE_PATTERN which only accepts underscore (strict validation form).
|
|
30
|
+
# Compose into full path patterns rather than using standalone.
|
|
31
|
+
# @return [String]
|
|
32
|
+
LOCALE_CODE_SEGMENT = '[a-z]{2}(?:[_-][A-Z]{2})?'
|
|
33
|
+
|
|
27
34
|
# ## Translation Fallback Modes
|
|
28
35
|
|
|
29
36
|
# Fallback mode: use original English text if translation not found
|
|
@@ -44,6 +51,16 @@ module Jekyll
|
|
|
44
51
|
# @return [String] "[UNTRANSLATED]"
|
|
45
52
|
UNTRANSLATED_MARKER = '[UNTRANSLATED]'
|
|
46
53
|
|
|
54
|
+
# Minimum similarity score (0.0–1.0) for fuzzy matching old translations to
|
|
55
|
+
# changed msgids. Mirrors GNU msgmerge default. Set to 1.0 to disable.
|
|
56
|
+
# @return [Float] 0.6
|
|
57
|
+
DEFAULT_FUZZY_THRESHOLD = 0.6
|
|
58
|
+
|
|
59
|
+
# Maximum msgid character length above which fuzzy matching is skipped.
|
|
60
|
+
# Long strings are typically unique HTML fragments; Levenshtein cost is O(n²).
|
|
61
|
+
# @return [Integer] 400
|
|
62
|
+
MAX_FUZZY_MSGID_LENGTH = 400
|
|
63
|
+
|
|
47
64
|
# ## PO File Formatting (GNU Gettext Standard)
|
|
48
65
|
|
|
49
66
|
# Line length threshold below which strings are rendered on a single line
|
|
@@ -99,6 +116,12 @@ module Jekyll
|
|
|
99
116
|
# @return [Array<String>] ["title", "alt", "aria-label", "placeholder", "aria-description"]
|
|
100
117
|
DEFAULT_TRANSLATABLE_ATTRIBUTES = %w[title alt aria-label placeholder aria-description].freeze
|
|
101
118
|
|
|
119
|
+
# Structural/styling attributes restored from the source DOM at render time.
|
|
120
|
+
# These are never read from msgstr — always sourced from original HTML.
|
|
121
|
+
# Prevents MT-corrupted class strings from reaching the rendered page.
|
|
122
|
+
# @return [Array<String>]
|
|
123
|
+
STRUCTURAL_PASSTHROUGH_ATTRS = %w[class style id target rel tabindex aria-hidden].freeze
|
|
124
|
+
|
|
102
125
|
# ## LibreTranslate Integration Defaults
|
|
103
126
|
|
|
104
127
|
# Default LibreTranslate API endpoint URL
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'fileutils'
|
|
3
4
|
require_relative '../po_file/manager'
|
|
5
|
+
require_relative '../po_file/fuzzy_matcher'
|
|
4
6
|
require_relative '../utils/page_locales_config'
|
|
5
7
|
require_relative '../utils/site_config_accessor'
|
|
6
8
|
|
|
@@ -84,10 +86,11 @@ module Jekyll
|
|
|
84
86
|
end
|
|
85
87
|
end
|
|
86
88
|
|
|
87
|
-
# Load existing compendium translations or return empty hash if not found
|
|
89
|
+
# Load existing compendium translations or return empty hash if not found.
|
|
90
|
+
# Uses parse_for_merge to preserve fuzzy flags from previous runs.
|
|
88
91
|
def load_existing_compendium(compendium_path)
|
|
89
92
|
if File.exist?(compendium_path)
|
|
90
|
-
PoFileReader.
|
|
93
|
+
PoFileReader.parse_for_merge(compendium_path)
|
|
91
94
|
else
|
|
92
95
|
{}
|
|
93
96
|
end
|
|
@@ -102,24 +105,54 @@ module Jekyll
|
|
|
102
105
|
combined
|
|
103
106
|
end
|
|
104
107
|
|
|
105
|
-
# Normalize entry format to ensure consistent hash structure with :msgstr and :reference keys
|
|
108
|
+
# Normalize entry format to ensure consistent hash structure with :msgstr and :reference keys.
|
|
109
|
+
# Fuzzy metadata is preserved when present so it survives subsequent compendium reads.
|
|
106
110
|
def normalize_compendium_entry(data)
|
|
107
111
|
if data.is_a?(Hash)
|
|
108
|
-
{ msgstr: data[:msgstr], reference: data[:reference] }
|
|
112
|
+
entry = { msgstr: data[:msgstr], reference: data[:reference] }
|
|
113
|
+
entry[:fuzzy] = data[:fuzzy] if data[:fuzzy]
|
|
114
|
+
entry[:previous_msgid] = data[:previous_msgid] if data[:previous_msgid]
|
|
115
|
+
entry
|
|
109
116
|
else
|
|
110
117
|
{ msgstr: data, reference: nil }
|
|
111
118
|
end
|
|
112
119
|
end
|
|
113
120
|
|
|
114
|
-
# Merge newly found translations into combined hash, preserving existing translations
|
|
121
|
+
# Merge newly found translations into combined hash, preserving existing translations.
|
|
122
|
+
# Applies Levenshtein fuzzy matching for changed msgids: when a new msgid closely
|
|
123
|
+
# resembles an old compendium entry, the old entry is replaced by a fuzzy-marked new
|
|
124
|
+
# entry carrying the old translation as a hint. Old entries without a new fuzzy match
|
|
125
|
+
# stay in the combined hash (they may belong to pages not extracted in this build).
|
|
115
126
|
def merge_into_combined(combined, merged)
|
|
127
|
+
new_msgids = merged.keys.to_set
|
|
128
|
+
fuzzy_candidates = compendium_fuzzy_candidates(combined, new_msgids)
|
|
129
|
+
matched_old_msgids = []
|
|
130
|
+
|
|
116
131
|
merged.each do |msgid, entry|
|
|
117
132
|
if combined[msgid]
|
|
118
133
|
update_entry_reference(combined[msgid], entry)
|
|
119
134
|
else
|
|
120
|
-
combined
|
|
135
|
+
resolve_new_entry(combined, msgid, entry, fuzzy_candidates, matched_old_msgids)
|
|
121
136
|
end
|
|
122
137
|
end
|
|
138
|
+
|
|
139
|
+
matched_old_msgids.each { |old_msgid| combined.delete(old_msgid) }
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
def compendium_fuzzy_candidates(combined, new_msgids)
|
|
143
|
+
combined.reject do |msgid, entry|
|
|
144
|
+
new_msgids.include?(msgid) || PoFuzzyMatcher.msgstr_from_entry(entry).empty?
|
|
145
|
+
end
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
def resolve_new_entry(combined, msgid, entry, fuzzy_candidates, matched_old_msgids)
|
|
149
|
+
match = PoFuzzyMatcher.find_match(msgid, fuzzy_candidates)
|
|
150
|
+
if match
|
|
151
|
+
combined[msgid] = create_fuzzy_entry(entry, match)
|
|
152
|
+
matched_old_msgids << match[:msgid]
|
|
153
|
+
else
|
|
154
|
+
combined[msgid] = create_new_entry(entry)
|
|
155
|
+
end
|
|
123
156
|
end
|
|
124
157
|
|
|
125
158
|
# Update reference for existing entry if new reference is available
|
|
@@ -138,11 +171,24 @@ module Jekyll
|
|
|
138
171
|
end
|
|
139
172
|
end
|
|
140
173
|
|
|
141
|
-
#
|
|
174
|
+
# Create a fuzzy entry for a changed msgid, carrying the old translation as a hint
|
|
175
|
+
def create_fuzzy_entry(entry, match)
|
|
176
|
+
{
|
|
177
|
+
msgstr: match[:msgstr],
|
|
178
|
+
reference: entry.is_a?(Hash) ? entry[:reference] : nil,
|
|
179
|
+
fuzzy: true,
|
|
180
|
+
previous_msgid: match[:msgid]
|
|
181
|
+
}
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
# Convert combined hash to array of entries suitable for PO file writing.
|
|
185
|
+
# Fuzzy metadata is included so PoFileWriter can serialize #, fuzzy and #| msgid.
|
|
142
186
|
def format_compendium_entries(combined)
|
|
143
187
|
combined.map do |msgid, data|
|
|
144
188
|
entry = { msgid: msgid, msgstr: data[:msgstr] }
|
|
145
|
-
entry[:reference]
|
|
189
|
+
entry[:reference] = data[:reference] if data[:reference]
|
|
190
|
+
entry[:fuzzy] = data[:fuzzy] if data[:fuzzy]
|
|
191
|
+
entry[:previous_msgid] = data[:previous_msgid] if data[:previous_msgid]
|
|
146
192
|
entry
|
|
147
193
|
end
|
|
148
194
|
end
|
|
@@ -156,8 +202,9 @@ module Jekyll
|
|
|
156
202
|
|
|
157
203
|
private :process_locale, :compendium_unchanged?, :load_existing_compendium,
|
|
158
204
|
:build_combined_hash, :normalize_compendium_entry, :merge_into_combined,
|
|
159
|
-
:
|
|
160
|
-
:
|
|
205
|
+
:compendium_fuzzy_candidates, :resolve_new_entry,
|
|
206
|
+
:update_entry_reference, :create_new_entry, :create_fuzzy_entry,
|
|
207
|
+
:format_compendium_entries, :cleanup_locale_directory
|
|
161
208
|
end
|
|
162
209
|
end
|
|
163
210
|
end
|
|
@@ -72,7 +72,7 @@ module Jekyll
|
|
|
72
72
|
"Processing compendium file: #{compendium_path}")
|
|
73
73
|
return unless File.exist?(compendium_path)
|
|
74
74
|
|
|
75
|
-
entries = PoFileReader.
|
|
75
|
+
entries = PoFileReader.parse_for_merge(compendium_path)
|
|
76
76
|
po_entries = PoEntryConverter.hash_to_po_entry_array(entries)
|
|
77
77
|
|
|
78
78
|
log_compendium_stats(locale, po_entries, compendium_path)
|
|
@@ -52,11 +52,12 @@ module Jekyll
|
|
|
52
52
|
end
|
|
53
53
|
|
|
54
54
|
def extractable?(node)
|
|
55
|
-
|
|
55
|
+
HtmlTextUtils.extractable?(node)
|
|
56
56
|
end
|
|
57
57
|
|
|
58
58
|
def extract_block_text(node)
|
|
59
59
|
return nil if only_contains_block_elements?(node)
|
|
60
|
+
return nil if HtmlTextUtils.layout_block_children?(node)
|
|
60
61
|
|
|
61
62
|
text = HtmlTextUtils.extract_with_inline_tags(node)
|
|
62
63
|
TextValidator.valid?(text) ? text : nil
|
|
@@ -75,6 +75,7 @@ module Jekyll
|
|
|
75
75
|
Jekyll.logger.info 'Localization', 'Extracting translatable strings...'
|
|
76
76
|
start_time = Time.now
|
|
77
77
|
stats = process_all_html_files
|
|
78
|
+
@result_saver.finalize_compendia
|
|
78
79
|
translate_all_compendia
|
|
79
80
|
ExtractionLogger.log_summary(stats, Time.now - start_time)
|
|
80
81
|
stats
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative '../utils/error_handler'
|
|
4
|
+
|
|
3
5
|
module Jekyll
|
|
4
6
|
module L10n
|
|
5
7
|
# Logs extraction process errors and summary statistics.
|
|
@@ -22,7 +24,7 @@ module Jekyll
|
|
|
22
24
|
# @param error [StandardError] The error that occurred
|
|
23
25
|
# @return [void]
|
|
24
26
|
def self.log_error(file_path, error)
|
|
25
|
-
|
|
27
|
+
ErrorHandler.log_error("extracting from #{file_path}", error)
|
|
26
28
|
end
|
|
27
29
|
|
|
28
30
|
# Log extraction completion summary.
|
|
@@ -63,6 +63,19 @@ module Jekyll
|
|
|
63
63
|
}
|
|
64
64
|
end
|
|
65
65
|
|
|
66
|
+
# Merge all page-specific PO files into compendia after all pages are extracted.
|
|
67
|
+
#
|
|
68
|
+
# Called once per build (from Extractor.extract_site) rather than per page,
|
|
69
|
+
# reducing disk I/O and fuzzy-matching passes from O(pages) to O(1).
|
|
70
|
+
#
|
|
71
|
+
# @return [void]
|
|
72
|
+
def finalize_compendia
|
|
73
|
+
return unless @site_config.update_compendium?
|
|
74
|
+
|
|
75
|
+
po_manager = PoFileManager.new(@site, @site_config.locales_dir)
|
|
76
|
+
CompendiumMerger.new(@site).merge_compendia(po_manager, @site_config)
|
|
77
|
+
end
|
|
78
|
+
|
|
66
79
|
# Translate compendia using LibreTranslate.
|
|
67
80
|
#
|
|
68
81
|
# If LibreTranslate is enabled in config, translates all empty entries in
|
|
@@ -84,8 +97,6 @@ module Jekyll
|
|
|
84
97
|
po_files_created += 1 if po_manager.save_po_file(locale, entries, page_path: page_path)
|
|
85
98
|
end
|
|
86
99
|
|
|
87
|
-
CompendiumMerger.new(@site).merge_compendia(po_manager, config) if config.update_compendium?
|
|
88
|
-
|
|
89
100
|
po_files_created
|
|
90
101
|
end
|
|
91
102
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative '../constants'
|
|
3
4
|
require_relative 'regeneration_checker'
|
|
4
5
|
|
|
5
6
|
module Jekyll
|
|
@@ -119,7 +120,7 @@ module Jekyll
|
|
|
119
120
|
def valid_locale_code?(locale)
|
|
120
121
|
return false unless locale.is_a?(String)
|
|
121
122
|
|
|
122
|
-
locale.match?(
|
|
123
|
+
locale.match?(Constants::LOCALE_PATTERN)
|
|
123
124
|
end
|
|
124
125
|
|
|
125
126
|
# Check if any pages in the site are marked for localization
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'liquid'
|
|
4
|
+
require_relative '../constants'
|
|
5
|
+
require_relative '../utils/error_handler'
|
|
4
6
|
|
|
5
7
|
module Jekyll
|
|
6
8
|
module L10n
|
|
@@ -99,7 +101,7 @@ module Jekyll
|
|
|
99
101
|
page&.data&.[]('locale')
|
|
100
102
|
end
|
|
101
103
|
rescue StandardError => e
|
|
102
|
-
|
|
104
|
+
ErrorHandler.log_warning('retrieving current locale', e)
|
|
103
105
|
nil
|
|
104
106
|
end
|
|
105
107
|
|
|
@@ -124,7 +126,7 @@ module Jekyll
|
|
|
124
126
|
end
|
|
125
127
|
|
|
126
128
|
def already_localized?(url_str)
|
|
127
|
-
%r
|
|
129
|
+
%r{^/#{Constants::LOCALE_CODE_SEGMENT}(?=/|\?)}o.match?(url_str)
|
|
128
130
|
end
|
|
129
131
|
|
|
130
132
|
def external_url?(url_str)
|
|
@@ -192,7 +194,7 @@ module Jekyll
|
|
|
192
194
|
|
|
193
195
|
def strip_locale_from_url(url)
|
|
194
196
|
# Strip leading locale prefix like /es/, /fr/, /pt_BR/, /zh-CN/
|
|
195
|
-
url.sub(%r
|
|
197
|
+
url.sub(%r{^/(#{Constants::LOCALE_CODE_SEGMENT})(?=/|$)}o, '')
|
|
196
198
|
end
|
|
197
199
|
end
|
|
198
200
|
end
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../constants'
|
|
4
|
+
|
|
5
|
+
module Jekyll
|
|
6
|
+
module L10n
|
|
7
|
+
# Finds the closest matching old PO entry for a new msgid using normalized
|
|
8
|
+
# Levenshtein similarity. Mirrors GNU msgmerge fuzzy-matching behaviour.
|
|
9
|
+
#
|
|
10
|
+
# Key responsibilities:
|
|
11
|
+
# * Compute normalized edit-distance similarity between two strings
|
|
12
|
+
# * Select the best-scoring candidate from a pool of orphaned old entries
|
|
13
|
+
# * Return the matched old msgid and its msgstr for use as a fuzzy hint
|
|
14
|
+
class PoFuzzyMatcher
|
|
15
|
+
THRESHOLD = Constants::DEFAULT_FUZZY_THRESHOLD
|
|
16
|
+
|
|
17
|
+
# Extract msgstr from a PO entry that is either a plain String or a metadata Hash.
|
|
18
|
+
#
|
|
19
|
+
# @param entry [String, Hash] PO entry value
|
|
20
|
+
# @return [String]
|
|
21
|
+
def self.msgstr_from_entry(entry)
|
|
22
|
+
entry.is_a?(Hash) ? entry[:msgstr].to_s : entry.to_s
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
# Find the best fuzzy match for new_msgid among candidates.
|
|
26
|
+
#
|
|
27
|
+
# Skips new_msgid values longer than MAX_FUZZY_MSGID_LENGTH (long strings are
|
|
28
|
+
# unique HTML fragments with no useful near-duplicate and Levenshtein is O(n²)).
|
|
29
|
+
# Pre-filters candidates to the length range where similarity ≥ threshold is
|
|
30
|
+
# mathematically possible before invoking Levenshtein.
|
|
31
|
+
#
|
|
32
|
+
# @param new_msgid [String] the new source string to match
|
|
33
|
+
# @param candidates [Hash] { old_msgid => entry } where entry is either a
|
|
34
|
+
# String msgstr or a Hash with :msgstr key
|
|
35
|
+
# @param threshold [Float] minimum similarity score to accept (0.0–1.0)
|
|
36
|
+
# @return [Hash, nil] { msgid: String, msgstr: String } or nil if no match
|
|
37
|
+
def self.find_match(new_msgid, candidates, threshold: THRESHOLD)
|
|
38
|
+
return nil if candidates.empty?
|
|
39
|
+
return nil if new_msgid.nil? || new_msgid.length > Constants::MAX_FUZZY_MSGID_LENGTH
|
|
40
|
+
|
|
41
|
+
len = new_msgid.length
|
|
42
|
+
min_feas = (len * threshold).ceil
|
|
43
|
+
max_feas = threshold.positive? ? (len / threshold).floor : Float::INFINITY
|
|
44
|
+
|
|
45
|
+
best = best_candidate(new_msgid, candidates, min_feas, max_feas, threshold)
|
|
46
|
+
return nil unless best
|
|
47
|
+
|
|
48
|
+
{ msgid: best[:msgid], msgstr: msgstr_from_entry(best[:entry]) }
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def self.best_candidate(new_msgid, candidates, min_feas, max_feas, threshold)
|
|
52
|
+
best_msgid = best_entry = nil
|
|
53
|
+
best_score = 0.0
|
|
54
|
+
|
|
55
|
+
candidates.each do |old_msgid, entry|
|
|
56
|
+
ol = old_msgid.length
|
|
57
|
+
next if ol < min_feas || ol > max_feas
|
|
58
|
+
|
|
59
|
+
score = similarity(new_msgid, old_msgid)
|
|
60
|
+
next unless score > best_score && score >= threshold
|
|
61
|
+
|
|
62
|
+
best_score = score
|
|
63
|
+
best_msgid = old_msgid
|
|
64
|
+
best_entry = entry
|
|
65
|
+
break if best_score >= 1.0
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
best_msgid ? { msgid: best_msgid, entry: best_entry } : nil
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
# Normalized Levenshtein similarity between two strings.
|
|
72
|
+
#
|
|
73
|
+
# Returns 0.0 immediately when the length ratio falls below the threshold —
|
|
74
|
+
# the maximum achievable similarity is min_len/max_len, so Levenshtein cannot
|
|
75
|
+
# produce a useful result and the O(n²) computation is skipped.
|
|
76
|
+
#
|
|
77
|
+
# @param str_a [String]
|
|
78
|
+
# @param str_b [String]
|
|
79
|
+
# @return [Float] 0.0 (completely different) to 1.0 (identical)
|
|
80
|
+
def self.similarity(str_a, str_b)
|
|
81
|
+
return 1.0 if str_a == str_b
|
|
82
|
+
return 0.0 if str_a.empty? || str_b.empty?
|
|
83
|
+
|
|
84
|
+
max_len = [str_a.length, str_b.length].max
|
|
85
|
+
min_len = [str_a.length, str_b.length].min
|
|
86
|
+
return 0.0 if min_len.to_f / max_len < THRESHOLD
|
|
87
|
+
|
|
88
|
+
dist = levenshtein(str_a, str_b)
|
|
89
|
+
1.0 - (dist.to_f / max_len)
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
def self.levenshtein(str_a, str_b)
|
|
93
|
+
# Keep the shorter string as the row to minimise the dp array size.
|
|
94
|
+
str_a, str_b = str_b, str_a if str_a.length > str_b.length
|
|
95
|
+
|
|
96
|
+
a_chars = str_a.chars
|
|
97
|
+
b_chars = str_b.chars
|
|
98
|
+
m = a_chars.length
|
|
99
|
+
n = b_chars.length
|
|
100
|
+
|
|
101
|
+
curr = Array.new(m + 1) { |i| i }
|
|
102
|
+
prev = Array.new(m + 1, 0)
|
|
103
|
+
|
|
104
|
+
n.times do |j|
|
|
105
|
+
curr, prev = prev, curr # swap in place — no allocation
|
|
106
|
+
curr[0] = j + 1
|
|
107
|
+
m.times do |i|
|
|
108
|
+
cost = a_chars[i] == b_chars[j] ? 0 : 1
|
|
109
|
+
curr[i + 1] = [curr[i] + 1, prev[i + 1] + 1, prev[i] + cost].min
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
curr[m]
|
|
113
|
+
end
|
|
114
|
+
private_class_method :levenshtein, :best_candidate
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
require_relative 'reader'
|
|
4
4
|
require_relative 'manager'
|
|
5
5
|
require_relative 'path_builder'
|
|
6
|
+
require_relative '../utils/error_handler'
|
|
6
7
|
|
|
7
8
|
module Jekyll
|
|
8
9
|
module L10n
|
|
@@ -52,11 +53,11 @@ module Jekyll
|
|
|
52
53
|
end
|
|
53
54
|
|
|
54
55
|
def self.load_and_cache(cache_key, po_path)
|
|
55
|
-
translations = PoFileReader.
|
|
56
|
+
translations = PoFileReader.parse_for_translation(po_path)
|
|
56
57
|
PoFileManager.cache[cache_key] = translations
|
|
57
58
|
translations
|
|
58
59
|
rescue StandardError => e
|
|
59
|
-
|
|
60
|
+
ErrorHandler.log_warning("loading PO file #{po_path}", e)
|
|
60
61
|
{}
|
|
61
62
|
end
|
|
62
63
|
end
|
|
@@ -7,6 +7,7 @@ require_relative 'path_builder'
|
|
|
7
7
|
require_relative '../utils/site_config_accessor'
|
|
8
8
|
require_relative '../utils/file_operations'
|
|
9
9
|
require_relative '../utils/logger_formatter'
|
|
10
|
+
require_relative '../utils/error_handler'
|
|
10
11
|
|
|
11
12
|
module Jekyll
|
|
12
13
|
module L10n
|
|
@@ -113,7 +114,7 @@ module Jekyll
|
|
|
113
114
|
prepare_and_write_po_file(po_path, entries, locale, page_path: page_path,
|
|
114
115
|
skip_merge: skip_merge)
|
|
115
116
|
rescue StandardError => e
|
|
116
|
-
|
|
117
|
+
ErrorHandler.log_error("saving PO file #{po_path}", e)
|
|
117
118
|
false
|
|
118
119
|
end
|
|
119
120
|
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative 'reader'
|
|
4
|
+
require_relative '../utils/error_handler'
|
|
4
5
|
|
|
5
6
|
module Jekyll
|
|
6
7
|
module L10n
|
|
@@ -73,7 +74,7 @@ module Jekyll
|
|
|
73
74
|
merged[msgid] ||= entry
|
|
74
75
|
end
|
|
75
76
|
rescue StandardError => e
|
|
76
|
-
|
|
77
|
+
ErrorHandler.log_warning("merging PO file #{po_file}", e)
|
|
77
78
|
end
|
|
78
79
|
end
|
|
79
80
|
end
|
|
@@ -89,6 +89,20 @@ module Jekyll
|
|
|
89
89
|
process_po_lines_instance(content, :merge)
|
|
90
90
|
end
|
|
91
91
|
|
|
92
|
+
# Parse a PO file for translation injection, excluding fuzzy entries.
|
|
93
|
+
#
|
|
94
|
+
# Parses with full merge metadata (including fuzzy flags), then strips fuzzy
|
|
95
|
+
# entries and returns a simple msgid → msgstr hash. Fuzzy entries are treated
|
|
96
|
+
# as untranslated per the GNU Gettext standard: msgfmt skips them by default
|
|
97
|
+
# and they should fall through to the active fallback mode.
|
|
98
|
+
#
|
|
99
|
+
# @return [Hash] Simple translation hash { msgid => msgstr }, fuzzy entries omitted
|
|
100
|
+
def parse_for_translation
|
|
101
|
+
parse_for_merge
|
|
102
|
+
.reject { |_msgid, v| v.is_a?(Hash) && v[:fuzzy] }
|
|
103
|
+
.transform_values { |v| v.is_a?(Hash) ? v[:msgstr] : v }
|
|
104
|
+
end
|
|
105
|
+
|
|
92
106
|
# Parse a PO file (class method, for backward compatibility).
|
|
93
107
|
#
|
|
94
108
|
# @param po_path [String] Path to PO file
|
|
@@ -113,6 +127,14 @@ module Jekyll
|
|
|
113
127
|
new(po_path).parse_for_merge
|
|
114
128
|
end
|
|
115
129
|
|
|
130
|
+
# Parse a PO file for translation injection, excluding fuzzy entries (class method).
|
|
131
|
+
#
|
|
132
|
+
# @param po_path [String] Path to PO file
|
|
133
|
+
# @return [Hash] Simple translation hash { msgid => msgstr }, fuzzy entries omitted
|
|
134
|
+
def self.parse_for_translation(po_path)
|
|
135
|
+
new(po_path).parse_for_translation
|
|
136
|
+
end
|
|
137
|
+
|
|
116
138
|
# Backward compatibility wrapper
|
|
117
139
|
def self.process_po_lines(content)
|
|
118
140
|
process_po_lines_internal(content, false)
|
|
@@ -149,7 +171,8 @@ module Jekyll
|
|
|
149
171
|
# with_mode: false (default, simple format), true (with reference), :merge (with both)
|
|
150
172
|
# rubocop:disable Metrics/ParameterLists, Metrics/AbcSize, Metrics/PerceivedComplexity
|
|
151
173
|
def self.process_msgid_msgstr_pair(lines, start_idx, translations,
|
|
152
|
-
reference: nil, fuzzy: nil,
|
|
174
|
+
reference: nil, fuzzy: nil, previous_msgid: nil,
|
|
175
|
+
with_mode: false)
|
|
153
176
|
# rubocop:enable Metrics/ParameterLists, Metrics/AbcSize, Metrics/PerceivedComplexity
|
|
154
177
|
# Handle nil sentinel values (from NO_REFERENCE constant)
|
|
155
178
|
reference = nil if reference == NO_REFERENCE
|
|
@@ -168,7 +191,8 @@ module Jekyll
|
|
|
168
191
|
with_metadata = with_mode == true || with_mode == :merge || !reference.nil? || !fuzzy.nil?
|
|
169
192
|
store_translation(
|
|
170
193
|
translations, msgid_value, msgstr_value,
|
|
171
|
-
reference: reference, fuzzy: fuzzy,
|
|
194
|
+
reference: reference, fuzzy: fuzzy, previous_msgid: previous_msgid,
|
|
195
|
+
with_metadata: with_metadata
|
|
172
196
|
)
|
|
173
197
|
else
|
|
174
198
|
i += 1
|
|
@@ -259,10 +283,10 @@ module Jekyll
|
|
|
259
283
|
reference: reference, fuzzy: nil, with_mode: true
|
|
260
284
|
)
|
|
261
285
|
when :merge
|
|
262
|
-
reference, fuzzy = extract_reference_and_fuzzy_before_msgid(lines, idx)
|
|
286
|
+
reference, fuzzy, previous_msgid = extract_reference_and_fuzzy_before_msgid(lines, idx)
|
|
263
287
|
process_msgid_msgstr_pair(
|
|
264
288
|
lines, idx, translations,
|
|
265
|
-
reference: reference, fuzzy: fuzzy, with_mode: :merge
|
|
289
|
+
reference: reference, fuzzy: fuzzy, previous_msgid: previous_msgid, with_mode: :merge
|
|
266
290
|
)
|
|
267
291
|
else
|
|
268
292
|
process_msgid_msgstr_pair(lines, idx, translations, reference: nil, fuzzy: nil,
|
|
@@ -270,29 +294,48 @@ module Jekyll
|
|
|
270
294
|
end
|
|
271
295
|
end
|
|
272
296
|
|
|
273
|
-
# Unified metadata extraction: extracts reference and
|
|
297
|
+
# Unified metadata extraction: extracts reference, fuzzy flag, and previous msgid.
|
|
298
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
274
299
|
def self.extract_metadata_before_msgid(lines, msgid_idx, include_fuzzy: false)
|
|
275
|
-
reference
|
|
276
|
-
fuzzy
|
|
277
|
-
|
|
300
|
+
reference = nil
|
|
301
|
+
fuzzy = false
|
|
302
|
+
previous_msgid = nil
|
|
303
|
+
comments_end = msgid_idx - 1
|
|
278
304
|
|
|
279
305
|
while comments_end >= 0
|
|
280
306
|
comment_line = lines[comments_end].strip
|
|
281
307
|
break unless comment_line.start_with?('#') || comment_line.empty?
|
|
282
308
|
|
|
283
|
-
reference
|
|
284
|
-
|
|
309
|
+
reference = extract_reference_from_line(comment_line) || reference
|
|
310
|
+
previous_msgid = extract_previous_msgid_from_line(comment_line) || previous_msgid if include_fuzzy
|
|
311
|
+
fuzzy = true if include_fuzzy && fuzzy_line?(comment_line)
|
|
285
312
|
|
|
286
313
|
comments_end -= 1
|
|
287
314
|
end
|
|
288
315
|
|
|
289
|
-
include_fuzzy ? [reference, fuzzy] : reference
|
|
316
|
+
include_fuzzy ? [reference, fuzzy, previous_msgid] : reference
|
|
290
317
|
end
|
|
318
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
291
319
|
|
|
292
320
|
def self.extract_reference_from_line(comment_line)
|
|
293
321
|
comment_line.sub(/^#:\s*/, '').strip if comment_line.start_with?('#:')
|
|
294
322
|
end
|
|
295
323
|
|
|
324
|
+
# Parse a #| msgid "..." previous-msgid comment line.
|
|
325
|
+
def self.extract_previous_msgid_from_line(comment_line)
|
|
326
|
+
return nil unless comment_line.start_with?('#|') && comment_line.include?('msgid')
|
|
327
|
+
|
|
328
|
+
raw = comment_line.sub(/^#\|\s*msgid\s*/, '').strip
|
|
329
|
+
# Strip surrounding quotes and unescape
|
|
330
|
+
if raw.start_with?('"') && raw.end_with?('"')
|
|
331
|
+
unescape_string(raw[1...-1], '"')
|
|
332
|
+
elsif raw.start_with?("'") && raw.end_with?("'")
|
|
333
|
+
unescape_string(raw[1...-1], "'")
|
|
334
|
+
else
|
|
335
|
+
raw
|
|
336
|
+
end
|
|
337
|
+
end
|
|
338
|
+
|
|
296
339
|
def self.fuzzy_line?(comment_line)
|
|
297
340
|
comment_line.start_with?('#,') && comment_line.include?('fuzzy')
|
|
298
341
|
end
|
|
@@ -302,7 +345,7 @@ module Jekyll
|
|
|
302
345
|
extract_metadata_before_msgid(lines, msgid_idx, include_fuzzy: false)
|
|
303
346
|
end
|
|
304
347
|
|
|
305
|
-
# Backward compatibility wrapper
|
|
348
|
+
# Backward compatibility wrapper — now returns [reference, fuzzy, previous_msgid]
|
|
306
349
|
def self.extract_reference_and_fuzzy_before_msgid(lines, msgid_idx)
|
|
307
350
|
extract_metadata_before_msgid(lines, msgid_idx, include_fuzzy: true)
|
|
308
351
|
end
|
|
@@ -404,24 +447,29 @@ module Jekyll
|
|
|
404
447
|
# - Backward compatibility: Supports legacy calling conventions
|
|
405
448
|
# rubocop:disable Metrics/ParameterLists
|
|
406
449
|
def self.store_translation(translations, msgid, msgstr, reference: nil, fuzzy: nil,
|
|
407
|
-
with_metadata: false)
|
|
450
|
+
previous_msgid: nil, with_metadata: false)
|
|
408
451
|
# rubocop:enable Metrics/ParameterLists
|
|
409
452
|
return if msgid.nil? || msgstr.nil? || msgid.empty?
|
|
410
453
|
|
|
411
|
-
translations[msgid] = build_translation_entry(msgstr, reference, fuzzy,
|
|
454
|
+
translations[msgid] = build_translation_entry(msgstr, reference, fuzzy,
|
|
455
|
+
previous_msgid, with_metadata: with_metadata)
|
|
412
456
|
end
|
|
413
457
|
|
|
414
|
-
|
|
458
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
459
|
+
def self.build_translation_entry(msgstr, reference, fuzzy, previous_msgid = nil,
|
|
460
|
+
with_metadata: false)
|
|
415
461
|
# Simple format when no metadata requested and none provided
|
|
416
|
-
return msgstr if !with_metadata && reference.nil? && fuzzy.nil?
|
|
462
|
+
return msgstr if !with_metadata && reference.nil? && fuzzy.nil? && previous_msgid.nil?
|
|
417
463
|
|
|
418
464
|
# Build metadata hash based on what's provided
|
|
419
465
|
entry = { msgstr: msgstr }
|
|
420
|
-
entry[:reference]
|
|
421
|
-
entry[:fuzzy]
|
|
422
|
-
entry[:
|
|
466
|
+
entry[:reference] = reference unless reference.nil?
|
|
467
|
+
entry[:fuzzy] = fuzzy unless fuzzy.nil?
|
|
468
|
+
entry[:previous_msgid] = previous_msgid unless previous_msgid.nil?
|
|
469
|
+
entry[:comment] = nil if !fuzzy.nil? || !reference.nil?
|
|
423
470
|
entry
|
|
424
471
|
end
|
|
472
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
425
473
|
|
|
426
474
|
# Kept for backward compatibility with existing tests
|
|
427
475
|
# Supports both positional and keyword argument calling styles
|
|
@@ -499,9 +547,11 @@ module Jekyll
|
|
|
499
547
|
self.class.process_msgid_msgstr_pair(lines, idx, translations, reference: reference,
|
|
500
548
|
fuzzy: nil, with_mode: true)
|
|
501
549
|
when :merge
|
|
502
|
-
reference, fuzzy = self.class.extract_reference_and_fuzzy_before_msgid(lines, idx)
|
|
550
|
+
reference, fuzzy, previous_msgid = self.class.extract_reference_and_fuzzy_before_msgid(lines, idx)
|
|
503
551
|
self.class.process_msgid_msgstr_pair(lines, idx, translations, reference: reference,
|
|
504
|
-
fuzzy: fuzzy,
|
|
552
|
+
fuzzy: fuzzy,
|
|
553
|
+
previous_msgid: previous_msgid,
|
|
554
|
+
with_mode: :merge)
|
|
505
555
|
else
|
|
506
556
|
self.class.process_msgid_msgstr_pair(lines, idx, translations, reference: nil,
|
|
507
557
|
fuzzy: nil, with_mode: false)
|