isodoc-i18n 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 134575d665c75368d3640ef87c9c63a3f6fdc2bf668e8d8df233a3cb139fce6b
4
- data.tar.gz: ed1fc0e49c62f27b3199ba8357b2f1454309485efffbcb93ed30d51f01c36fe7
3
+ metadata.gz: e08e5e4a6c9b89a5f628ee426ce22f8bcdbae97e7cccd4411f4962844d29b231
4
+ data.tar.gz: 9d8cbc5526c1d8aabe9db00c25deeb581a3604358b50b18e6f5a9bb7faab6909
5
5
  SHA512:
6
- metadata.gz: f6e5ff44068372afc9ad75e1d6bff6483114a3424071c136a86f17745513ff5f31794c4555c9484eff556c50f638cb13c3ba712c4ae3e3b4d762685d1e888c2d
7
- data.tar.gz: 9c5131cb3dbb8800304530629dfe5b47a3d7b15695ff0762011998c5aa3d3273b7730dde057dd2f2f15e3f997c9466704be80d7ef9cd47a5055e93afc0a5ba32
6
+ metadata.gz: 3880d68a1f094ab500840e74767f970af57fca01a12ae751da68e9cdc72ba00df9139516df3c74dda5059114fe723b800c057c665efdd485e44aa9a247cdde22
7
+ data.tar.gz: 80643511d523a3535293a2b7b82e60fe81a8910dbe9fc336cf8a4ab95a9763b1adf1375f802f5b36177a6e4570c51688b803576f7d2d39b8fc7c29e894d2f235
data/isodoc-i18n.gemspec CHANGED
@@ -22,12 +22,13 @@ Gem::Specification.new do |spec|
22
22
  end
23
23
  spec.required_ruby_version = Gem::Requirement.new(">= 2.7.0")
24
24
 
25
+ spec.add_dependency "base64"
25
26
  spec.add_dependency "htmlentities", "~> 4.3.4"
26
27
  spec.add_dependency "liquid", "~> 5"
27
28
  spec.add_dependency "metanorma-utils", ">= 1.7.0"
28
29
  spec.add_dependency "twitter_cldr"
29
- spec.add_dependency "base64"
30
30
 
31
+ spec.add_development_dependency "canon"
31
32
  spec.add_development_dependency "debug"
32
33
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
33
34
  spec.add_development_dependency "guard", "~> 2.14"
@@ -35,10 +36,9 @@ Gem::Specification.new do |spec|
35
36
  spec.add_development_dependency "rake", "~> 13.0"
36
37
  spec.add_development_dependency "rspec", "~> 3.6"
37
38
  spec.add_development_dependency "rubocop", "~> 1"
38
- spec.add_development_dependency "rubocop-performance"
39
+ spec.add_development_dependency "rubocop-performance"
39
40
  spec.add_development_dependency "simplecov", "~> 0.15"
40
41
  spec.add_development_dependency "timecop", "~> 0.9"
41
42
  spec.add_development_dependency "webmock"
42
- spec.add_development_dependency "canon"
43
43
  # spec.metadata["rubygems_mfa_required"] = "true"
44
44
  end
@@ -1,5 +1,5 @@
1
1
  module IsoDoc
2
2
  class I18n
3
- VERSION = "1.4.0".freeze
3
+ VERSION = "1.4.1".freeze
4
4
  end
5
5
  end
@@ -8,10 +8,79 @@ module IsoDoc
8
8
  def load_yaml(lang, script, i18nyaml = nil, i18nhash = nil)
9
9
  ret = load_yaml1(lang, script)
10
10
  i18nyaml and
11
- return normalise_hash(ret.deep_merge(YAML.load_file(i18nyaml)))
12
- i18nhash and return normalise_hash(ret.deep_merge(i18nhash))
11
+ return postprocess(ret.deep_merge(YAML.load_file(i18nyaml)))
12
+ i18nhash and return postprocess(ret.deep_merge(i18nhash))
13
13
 
14
- normalise_hash(ret)
14
+ postprocess(ret)
15
+ end
16
+
17
+ def postprocess(labels)
18
+ self_reference_resolve(normalise_hash(labels))
19
+ end
20
+
21
+ def self_reference_resolve(labels)
22
+ resolve_references(labels, labels)
23
+ end
24
+
25
+ def resolve_references(obj, labels)
26
+ case obj
27
+ when Hash
28
+ obj.transform_values { |v| resolve_references(v, labels) }
29
+ when Array
30
+ obj.map { |item| resolve_references(item, labels) }
31
+ when String
32
+ resolve_string_references(obj, labels)
33
+ else
34
+ obj
35
+ end
36
+ end
37
+
38
+ def resolve_string_references(str, labels)
39
+ # Match patterns like #{self["key"]["subkey"]} or #{self.key.subkey}
40
+ # Allow spaces around the self expression
41
+ str.gsub(/\#\{\s*self([^\}]+?)\s*\}/) do |match|
42
+ path_expr = Regexp.last_match(1)
43
+ resolve_path(path_expr, labels, match)
44
+ end
45
+ end
46
+
47
+ def resolve_path(path_expr, labels, original_expr)
48
+ segments = parse_path(path_expr)
49
+ current = labels
50
+
51
+ segments.each do |segment|
52
+ case current
53
+ when Hash
54
+ current.key?(segment) or
55
+ raise "Self-reference error: Path '#{original_expr}' not found - key '#{segment}' does not exist"
56
+ current = current[segment]
57
+ when Array
58
+ index = segment.to_i
59
+ segment =~ /^\d+$/ && index >= 0 && index < current.length or
60
+ raise "Self-reference error: Path '#{original_expr}' not found - invalid array index '#{segment}'"
61
+ current = current[index]
62
+ else
63
+ raise "Self-reference error: Path '#{original_expr}' not found - cannot navigate through non-collection type"
64
+ end
65
+ end
66
+
67
+ current.to_s
68
+ end
69
+
70
+ def parse_path(path_expr)
71
+ segments = []
72
+ path_expr = path_expr.sub(/^\./, "")
73
+ # Split by dots and brackets while preserving the content
74
+ parts = path_expr.scan(/\.?([\w-]+)|\[([^\]]+)\]/)
75
+ parts.each do |dot_part, bracket_part|
76
+ if dot_part
77
+ segments << dot_part
78
+ elsif bracket_part
79
+ segment = bracket_part.strip.gsub(/^["']|["']$/, "")
80
+ segments << segment
81
+ end
82
+ end
83
+ segments
15
84
  end
16
85
 
17
86
  def normalise_hash(ret)
@@ -30,8 +99,8 @@ module IsoDoc
30
99
  def load_yaml1(lang, script)
31
100
  case lang
32
101
  when "zh"
33
- if script == "Hans" then load_yaml2("zh-Hans")
34
- else load_yaml2("en")
102
+ if script then load_yaml2("zh-#{script}")
103
+ else load_yaml2("zh-Hans")
35
104
  end
36
105
  else
37
106
  load_yaml2(lang)
data/lib/isodoc/i18n.rb CHANGED
@@ -68,7 +68,9 @@ module IsoDoc
68
68
  end
69
69
 
70
70
  def enum_comma
71
- CJK_SCRIPTS.include?(@script) and return "<enum-comma>、</enum-comma>"
71
+ c = @labels.dig("punct", "enum-comma")
72
+ c && CJK_SCRIPTS.include?(@script) and
73
+ return "<enum-comma>#{c}</enum-comma>"
72
74
  "<enum-comma>,</enum-comma> "
73
75
  end
74
76
 
data/lib/isodoc/l10n.rb CHANGED
@@ -1,55 +1,23 @@
1
1
  require "metanorma-utils"
2
+ require_relative "l10n_cjk"
2
3
 
3
4
  module IsoDoc
4
5
  class I18n
5
- # Use comprehensive CJK definition from metanorma-utils
6
- # This includes Han, Katakana, Hiragana, Hangul, Bopomofo and all CJK extensions
7
- ZH_CHAR = "(#{Metanorma::Utils::CJK})".freeze
8
- LATIN_PUNCT = /[:,.()\[\];?!-]/.freeze
9
-
10
- # Condition for converting punctuation to double width:
11
- # 1. (Strict condition) CJK before, CJK after, modulo ignorable characters:
12
- # 1a. CJK character, or start of string. Latin spaces optional.
13
- ZH1_PUNCT = /(#{ZH_CHAR}|^)(\s*)$/xo.freeze
14
- # 1b. Latin spaces optional, Latin punct which will also convert to CJK,
15
- # CJK character, or end of string.
16
- ZH2_PUNCT = /^\s*#{LATIN_PUNCT}*(#{ZH_CHAR}|$)/xo.freeze
17
- # 2. CJK before, space after:
18
- # 2a. CJK char, followed by optional Latin punct which will also convert to CJK
19
- ZH1_NO_SPACE = /#{ZH_CHAR}#{LATIN_PUNCT}*$/xo.freeze
20
- # 2b. optional Latin punct which wil also convert to CJK, then space
21
- OPT_PUNCT_SPACE = /^($|#{LATIN_PUNCT}*\s)/xo.freeze
22
-
23
- # Contexts for converting en-dashes to full-width
24
- # Before: CJK or start of string, optional digits
25
- ZH1_DASH = /(#{ZH_CHAR}|^)(\d*)$/xo.freeze
26
- # After: optional digits, CJK or end of string
27
- ZH2_DASH = /^\d*(#{ZH_CHAR}|$)/xo.freeze
28
-
29
- # Pre-defined punctuation mappings for efficiency
30
- ZH_PUNCT_MAP = [
31
- ["::", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
32
- [",,", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
33
- [".。", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
34
- ["))", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
35
- ["]]", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
36
- [";;", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
37
- ["??", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
38
- ["!!", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
39
- ["((", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
40
- ["[[", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]]
41
- ].freeze
42
-
43
6
  def self.l10n(text, lang = @lang, script = @script, options = {})
44
7
  l10n(text, lang, script, options)
45
8
  end
46
9
 
47
10
  # function localising spaces and punctuation
48
11
  # options[:prev] and options[:foll] are optional context strings
12
+ # options[:proportional_mixed_cjk] allows contextual full-width vs
13
+ # half-width punctuation
49
14
  def l10n(text, lang = @lang, script = @script, options = {})
50
15
  locale = options[:locale] || @locale
51
- %w(zh ja ko).include?(lang) and text = l10n_zh(text, script, options[:prev], options[:foll])
52
- lang == "fr" && text = l10n_fr(text, locale || "FR", options[:prev], options[:foll])
16
+ %w(zh ja ko).include?(lang) and
17
+ text = l10n_zh(text, script, options)
18
+ lang == "fr" and
19
+ text = l10n_fr(text, locale || "FR", options)
20
+ text&.gsub!(/<esc>|<\/esc>/, "") # Strip esc tags
53
21
  bidiwrap(text, lang, script)
54
22
  end
55
23
 
@@ -71,30 +39,25 @@ module IsoDoc
71
39
  .default_script(@lang))]
72
40
  end
73
41
 
74
- # CJK
75
- def l10n_zh(text, script, prev, foll)
76
- script ||= "Hans"
77
- t, text_cache, xml = l10n_prep(text, prev, foll)
78
- t.each_with_index do |n, i|
79
- # Adjust index if prev context prepended
80
- prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
81
- text = cleanup_entities(n.text, is_xml: false)
82
- n.replace(l10_zh1(text, prev_ctx, foll_ctx, script))
42
+ def l10n_prep(text, options)
43
+ xml = Nokogiri::XML::DocumentFragment.parse(text)
44
+ t = xml.xpath(".//text()").reject { |node| node.text.empty? }
45
+ text_cache = build_text_cache(t, options[:prev], options[:foll])
46
+
47
+ # Identify which text nodes are within <esc> tags
48
+ esc_indices = Set.new
49
+ t.each_with_index do |node, i|
50
+ esc_indices.add(i) if node.ancestors("esc").any?
83
51
  end
84
- to_xml(xml).gsub(/<b>|<\/b>|<\?[^>]+>/, "")
85
- end
86
52
 
87
- def l10n_prep(text, prev, foll)
88
- xml = Nokogiri::XML::DocumentFragment.parse(text)
89
- t = xml.xpath(".//text()")
90
- text_cache = build_text_cache(t, prev, foll)
91
- [t, text_cache, xml]
53
+ [t, text_cache, xml, options[:prev], options[:foll], esc_indices]
92
54
  end
93
55
 
94
56
  # Cache text content once per method call to avoid repeated .text calls
95
57
  # Build text cache with optional prepended/appended context
58
+ # Also, reduce multiple spaces to single, to avoid misrecognition of space
96
59
  def build_text_cache(text_nodes, prev_context = nil, foll_context = nil)
97
- text_cache = text_nodes.map(&:text)
60
+ text_cache = text_nodes.map(&:text).map { |x| x.gsub(/\s+/, " ") }
98
61
  text_cache.unshift(prev_context) if prev_context
99
62
  text_cache.push(foll_context) if foll_context
100
63
  text_cache
@@ -117,9 +80,11 @@ module IsoDoc
117
80
  [prev, foll]
118
81
  end
119
82
 
120
- def l10n_fr(text, locale, prev, foll)
121
- t, text_cache, xml = l10n_prep(text, prev, foll)
83
+ def l10n_fr(text, locale, options)
84
+ t, text_cache, xml, prev, _foll, esc_indices = l10n_prep(text, options)
122
85
  t.each_with_index do |n, i|
86
+ next if esc_indices.include?(i) # Skip escaped nodes
87
+
123
88
  prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
124
89
  text = cleanup_entities(n.text, is_xml: false)
125
90
  n.replace(l10n_fr1(text, prev_ctx, foll_ctx, locale))
@@ -127,35 +92,15 @@ module IsoDoc
127
92
  to_xml(xml)
128
93
  end
129
94
 
130
- # note: we can't differentiate comma from enumeration comma 、
131
- # def l10_zh1(text, _script)
132
- def l10_zh1(text, prev, foll, _script)
133
- r = l10n_zh_punct(text, prev, foll)
134
- r = l10n_zh_remove_space(r, prev, foll)
135
- l10n_zh_dash(r, prev, foll)
136
- end
137
-
138
- def l10n_zh_punct(text, prev, foll)
139
- # Use pre-defined mapping for better performance
140
- ZH_PUNCT_MAP.each do |mapping|
141
- punct_pair, regexes = mapping
142
- text = l10n_gsub(text, prev, foll, [punct_pair[0], punct_pair[1]], regexes)
143
- end
144
- text
145
- end
146
-
147
- def l10n_zh_dash(text, prev, foll)
148
- l10n_gsub(text, prev, foll, %w(– ~), [[ZH1_DASH, ZH2_DASH]])
149
- end
150
-
151
95
  # text: string we are scanning for instances of delim[0] to replace
152
96
  # prev: string preceding text, as additional token of context
153
97
  # foll: string following text, as additional token of context
154
98
  # delim: delim[0] is the symbol we want to replace, delim[1] its replacement
155
99
  # regexes: a list of regex pairs: the context before the found token,
156
100
  # and the context after the found token, under which replacing it
157
- # with delim[1] is permitted
101
+ # with delim[1] is permitted. If regex is nil, always allow the replacement
158
102
  def l10n_gsub(text, prev, foll, delim, regexes)
103
+ delim[1] or return text
159
104
  context = l10n_gsub_context(text, prev, foll, delim) or return text
160
105
  (1...(context.size - 1)).each do |i|
161
106
  l10_context_valid?(context, i, delim, regexes) and
@@ -170,11 +115,12 @@ module IsoDoc
170
115
  d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
171
116
  context = text.split(/(#{d})/) # delim to replace
172
117
  context.size == 1 and return
173
- [prev, context, foll].flatten
118
+ [prev, context.reject(&:empty?), foll].flatten
174
119
  end
175
120
 
176
121
  def l10_context_valid?(context, idx, delim, regex)
177
122
  l10n_context_found_delimiter?(context[idx], delim) or return false
123
+ regex.nil? and return true
178
124
  regex.detect do |r|
179
125
  r[0].match?(context[0...idx].join) && # preceding context
180
126
  r[1].match?(context[(idx + 1)..-1].join) # foll context
@@ -189,13 +135,6 @@ module IsoDoc
189
135
  end
190
136
  end
191
137
 
192
- def l10n_zh_remove_space(text, prev, foll)
193
- text = l10n_gsub(text, prev, foll, [" ", ""],
194
- [[/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o]])
195
- l10n_gsub(text, prev, foll, [" ", ""],
196
- [[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o]])
197
- end
198
-
199
138
  def l10n_fr1(text, prev, foll, locale)
200
139
  text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
201
140
  [[/\p{Alnum}$/, /^(\s|$)/]])
@@ -206,30 +145,6 @@ module IsoDoc
206
145
  [[/\p{Alnum}$/, /^(\s|$)/]])
207
146
  end
208
147
 
209
- def self.cjk_extend(text)
210
- cjk_extend(text)
211
- end
212
-
213
- def cjk_extend(title)
214
- @c.decode(title).chars.map.with_index do |n, i|
215
- if i.zero? || !interleave_space_cjk?(title[i - 1] + title[i])
216
- n
217
- else "\u3000#{n}"
218
- end
219
- end.join
220
- end
221
-
222
- def interleave_space_cjk?(text)
223
- text.size == 2 or return
224
- ["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
225
- "\u22ef\u22ef"].include?(text) ||
226
- /\d\d|\p{Latin}\p{Latin}|[[:space:]]/.match?(text) ||
227
- /^[\u2018\u201c(\u3014\[{\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/.match?(text) ||
228
- /[\u2019\u201d)\u3015\]}\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/.match?(text) ||
229
- /[\u3002.\u3001,\u30fb:;\u2010\u301c\u30a0\u2013!?\u203c\u2047\u2048\u2049]/.match?(text) and return false
230
- true
231
- end
232
-
233
148
  def to_xml(node)
234
149
  node&.to_xml(encoding: "UTF-8", indent: 0,
235
150
  save_with: Nokogiri::XML::Node::SaveOptions::AS_XML)
@@ -0,0 +1,165 @@
1
+ module IsoDoc
2
+ class I18n
3
+ # Use comprehensive CJK definition from metanorma-utils
4
+ # This includes Han, Katakana, Hiragana, Hangul, Bopomofo
5
+ # and all CJK extensions
6
+ ZH_CHAR = "(#{Metanorma::Utils::CJK})".freeze
7
+ LATIN_PUNCT = /[:,.()\[\];?!-]/.freeze
8
+ # CJK character which is not punctuation
9
+ ZH_NON_PUNCT = "(#{
10
+ [
11
+ Metanorma::Utils.singleton_class::HAN,
12
+ Metanorma::Utils.singleton_class::HAN_IDC,
13
+ Metanorma::Utils.singleton_class::KANBUN,
14
+ Metanorma::Utils.singleton_class::CJK_COMPAT_IDEOGRAPHS,
15
+ Metanorma::Utils.singleton_class::HAN_COMPAT_IDEOGRAPHS,
16
+ Metanorma::Utils.singleton_class::HANGUL,
17
+ Metanorma::Utils.singleton_class::HIRAGANA,
18
+ Metanorma::Utils.singleton_class::KATAKANA,
19
+ Metanorma::Utils.singleton_class::BOPOMOFO,
20
+ ].join("|")})".freeze
21
+
22
+ # Condition for converting punctuation to double width,
23
+ # in case of options[:proportional_mixed_cjk]
24
+ # 1. (Strict condition) CJK before, CJK after, modulo ignorable characters:
25
+ # 1a. CJK character, or start of string. Latin spaces optional.
26
+ ZH1_PUNCT = /(#{ZH_CHAR}|^)(\s*)$/xo.freeze
27
+ # 1b. Latin spaces optional, Latin punct which will also convert to CJK,
28
+ # CJK character, or end of string.
29
+ ZH2_PUNCT = /^\s*#{LATIN_PUNCT}*(#{ZH_CHAR}|$)/xo.freeze
30
+ # 2. CJK before, space after:
31
+ # 2a. CJK char, followed by optional Latin punct which will also convert to CJK
32
+ ZH1_NO_SPACE = /#{ZH_CHAR}#{LATIN_PUNCT}*$/xo.freeze
33
+ # 2b. optional Latin punct which will also convert to CJK, then space
34
+ OPT_PUNCT_SPACE = /^($|#{LATIN_PUNCT}*\s)/xo.freeze
35
+
36
+ # Chinese numerals (common + formal/financial forms)
37
+ # Explicit characters needed because Chinese numeral ideographs
38
+ # are not tagged with Unicode Number property
39
+ # Using alternation instead of character class to properly include \p{N}
40
+ ZH_NUMERALS = "(?:[零一二三四五六七八九十百千万亿壹贰叁肆伍陆柒捌玖拾佰仟萬億兆]|\\p{N})".freeze
41
+
42
+ # Contexts for converting en-dashes to full-width
43
+ # Before: CJK or start of string, no digits
44
+ ZH1_DASH = /(#{ZH_CHAR}|^)(?<!=#{ZH_NUMERALS})$/xo.freeze
45
+ # After: no optional digits, CJK or end of string
46
+ ZH2_DASH = /^(?!#{ZH_NUMERALS})(#{ZH_CHAR}|$)/xo.freeze
47
+ # Before: context ends in a numeral (Chinese numeral or Unicode number)
48
+ ZH1_NUM_DASH = /#{ZH_NUMERALS}$/xo.freeze
49
+ # After: context starts with a numeral (Chinese numeral or Unicode number)
50
+ ZH2_NUM_DASH = /^#{ZH_NUMERALS}/xo.freeze
51
+
52
+ ZH_PUNCT_CONTEXTS =
53
+ [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE],
54
+ [/(\s|^)$/, /^#{ZH_CHAR}/o]].freeze
55
+
56
+ # map of YAML punct keys to auto-text Latin equivalents
57
+ ZH_PUNCT_AUTOTEXT = {
58
+ colon: ":",
59
+ comma: ",",
60
+ "enum-comma": ",",
61
+ semicolon: ";",
62
+ period: ".",
63
+ "close-paren": ")",
64
+ "open-paren": "(",
65
+ "close-bracket": "]",
66
+ "open-bracket": "[",
67
+ "question-mark": "?",
68
+ "exclamation-mark": "!",
69
+ "em-dash": "—",
70
+ "open-quote": "“",
71
+ "close-quote": "”",
72
+ "open-nested-quote": "’",
73
+ "close-nested-quote": "’",
74
+ ellipse: "…",
75
+ }.freeze
76
+
77
+ # Pre-defined punctuation mappings for efficiency
78
+ def init_zh_punct_map
79
+ ZH_PUNCT_AUTOTEXT.each_with_object([]) do |(k, v), m|
80
+ @labels.dig("punct", k.to_s) or next
81
+ m << [v, @labels["punct"][k.to_s], ZH_PUNCT_CONTEXTS]
82
+ end
83
+ end
84
+
85
+ def l10n_zh(text, script, options)
86
+ script ||= "Hans"
87
+ t, text_cache, xml, prev, _foll, esc_indices = l10n_prep(text, options)
88
+ t.each_with_index do |n, i|
89
+ next if esc_indices.include?(i) # Skip escaped nodes
90
+
91
+ # Adjust index if prev context prepended
92
+ prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
93
+ text = cleanup_entities(n.text, is_xml: false)
94
+ n.replace(l10_zh1(text, prev_ctx, foll_ctx, script, options))
95
+ end
96
+ to_xml(xml) #.gsub(/<\/?em>|<\/?strong>|<\/?i>|<\/?b>/, "")
97
+ end
98
+
99
+ # note: we can't differentiate comma from enumeration comma 、
100
+ # def l10_zh1(text, _script)
101
+ def l10_zh1(text, prev, foll, _script, options)
102
+ r = l10n_zh_punct(text, prev, foll, options)
103
+ r = l10n_zh_remove_space(r, prev, foll)
104
+ l10n_zh_dash(r, prev, foll)
105
+ end
106
+
107
+ def l10n_zh_punct(text, prev, foll, options)
108
+ # Use pre-defined mapping for better performance
109
+ @zh_punct_map ||= init_zh_punct_map
110
+ @zh_punct_map.each do |mapping|
111
+ punct_from, punct_to, regexes = mapping
112
+ options[:proportional_mixed_cjk] or regexes = nil
113
+ text = l10n_gsub(text, prev, foll, [punct_from, punct_to],
114
+ regexes)
115
+ end
116
+ text
117
+ end
118
+
119
+ def l10n_zh_dash(text, prev, foll)
120
+ text = l10n_gsub(text, prev, foll, ["–", @labels.dig("punct", "en-dash")],
121
+ [[ZH1_DASH, ZH2_DASH]])
122
+ l10n_gsub(text, prev, foll, ["–", @labels.dig("punct", "number-en-dash")],
123
+ [[ZH1_NUM_DASH, ZH2_NUM_DASH]])
124
+ end
125
+
126
+ def l10n_zh_remove_space(text, prev, foll)
127
+ text = l10n_gsub(text, prev, foll, [/\s+/, ""],
128
+ [[/(#{ZH_CHAR})$/o, /^#{ZH_CHAR}/o]])
129
+ if sep = @labels.dig("punct", "cjk-latin-separator")
130
+ # Skip over punctuation to find Latin letters/numbers
131
+ text = l10n_gsub(text, prev, foll, [/\s+/, sep],
132
+ [[/#{ZH_CHAR}$/o, /^\p{P}*[\p{Latin}\p{N}]/o]])
133
+ l10n_gsub(text, prev, foll, [/\s+/, sep],
134
+ [[/[\p{Latin}\p{N}]\p{P}*$/o, /^#{ZH_NON_PUNCT}/o]])
135
+ else
136
+ l10n_gsub(text, prev, foll, [/\s+/, ""],
137
+ [[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o]])
138
+ end
139
+ end
140
+
141
+ def self.cjk_extend(text)
142
+ cjk_extend(text)
143
+ end
144
+
145
+ def cjk_extend(title)
146
+ @c.decode(title).chars.map.with_index do |n, i|
147
+ if i.zero? || !interleave_space_cjk?(title[i - 1] + title[i])
148
+ n
149
+ else "\u3000#{n}"
150
+ end
151
+ end.join
152
+ end
153
+
154
+ def interleave_space_cjk?(text)
155
+ text.size == 2 or return
156
+ ["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
157
+ "\u22ef\u22ef"].include?(text) ||
158
+ /\d\d|\p{Latin}\p{Latin}|[[:space:]]/.match?(text) ||
159
+ /^[\u2018\u201c(\u3014\[{\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/.match?(text) ||
160
+ /[\u2019\u201d)\u3015\]}\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/.match?(text) ||
161
+ /[\u3002.\u3001,\u30fb:;\u2010\u301c\u30a0\u2013!?\u203c\u2047\u2048\u2049]/.match?(text) and return false
162
+ true
163
+ end
164
+ end
165
+ end
@@ -2,3 +2,24 @@ text: text
2
2
  at: at
3
3
  binary_and: "%1 <conn>and</conn> %2"
4
4
  multiple_and: "%1<conn>, and</conn> %2"
5
+ punct:
6
+ colon: ":"
7
+ comma: ","
8
+ enum_comma: ","
9
+ semicolon: ";"
10
+ period: "."
11
+ close_paren: ")"
12
+ open_paren: "("
13
+ close_bracket: "]"
14
+ open_bracket: "["
15
+ question_mark: "?"
16
+ exclamation_mark: "!"
17
+ em_dash: "—"
18
+ en_dash: "–"
19
+ number_en_dash: "–"
20
+ open_quote: "“"
21
+ close_quote: "”"
22
+ open_nested_quote: "’"
23
+ close_nested_quote: "’"
24
+ ellipse: …
25
+
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isodoc-i18n
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-09-29 00:00:00.000000000 Z
11
+ date: 2025-10-13 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: base64
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: htmlentities
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -67,13 +81,13 @@ dependencies:
67
81
  - !ruby/object:Gem::Version
68
82
  version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
- name: base64
84
+ name: canon
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
87
  - - ">="
74
88
  - !ruby/object:Gem::Version
75
89
  version: '0'
76
- type: :runtime
90
+ type: :development
77
91
  prerelease: false
78
92
  version_requirements: !ruby/object:Gem::Requirement
79
93
  requirements:
@@ -234,20 +248,6 @@ dependencies:
234
248
  - - ">="
235
249
  - !ruby/object:Gem::Version
236
250
  version: '0'
237
- - !ruby/object:Gem::Dependency
238
- name: canon
239
- requirement: !ruby/object:Gem::Requirement
240
- requirements:
241
- - - ">="
242
- - !ruby/object:Gem::Version
243
- version: '0'
244
- type: :development
245
- prerelease: false
246
- version_requirements: !ruby/object:Gem::Requirement
247
- requirements:
248
- - - ">="
249
- - !ruby/object:Gem::Version
250
- version: '0'
251
251
  description: 'Internationalisation for Metanorma rendering
252
252
 
253
253
  '
@@ -271,6 +271,7 @@ files:
271
271
  - lib/isodoc/i18n.rb
272
272
  - lib/isodoc/i18n/version.rb
273
273
  - lib/isodoc/l10n.rb
274
+ - lib/isodoc/l10n_cjk.rb
274
275
  - lib/isodoc/liquid/liquid.rb
275
276
  homepage: https://github.com/metanorma/isodoc-i18n
276
277
  licenses: