isodoc-i18n 1.4.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 134575d665c75368d3640ef87c9c63a3f6fdc2bf668e8d8df233a3cb139fce6b
4
- data.tar.gz: ed1fc0e49c62f27b3199ba8357b2f1454309485efffbcb93ed30d51f01c36fe7
3
+ metadata.gz: 02e65f54740c83c1d9698623c18711219680d6541f88e2cd0aaa62d3595c103d
4
+ data.tar.gz: f57b0e7bf29a2d02576dda04b3d7b34ded97a42ec382ebdbc9caefb12a1aa17b
5
5
  SHA512:
6
- metadata.gz: f6e5ff44068372afc9ad75e1d6bff6483114a3424071c136a86f17745513ff5f31794c4555c9484eff556c50f638cb13c3ba712c4ae3e3b4d762685d1e888c2d
7
- data.tar.gz: 9c5131cb3dbb8800304530629dfe5b47a3d7b15695ff0762011998c5aa3d3273b7730dde057dd2f2f15e3f997c9466704be80d7ef9cd47a5055e93afc0a5ba32
6
+ metadata.gz: b0a869f0df1ef445e5f18d336d0d3f18968e7055444e81535d7da4219a7c24f4e370ea45e0033d1ed2766d8fc5ea2c577f74ea07c232fc6217ff236aacc65e8e
7
+ data.tar.gz: 801f9ccb9c81c372c77834ad3667b88491f0aa1429c588e1a8c1854fc5ca8f1a817607de4bd0c44ed1d02f2a3984d5c336714692cff31c860d6c14a658502e9b
data/isodoc-i18n.gemspec CHANGED
@@ -22,12 +22,13 @@ Gem::Specification.new do |spec|
22
22
  end
23
23
  spec.required_ruby_version = Gem::Requirement.new(">= 2.7.0")
24
24
 
25
+ spec.add_dependency "base64"
25
26
  spec.add_dependency "htmlentities", "~> 4.3.4"
26
27
  spec.add_dependency "liquid", "~> 5"
27
28
  spec.add_dependency "metanorma-utils", ">= 1.7.0"
28
29
  spec.add_dependency "twitter_cldr"
29
- spec.add_dependency "base64"
30
30
 
31
+ spec.add_development_dependency "canon", "= 0.1.3"
31
32
  spec.add_development_dependency "debug"
32
33
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
33
34
  spec.add_development_dependency "guard", "~> 2.14"
@@ -35,10 +36,9 @@ Gem::Specification.new do |spec|
35
36
  spec.add_development_dependency "rake", "~> 13.0"
36
37
  spec.add_development_dependency "rspec", "~> 3.6"
37
38
  spec.add_development_dependency "rubocop", "~> 1"
38
- spec.add_development_dependency "rubocop-performance"
39
+ spec.add_development_dependency "rubocop-performance"
39
40
  spec.add_development_dependency "simplecov", "~> 0.15"
40
41
  spec.add_development_dependency "timecop", "~> 0.9"
41
42
  spec.add_development_dependency "webmock"
42
- spec.add_development_dependency "canon"
43
43
  # spec.metadata["rubygems_mfa_required"] = "true"
44
44
  end
@@ -1,5 +1,5 @@
1
1
  module IsoDoc
2
2
  class I18n
3
- VERSION = "1.4.0".freeze
3
+ VERSION = "1.4.2".freeze
4
4
  end
5
5
  end
@@ -7,11 +7,80 @@ module IsoDoc
7
7
 
8
8
  def load_yaml(lang, script, i18nyaml = nil, i18nhash = nil)
9
9
  ret = load_yaml1(lang, script)
10
- i18nyaml and
11
- return normalise_hash(ret.deep_merge(YAML.load_file(i18nyaml)))
12
- i18nhash and return normalise_hash(ret.deep_merge(i18nhash))
10
+ if i18nyaml
11
+ Array(i18nyaml).compact.each do |y|
12
+ ret = ret.deep_merge(YAML.load_file(y))
13
+ end
14
+ return postprocess(ret)
15
+ end
16
+ i18nhash and return postprocess(ret.deep_merge(i18nhash))
17
+ postprocess(ret)
18
+ end
19
+
20
+ def postprocess(labels)
21
+ self_reference_resolve(normalise_hash(labels))
22
+ end
23
+
24
+ def self_reference_resolve(labels)
25
+ resolve_references(labels, labels)
26
+ end
13
27
 
14
- normalise_hash(ret)
28
+ def resolve_references(obj, labels)
29
+ case obj
30
+ when Hash
31
+ obj.transform_values { |v| resolve_references(v, labels) }
32
+ when Array
33
+ obj.map { |item| resolve_references(item, labels) }
34
+ when String
35
+ resolve_string_references(obj, labels)
36
+ else
37
+ obj
38
+ end
39
+ end
40
+
41
+ def resolve_string_references(str, labels)
42
+ # Match patterns like #{self["key"]["subkey"]} or #{self.key.subkey}
43
+ # Allow spaces around the self expression
44
+ str.gsub(/\#\{\s*self([^}]+?)\s*\}/) do |match|
45
+ path_expr = Regexp.last_match(1)
46
+ resolve_path(path_expr, labels, match)
47
+ end
48
+ end
49
+
50
+ def resolve_path(path_expr, labels, original_expr)
51
+ segments = parse_path(path_expr)
52
+ current = labels
53
+
54
+ segments.each do |segment|
55
+ case current
56
+ when Hash
57
+ current.key?(segment) or
58
+ raise "Self-reference error: Path '#{original_expr}' not found - key '#{segment}' does not exist"
59
+ current = current[segment]
60
+ when Array
61
+ index = segment.to_i
62
+ segment =~ /^\d+$/ && index >= 0 && index < current.length or
63
+ raise "Self-reference error: Path '#{original_expr}' not found - invalid array index '#{segment}'"
64
+ current = current[index]
65
+ else
66
+ raise "Self-reference error: Path '#{original_expr}' not found - cannot navigate through non-collection type"
67
+ end
68
+ end
69
+
70
+ current.to_s
71
+ end
72
+
73
+ def parse_path(path_expr)
74
+ # Split by dots and brackets while preserving the content
75
+ parts = path_expr.sub(/^\./, "").scan(/\.?([\w-]+)|\[([^\]]+)\]/)
76
+ parts.each_with_object([]) do |(dot_part, bracket_part), segments|
77
+ if dot_part
78
+ segments << dot_part
79
+ elsif bracket_part
80
+ segment = bracket_part.strip.gsub(/^["']|["']$/, "")
81
+ segments << segment
82
+ end
83
+ end
15
84
  end
16
85
 
17
86
  def normalise_hash(ret)
@@ -30,8 +99,8 @@ module IsoDoc
30
99
  def load_yaml1(lang, script)
31
100
  case lang
32
101
  when "zh"
33
- if script == "Hans" then load_yaml2("zh-Hans")
34
- else load_yaml2("en")
102
+ if script then load_yaml2("zh-#{script}")
103
+ else load_yaml2("zh-Hans")
35
104
  end
36
105
  else
37
106
  load_yaml2(lang)
@@ -54,5 +123,9 @@ module IsoDoc
54
123
  def set(key, val)
55
124
  @labels[key] = val
56
125
  end
126
+
127
  # Deep-merge +new_labels+ into @labels and keep the merged result
  # as the new @labels.
  def merge(new_labels)
    @labels = @labels.deep_merge(new_labels)
  end
57
130
  end
58
131
  end
data/lib/isodoc/i18n.rb CHANGED
@@ -27,7 +27,6 @@ module IsoDoc
27
27
  CJK_SCRIPTS = %w(Hans Hant Jpan Kore).freeze
28
28
 
29
29
  def liquid_init
30
- ::IsoDoc::I18n::Liquid.set(self)
31
30
  ::Liquid::Environment.default.register_filter(::IsoDoc::I18n::Liquid)
32
31
  end
33
32
 
@@ -50,6 +49,7 @@ module IsoDoc
50
49
 
51
50
  # populate with variables, Liquid, inflections, ordinals/spellout
52
51
  def populate(keys, vars = {})
52
+ ::IsoDoc::I18n::Liquid.set(self)
53
53
  ::Liquid::Template.parse(@labels.dig(*Array(keys)))
54
54
  .render(vars.merge("labels" => @labels))
55
55
  end
@@ -68,7 +68,9 @@ module IsoDoc
68
68
  end
69
69
 
70
70
  def enum_comma
71
- CJK_SCRIPTS.include?(@script) and return "<enum-comma>、</enum-comma>"
71
+ c = @labels.dig("punct", "enum-comma")
72
+ c && CJK_SCRIPTS.include?(@script) and
73
+ return "<enum-comma>#{c}</enum-comma>"
72
74
  "<enum-comma>,</enum-comma> "
73
75
  end
74
76
 
data/lib/isodoc/l10n.rb CHANGED
@@ -1,55 +1,23 @@
1
1
  require "metanorma-utils"
2
+ require_relative "l10n_cjk"
2
3
 
3
4
  module IsoDoc
4
5
  class I18n
5
- # Use comprehensive CJK definition from metanorma-utils
6
- # This includes Han, Katakana, Hiragana, Hangul, Bopomofo and all CJK extensions
7
- ZH_CHAR = "(#{Metanorma::Utils::CJK})".freeze
8
- LATIN_PUNCT = /[:,.()\[\];?!-]/.freeze
9
-
10
- # Condition for converting punctuation to double width:
11
- # 1. (Strict condition) CJK before, CJK after, modulo ignorable characters:
12
- # 1a. CJK character, or start of string. Latin spaces optional.
13
- ZH1_PUNCT = /(#{ZH_CHAR}|^)(\s*)$/xo.freeze
14
- # 1b. Latin spaces optional, Latin punct which will also convert to CJK,
15
- # CJK character, or end of string.
16
- ZH2_PUNCT = /^\s*#{LATIN_PUNCT}*(#{ZH_CHAR}|$)/xo.freeze
17
- # 2. CJK before, space after:
18
- # 2a. CJK char, followed by optional Latin punct which will also convert to CJK
19
- ZH1_NO_SPACE = /#{ZH_CHAR}#{LATIN_PUNCT}*$/xo.freeze
20
- # 2b. optional Latin punct which wil also convert to CJK, then space
21
- OPT_PUNCT_SPACE = /^($|#{LATIN_PUNCT}*\s)/xo.freeze
22
-
23
- # Contexts for converting en-dashes to full-width
24
- # Before: CJK or start of string, optional digits
25
- ZH1_DASH = /(#{ZH_CHAR}|^)(\d*)$/xo.freeze
26
- # After: optional digits, CJK or end of string
27
- ZH2_DASH = /^\d*(#{ZH_CHAR}|$)/xo.freeze
28
-
29
- # Pre-defined punctuation mappings for efficiency
30
- ZH_PUNCT_MAP = [
31
- ["::", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
32
- [",,", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
33
- [".。", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
34
- ["))", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
35
- ["]]", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
36
- [";;", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
37
- ["??", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
38
- ["!!", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
39
- ["((", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
40
- ["[[", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]]
41
- ].freeze
42
-
43
6
  def self.l10n(text, lang = @lang, script = @script, options = {})
44
7
  l10n(text, lang, script, options)
45
8
  end
46
9
 
47
10
  # function localising spaces and punctuation
48
11
  # options[:prev] and options[:foll] are optional context strings
12
+ # options[:proportional_mixed_cjk] allows contextual full-width vs
13
+ # half-width punctuation
49
14
  def l10n(text, lang = @lang, script = @script, options = {})
50
15
  locale = options[:locale] || @locale
51
- %w(zh ja ko).include?(lang) and text = l10n_zh(text, script, options[:prev], options[:foll])
52
- lang == "fr" && text = l10n_fr(text, locale || "FR", options[:prev], options[:foll])
16
+ %w(zh ja ko).include?(lang) and
17
+ text = l10n_zh(text, script, options)
18
+ lang == "fr" and
19
+ text = l10n_fr(text, locale || "FR", options)
20
+ text&.gsub!(/<esc>|<\/esc>/, "") # Strip esc tags
53
21
  bidiwrap(text, lang, script)
54
22
  end
55
23
 
@@ -71,30 +39,36 @@ module IsoDoc
71
39
  .default_script(@lang))]
72
40
  end
73
41
 
74
- # CJK
75
- def l10n_zh(text, script, prev, foll)
76
- script ||= "Hans"
77
- t, text_cache, xml = l10n_prep(text, prev, foll)
78
- t.each_with_index do |n, i|
79
- # Adjust index if prev context prepended
80
- prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
81
- text = cleanup_entities(n.text, is_xml: false)
82
- n.replace(l10_zh1(text, prev_ctx, foll_ctx, script))
83
- end
84
- to_xml(xml).gsub(/<b>|<\/b>|<\?[^>]+>/, "")
42
+ def l10n_prep(text, options)
43
+ xml = Nokogiri::XML::DocumentFragment.parse(text)
44
+ t = xml.xpath(".//text()").reject { |node| node.text.empty? }
45
+ text_cache = build_text_cache(t, options[:prev], options[:foll])
46
+
47
+ # Find all text nodes within <esc> tags in one XPath query
48
+ # This is O(n) instead of O(n*m) where m is tree depth
49
+ esc_indices = build_esc_indices(xml, t)
50
+
51
+ [t, text_cache, xml, options[:prev], options[:foll], esc_indices]
85
52
  end
86
53
 
87
- def l10n_prep(text, prev, foll)
88
- xml = Nokogiri::XML::DocumentFragment.parse(text)
89
- t = xml.xpath(".//text()")
90
- text_cache = build_text_cache(t, prev, foll)
91
- [t, text_cache, xml]
54
+ # Build set of indices for text nodes within <esc> tags
55
+ # Handles both namespaced and non-namespaced <esc> elements
56
+ def build_esc_indices(xml, text_nodes)
57
+ # Try both non-namespaced and namespace-agnostic queries
58
+ esc_text_nodes = Set.new(xml.xpath(".//esc//text()") +
59
+ xml.xpath(".//*[local-name()='esc']//text()"))
60
+ Set.new.tap do |indices|
61
+ text_nodes.each_with_index do |node, i|
62
+ indices.add(i) if esc_text_nodes.include?(node)
63
+ end
64
+ end
92
65
  end
93
66
 
94
67
  # Cache text content once per method call to avoid repeated .text calls
95
68
  # Build text cache with optional prepended/appended context
69
+ # Also, reduce multiple spaces to single, to avoid misrecognition of space
96
70
  def build_text_cache(text_nodes, prev_context = nil, foll_context = nil)
97
- text_cache = text_nodes.map(&:text)
71
+ text_cache = text_nodes.map(&:text).map { |x| x.gsub(/\s+/, " ") }
98
72
  text_cache.unshift(prev_context) if prev_context
99
73
  text_cache.push(foll_context) if foll_context
100
74
  text_cache
@@ -117,9 +91,11 @@ module IsoDoc
117
91
  [prev, foll]
118
92
  end
119
93
 
120
- def l10n_fr(text, locale, prev, foll)
121
- t, text_cache, xml = l10n_prep(text, prev, foll)
94
+ def l10n_fr(text, locale, options)
95
+ t, text_cache, xml, prev, _foll, esc_indices = l10n_prep(text, options)
122
96
  t.each_with_index do |n, i|
97
+ next if esc_indices.include?(i) # Skip escaped nodes
98
+
123
99
  prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
124
100
  text = cleanup_entities(n.text, is_xml: false)
125
101
  n.replace(l10n_fr1(text, prev_ctx, foll_ctx, locale))
@@ -127,35 +103,15 @@ module IsoDoc
127
103
  to_xml(xml)
128
104
  end
129
105
 
130
- # note: we can't differentiate comma from enumeration comma 、
131
- # def l10_zh1(text, _script)
132
- def l10_zh1(text, prev, foll, _script)
133
- r = l10n_zh_punct(text, prev, foll)
134
- r = l10n_zh_remove_space(r, prev, foll)
135
- l10n_zh_dash(r, prev, foll)
136
- end
137
-
138
- def l10n_zh_punct(text, prev, foll)
139
- # Use pre-defined mapping for better performance
140
- ZH_PUNCT_MAP.each do |mapping|
141
- punct_pair, regexes = mapping
142
- text = l10n_gsub(text, prev, foll, [punct_pair[0], punct_pair[1]], regexes)
143
- end
144
- text
145
- end
146
-
147
- def l10n_zh_dash(text, prev, foll)
148
- l10n_gsub(text, prev, foll, %w(– ~), [[ZH1_DASH, ZH2_DASH]])
149
- end
150
-
151
106
  # text: string we are scanning for instances of delim[0] to replace
152
107
  # prev: string preceding text, as additional token of context
153
108
  # foll: string following text, as additional token of context
154
109
  # delim: delim[0] is the symbol we want to replace, delim[1] its replacement
155
110
  # regexes: a list of regex pairs: the context before the found token,
156
111
  # and the context after the found token, under which replacing it
157
- # with delim[1] is permitted
112
+ # with delim[1] is permitted. If regex is nil, always allow the replacement
158
113
  def l10n_gsub(text, prev, foll, delim, regexes)
114
+ delim[1] or return text
159
115
  context = l10n_gsub_context(text, prev, foll, delim) or return text
160
116
  (1...(context.size - 1)).each do |i|
161
117
  l10_context_valid?(context, i, delim, regexes) and
@@ -170,11 +126,12 @@ module IsoDoc
170
126
  d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
171
127
  context = text.split(/(#{d})/) # delim to replace
172
128
  context.size == 1 and return
173
- [prev, context, foll].flatten
129
+ [prev, context.reject(&:empty?), foll].flatten
174
130
  end
175
131
 
176
132
  def l10_context_valid?(context, idx, delim, regex)
177
133
  l10n_context_found_delimiter?(context[idx], delim) or return false
134
+ regex.nil? and return true
178
135
  regex.detect do |r|
179
136
  r[0].match?(context[0...idx].join) && # preceding context
180
137
  r[1].match?(context[(idx + 1)..-1].join) # foll context
@@ -189,13 +146,6 @@ module IsoDoc
189
146
  end
190
147
  end
191
148
 
192
- def l10n_zh_remove_space(text, prev, foll)
193
- text = l10n_gsub(text, prev, foll, [" ", ""],
194
- [[/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o]])
195
- l10n_gsub(text, prev, foll, [" ", ""],
196
- [[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o]])
197
- end
198
-
199
149
  def l10n_fr1(text, prev, foll, locale)
200
150
  text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
201
151
  [[/\p{Alnum}$/, /^(\s|$)/]])
@@ -206,30 +156,6 @@ module IsoDoc
206
156
  [[/\p{Alnum}$/, /^(\s|$)/]])
207
157
  end
208
158
 
209
- def self.cjk_extend(text)
210
- cjk_extend(text)
211
- end
212
-
213
- def cjk_extend(title)
214
- @c.decode(title).chars.map.with_index do |n, i|
215
- if i.zero? || !interleave_space_cjk?(title[i - 1] + title[i])
216
- n
217
- else "\u3000#{n}"
218
- end
219
- end.join
220
- end
221
-
222
- def interleave_space_cjk?(text)
223
- text.size == 2 or return
224
- ["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
225
- "\u22ef\u22ef"].include?(text) ||
226
- /\d\d|\p{Latin}\p{Latin}|[[:space:]]/.match?(text) ||
227
- /^[\u2018\u201c(\u3014\[{\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/.match?(text) ||
228
- /[\u2019\u201d)\u3015\]}\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/.match?(text) ||
229
- /[\u3002.\u3001,\u30fb:;\u2010\u301c\u30a0\u2013!?\u203c\u2047\u2048\u2049]/.match?(text) and return false
230
- true
231
- end
232
-
233
159
  def to_xml(node)
234
160
  node&.to_xml(encoding: "UTF-8", indent: 0,
235
161
  save_with: Nokogiri::XML::Node::SaveOptions::AS_XML)
@@ -0,0 +1,165 @@
1
+ module IsoDoc
2
+ class I18n
3
+ # Use comprehensive CJK definition from metanorma-utils
4
+ # This includes Han, Katakana, Hiragana, Hangul, Bopomofo
5
+ # and all CJK extensions
6
+ ZH_CHAR = "(#{Metanorma::Utils::CJK})".freeze
7
+ LATIN_PUNCT = /[:,.()\[\];?!-]/.freeze
8
+ # CJK character which is not punctuation
9
+ ZH_NON_PUNCT = "(#{
10
+ [
11
+ Metanorma::Utils.singleton_class::HAN,
12
+ Metanorma::Utils.singleton_class::HAN_IDC,
13
+ Metanorma::Utils.singleton_class::KANBUN,
14
+ Metanorma::Utils.singleton_class::CJK_COMPAT_IDEOGRAPHS,
15
+ Metanorma::Utils.singleton_class::HAN_COMPAT_IDEOGRAPHS,
16
+ Metanorma::Utils.singleton_class::HANGUL,
17
+ Metanorma::Utils.singleton_class::HIRAGANA,
18
+ Metanorma::Utils.singleton_class::KATAKANA,
19
+ Metanorma::Utils.singleton_class::BOPOMOFO,
20
+ ].join("|")})".freeze
21
+
22
+ # Condition for converting punctuation to double width,
23
+ # in case of options[:proportional_mixed_cjk]
24
+ # 1. (Strict condition) CJK before, CJK after, modulo ignorable characters:
25
+ # 1a. CJK character, or start of string. Latin spaces optional.
26
+ ZH1_PUNCT = /(#{ZH_CHAR}|^)(\s*)$/xo.freeze
27
+ # 1b. Latin spaces optional, Latin punct which will also convert to CJK,
28
+ # CJK character, or end of string.
29
+ ZH2_PUNCT = /^\s*#{LATIN_PUNCT}*(#{ZH_CHAR}|$)/xo.freeze
30
+ # 2. CJK before, space after:
31
+ # 2a. CJK char, followed by optional Latin punct which will also convert to CJK
32
+ ZH1_NO_SPACE = /#{ZH_CHAR}#{LATIN_PUNCT}*$/xo.freeze
33
+ # 2b. optional Latin punct which will also convert to CJK, then space
34
+ OPT_PUNCT_SPACE = /^($|#{LATIN_PUNCT}*\s)/xo.freeze
35
+
36
+ # Chinese numerals (common + formal/financial forms)
37
+ # Explicit characters needed because Chinese numeral ideographs
38
+ # are not tagged with Unicode Number property
39
+ # Using alternation instead of character class to properly include \p{N}
40
+ ZH_NUMERALS = "(?:[零一二三四五六七八九十百千万亿壹贰叁肆伍陆柒捌玖拾佰仟萬億兆]|\\p{N})".freeze
41
+
42
+ # Contexts for converting en-dashes to full-width
43
+ # Before: CJK or start of string, no digits
44
+ ZH1_DASH = /(#{ZH_CHAR}|^)(?<!=#{ZH_NUMERALS})$/xo.freeze
45
+ # After: no optional digits, CJK or end of string
46
+ ZH2_DASH = /^(?!#{ZH_NUMERALS})(#{ZH_CHAR}|$)/xo.freeze
47
+ # Before: CJK or start of string, optional digits
48
+ ZH1_NUM_DASH = /#{ZH_NUMERALS}$/xo.freeze
49
+ # After: optional digits, CJK or end of string
50
+ ZH2_NUM_DASH = /^#{ZH_NUMERALS}/xo.freeze
51
+
52
+ ZH_PUNCT_CONTEXTS =
53
+ [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE],
54
+ [/(\s|^)$/, /^#{ZH_CHAR}/o]].freeze
55
+
56
+ # map of YAML punct keys to auto-text Latin equivalents
57
+ ZH_PUNCT_AUTOTEXT = {
58
+ colon: ":",
59
+ comma: ",",
60
+ # "enum-comma": ",", # enum-comma is ambiguous with comma
61
+ semicolon: ";",
62
+ period: ".",
63
+ "close-paren": ")",
64
+ "open-paren": "(",
65
+ "close-bracket": "]",
66
+ "open-bracket": "[",
67
+ "question-mark": "?",
68
+ "exclamation-mark": "!",
69
+ "em-dash": "—",
70
+ "open-quote": "“",
71
+ "close-quote": "”",
72
+ "open-nested-quote": "’",
73
+ "close-nested-quote": "’",
74
+ ellipse: "…",
75
+ }.freeze
76
+
77
+ # Pre-defined punctuation mappings for efficiency
78
+ def init_zh_punct_map
79
+ ZH_PUNCT_AUTOTEXT.each_with_object([]) do |(k, v), m|
80
+ @labels.dig("punct", k.to_s) or next
81
+ m << [v, @labels["punct"][k.to_s], ZH_PUNCT_CONTEXTS]
82
+ end
83
+ end
84
+
85
+ def l10n_zh(text, script, options)
86
+ script ||= "Hans"
87
+ t, text_cache, xml, prev, _foll, esc_indices = l10n_prep(text, options)
88
+ t.each_with_index do |n, i|
89
+ next if esc_indices.include?(i) # Skip escaped nodes
90
+
91
+ # Adjust index if prev context prepended
92
+ prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
93
+ text = cleanup_entities(n.text, is_xml: false)
94
+ n.replace(l10_zh1(text, prev_ctx, foll_ctx, script, options))
95
+ end
96
+ to_xml(xml) #.gsub(/<\/?em>|<\/?strong>|<\/?i>|<\/?b>/, "")
97
+ end
98
+
99
+ # note: we can't differentiate comma from enumeration comma 、
100
+ # def l10_zh1(text, _script)
101
+ def l10_zh1(text, prev, foll, _script, options)
102
+ r = l10n_zh_punct(text, prev, foll, options)
103
+ r = l10n_zh_remove_space(r, prev, foll)
104
+ l10n_zh_dash(r, prev, foll)
105
+ end
106
+
107
+ def l10n_zh_punct(text, prev, foll, options)
108
+ # Use pre-defined mapping for better performance
109
+ @zh_punct_map ||= init_zh_punct_map
110
+ @zh_punct_map.each do |mapping|
111
+ punct_from, punct_to, regexes = mapping
112
+ options[:proportional_mixed_cjk] or regexes = nil
113
+ text = l10n_gsub(text, prev, foll, [punct_from, punct_to],
114
+ regexes)
115
+ end
116
+ text
117
+ end
118
+
119
+ def l10n_zh_dash(text, prev, foll)
120
+ text = l10n_gsub(text, prev, foll, ["–", @labels.dig("punct", "en-dash")],
121
+ [[ZH1_DASH, ZH2_DASH]])
122
+ l10n_gsub(text, prev, foll, ["–", @labels.dig("punct", "number-en-dash")],
123
+ [[ZH1_NUM_DASH, ZH2_NUM_DASH]])
124
+ end
125
+
126
+ def l10n_zh_remove_space(text, prev, foll)
127
+ text = l10n_gsub(text, prev, foll, [/\s+/, ""],
128
+ [[/(#{ZH_CHAR})$/o, /^#{ZH_CHAR}/o]])
129
+ if sep = @labels.dig("punct", "cjk-latin-separator")
130
+ # Skip over punctuation to find Latin letters/numbers
131
+ text = l10n_gsub(text, prev, foll, [/\s+/, sep],
132
+ [[/#{ZH_CHAR}$/o, /^\p{P}*[\p{Latin}\p{N}]/o]])
133
+ l10n_gsub(text, prev, foll, [/\s+/, sep],
134
+ [[/[\p{Latin}\p{N}]\p{P}*$/o, /^#{ZH_NON_PUNCT}/o]])
135
+ else
136
+ l10n_gsub(text, prev, foll, [/\s+/, ""],
137
+ [[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o]])
138
+ end
139
+ end
140
+
141
  # Class-level entry point with the same signature as the instance
  # method #cjk_extend.
  # NOTE(review): inside a `def self.` method the unqualified call below
  # resolves to this same class method, not to the instance method, so as
  # written it looks like unbounded recursion - confirm whether this is
  # ever called.
  def self.cjk_extend(text)
    cjk_extend(text)
  end
144
+
145
+ def cjk_extend(title)
146
+ @c.decode(title).chars.map.with_index do |n, i|
147
+ if i.zero? || !interleave_space_cjk?(title[i - 1] + title[i])
148
+ n
149
+ else "\u3000#{n}"
150
+ end
151
+ end.join
152
+ end
153
+
154
+ def interleave_space_cjk?(text)
155
+ text.size == 2 or return
156
+ ["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
157
+ "\u22ef\u22ef"].include?(text) ||
158
+ /\d\d|\p{Latin}\p{Latin}|[[:space:]]/.match?(text) ||
159
+ /^[\u2018\u201c(\u3014\[{\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/.match?(text) ||
160
+ /[\u2019\u201d)\u3015\]}\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/.match?(text) ||
161
+ /[\u3002.\u3001,\u30fb:;\u2010\u301c\u30a0\u2013!?\u203c\u2047\u2048\u2049]/.match?(text) and return false
162
+ true
163
+ end
164
+ end
165
+ end
@@ -2,3 +2,24 @@ text: text
2
2
  at: at
3
3
  binary_and: "%1 <conn>and</conn> %2"
4
4
  multiple_and: "%1<conn>, and</conn> %2"
5
+ punct:
6
+ colon: ":"
7
+ comma: ","
8
+ enum_comma: ","
9
+ semicolon: ";"
10
+ period: "."
11
+ close_paren: ")"
12
+ open_paren: "("
13
+ close_bracket: "]"
14
+ open_bracket: "["
15
+ question_mark: "?"
16
+ exclamation_mark: "!"
17
+ em_dash: "—"
18
+ en_dash: "–"
19
+ number_en_dash: "–"
20
+ open_quote: "“"
21
+ close_quote: "”"
22
+ open_nested_quote: "’"
23
+ close_nested_quote: "’"
24
+ ellipse: …
25
+
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isodoc-i18n
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-09-29 00:00:00.000000000 Z
11
+ date: 2025-11-03 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: base64
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: htmlentities
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -67,19 +81,19 @@ dependencies:
67
81
  - !ruby/object:Gem::Version
68
82
  version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
- name: base64
84
+ name: canon
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
- - - ">="
87
+ - - '='
74
88
  - !ruby/object:Gem::Version
75
- version: '0'
76
- type: :runtime
89
+ version: 0.1.3
90
+ type: :development
77
91
  prerelease: false
78
92
  version_requirements: !ruby/object:Gem::Requirement
79
93
  requirements:
80
- - - ">="
94
+ - - '='
81
95
  - !ruby/object:Gem::Version
82
- version: '0'
96
+ version: 0.1.3
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: debug
85
99
  requirement: !ruby/object:Gem::Requirement
@@ -234,20 +248,6 @@ dependencies:
234
248
  - - ">="
235
249
  - !ruby/object:Gem::Version
236
250
  version: '0'
237
- - !ruby/object:Gem::Dependency
238
- name: canon
239
- requirement: !ruby/object:Gem::Requirement
240
- requirements:
241
- - - ">="
242
- - !ruby/object:Gem::Version
243
- version: '0'
244
- type: :development
245
- prerelease: false
246
- version_requirements: !ruby/object:Gem::Requirement
247
- requirements:
248
- - - ">="
249
- - !ruby/object:Gem::Version
250
- version: '0'
251
251
  description: 'Internationalisation for Metanorma rendering
252
252
 
253
253
  '
@@ -271,6 +271,7 @@ files:
271
271
  - lib/isodoc/i18n.rb
272
272
  - lib/isodoc/i18n/version.rb
273
273
  - lib/isodoc/l10n.rb
274
+ - lib/isodoc/l10n_cjk.rb
274
275
  - lib/isodoc/liquid/liquid.rb
275
276
  homepage: https://github.com/metanorma/isodoc-i18n
276
277
  licenses: