isodoc-i18n 1.3.2 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a242d0bc7609246ea502ffec35e515457e739c478b25478a01c7ff641dc4523
4
- data.tar.gz: 0f89c271766d244f037a0b6c6f40431888c2b6140814ac2b6e4d2906e142b1d5
3
+ metadata.gz: e08e5e4a6c9b89a5f628ee426ce22f8bcdbae97e7cccd4411f4962844d29b231
4
+ data.tar.gz: 9d8cbc5526c1d8aabe9db00c25deeb581a3604358b50b18e6f5a9bb7faab6909
5
5
  SHA512:
6
- metadata.gz: cb3f3f3a28b1b8fddd35ff74290ccf62f905ef54d51d62efd4fd8000d2d38a389eb4c78ec02aefd83ab0d1d98aaec462a0138c51257f8548c07f4a4726854c38
7
- data.tar.gz: f6dcb2d4c05c630527d02888bf614a0e09396166e2f27437dedc0cc0a619149c6606fe2a8a78ce3c98948d4279ef3bfa224d9c1b18fbb1d38e74d92d280b74db
6
+ metadata.gz: 3880d68a1f094ab500840e74767f970af57fca01a12ae751da68e9cdc72ba00df9139516df3c74dda5059114fe723b800c057c665efdd485e44aa9a247cdde22
7
+ data.tar.gz: 80643511d523a3535293a2b7b82e60fe81a8910dbe9fc336cf8a4ab95a9763b1adf1375f802f5b36177a6e4570c51688b803576f7d2d39b8fc7c29e894d2f235
data/README.adoc CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  image:https://img.shields.io/gem/v/isodoc-i18n.svg["Gem Version", link="https://rubygems.org/gems/isodoc-i18n"]
4
4
  image:https://github.com/metanorma/isodoc-i18n/workflows/rake/badge.svg["Build Status", link="https://github.com/metanorma/isodoc-i18n/actions?query=workflow%3Arake"]
5
- image:https://codeclimate.com/github/metanorma/isodoc-i18n/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/isodoc-i18n"]
5
+ // image:https://codeclimate.com/github/metanorma/isodoc-i18n/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/isodoc-i18n"]
6
6
  image:https://img.shields.io/github/issues-pr-raw/metanorma/isodoc-i18n.svg["Pull Requests", link="https://github.com/metanorma/isodoc-i18n/pulls"]
7
7
  image:https://img.shields.io/github/commits-since/metanorma/isodoc-i18n/latest.svg["Commits since latest",link="https://github.com/metanorma/isodoc-i18n/releases"]
8
8
 
data/isodoc-i18n.gemspec CHANGED
@@ -22,22 +22,23 @@ Gem::Specification.new do |spec|
22
22
  end
23
23
  spec.required_ruby_version = Gem::Requirement.new(">= 2.7.0")
24
24
 
25
+ spec.add_dependency "base64"
25
26
  spec.add_dependency "htmlentities", "~> 4.3.4"
26
27
  spec.add_dependency "liquid", "~> 5"
27
28
  spec.add_dependency "metanorma-utils", ">= 1.7.0"
28
29
  spec.add_dependency "twitter_cldr"
29
- spec.add_dependency "base64"
30
30
 
31
+ spec.add_development_dependency "canon"
31
32
  spec.add_development_dependency "debug"
32
33
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
33
34
  spec.add_development_dependency "guard", "~> 2.14"
34
35
  spec.add_development_dependency "guard-rspec", "~> 4.7"
35
36
  spec.add_development_dependency "rake", "~> 13.0"
36
37
  spec.add_development_dependency "rspec", "~> 3.6"
37
- spec.add_development_dependency "rubocop", "~> 1.5.2"
38
+ spec.add_development_dependency "rubocop", "~> 1"
39
+ spec.add_development_dependency "rubocop-performance"
38
40
  spec.add_development_dependency "simplecov", "~> 0.15"
39
41
  spec.add_development_dependency "timecop", "~> 0.9"
40
42
  spec.add_development_dependency "webmock"
41
- spec.add_development_dependency "xml-c14n"
42
43
  # spec.metadata["rubygems_mfa_required"] = "true"
43
44
  end
@@ -1,5 +1,5 @@
1
1
  module IsoDoc
2
2
  class I18n
3
- VERSION = "1.3.2".freeze
3
+ VERSION = "1.4.1".freeze
4
4
  end
5
5
  end
@@ -8,10 +8,79 @@ module IsoDoc
8
8
  def load_yaml(lang, script, i18nyaml = nil, i18nhash = nil)
9
9
  ret = load_yaml1(lang, script)
10
10
  i18nyaml and
11
- return normalise_hash(ret.deep_merge(YAML.load_file(i18nyaml)))
12
- i18nhash and return normalise_hash(ret.deep_merge(i18nhash))
11
+ return postprocess(ret.deep_merge(YAML.load_file(i18nyaml)))
12
+ i18nhash and return postprocess(ret.deep_merge(i18nhash))
13
13
 
14
- normalise_hash(ret)
14
+ postprocess(ret)
15
+ end
16
+
17
+ def postprocess(labels)
18
+ self_reference_resolve(normalise_hash(labels))
19
+ end
20
+
21
+ def self_reference_resolve(labels)
22
+ resolve_references(labels, labels)
23
+ end
24
+
25
+ def resolve_references(obj, labels)
26
+ case obj
27
+ when Hash
28
+ obj.transform_values { |v| resolve_references(v, labels) }
29
+ when Array
30
+ obj.map { |item| resolve_references(item, labels) }
31
+ when String
32
+ resolve_string_references(obj, labels)
33
+ else
34
+ obj
35
+ end
36
+ end
37
+
38
+ def resolve_string_references(str, labels)
39
+ # Match patterns like #{self["key"]["subkey"]} or #{self.key.subkey}
40
+ # Allow spaces around the self expression
41
+ str.gsub(/\#\{\s*self([^\}]+?)\s*\}/) do |match|
42
+ path_expr = Regexp.last_match(1)
43
+ resolve_path(path_expr, labels, match)
44
+ end
45
+ end
46
+
47
+ def resolve_path(path_expr, labels, original_expr)
48
+ segments = parse_path(path_expr)
49
+ current = labels
50
+
51
+ segments.each do |segment|
52
+ case current
53
+ when Hash
54
+ current.key?(segment) or
55
+ raise "Self-reference error: Path '#{original_expr}' not found - key '#{segment}' does not exist"
56
+ current = current[segment]
57
+ when Array
58
+ index = segment.to_i
59
+ segment =~ /^\d+$/ && index >= 0 && index < current.length or
60
+ raise "Self-reference error: Path '#{original_expr}' not found - invalid array index '#{segment}'"
61
+ current = current[index]
62
+ else
63
+ raise "Self-reference error: Path '#{original_expr}' not found - cannot navigate through non-collection type"
64
+ end
65
+ end
66
+
67
+ current.to_s
68
+ end
69
+
70
+ def parse_path(path_expr)
71
+ segments = []
72
+ path_expr = path_expr.sub(/^\./, "")
73
+ # Split by dots and brackets while preserving the content
74
+ parts = path_expr.scan(/\.?([\w-]+)|\[([^\]]+)\]/)
75
+ parts.each do |dot_part, bracket_part|
76
+ if dot_part
77
+ segments << dot_part
78
+ elsif bracket_part
79
+ segment = bracket_part.strip.gsub(/^["']|["']$/, "")
80
+ segments << segment
81
+ end
82
+ end
83
+ segments
15
84
  end
16
85
 
17
86
  def normalise_hash(ret)
@@ -30,8 +99,8 @@ module IsoDoc
30
99
  def load_yaml1(lang, script)
31
100
  case lang
32
101
  when "zh"
33
- if script == "Hans" then load_yaml2("zh-Hans")
34
- else load_yaml2("en")
102
+ if script then load_yaml2("zh-#{script}")
103
+ else load_yaml2("zh-Hans")
35
104
  end
36
105
  else
37
106
  load_yaml2(lang)
data/lib/isodoc/i18n.rb CHANGED
@@ -24,6 +24,8 @@ module IsoDoc
24
24
  self
25
25
  end
26
26
 
27
+ CJK_SCRIPTS = %w(Hans Hant Jpan Kore).freeze
28
+
27
29
  def liquid_init
28
30
  ::IsoDoc::I18n::Liquid.set(self)
29
31
  ::Liquid::Environment.default.register_filter(::IsoDoc::I18n::Liquid)
@@ -66,7 +68,9 @@ module IsoDoc
66
68
  end
67
69
 
68
70
  def enum_comma
69
- %w(Hans Hant).include?(@script) and return "<enum-comma>、</enum-comma>"
71
+ c = @labels.dig("punct", "enum-comma")
72
+ c && CJK_SCRIPTS.include?(@script) and
73
+ return "<enum-comma>#{c}</enum-comma>"
70
74
  "<enum-comma>,</enum-comma> "
71
75
  end
72
76
 
data/lib/isodoc/l10n.rb CHANGED
@@ -1,13 +1,23 @@
1
+ require "metanorma-utils"
2
+ require_relative "l10n_cjk"
3
+
1
4
  module IsoDoc
2
5
  class I18n
3
- def self.l10n(text, lang = @lang, script = @script, locale = @locale)
4
- l10n(text, lang, script, locale)
5
- end
6
-
7
- # function localising spaces and punctuation.
8
- def l10n(text, lang = @lang, script = @script, locale = @locale)
9
- %w(zh ja ko).include?(lang) and text = l10n_zh(text, script)
10
- lang == "fr" && text = l10n_fr(text, locale || "FR")
6
+ def self.l10n(text, lang = @lang, script = @script, options = {})
7
+ l10n(text, lang, script, options)
8
+ end
9
+
10
+ # function localising spaces and punctuation
11
+ # options[:prev] and options[:foll] are optional context strings
12
+ # options[:proportional_mixed_cjk] allows contextual full-width vs
13
+ # half-width punctuation
14
+ def l10n(text, lang = @lang, script = @script, options = {})
15
+ locale = options[:locale] || @locale
16
+ %w(zh ja ko).include?(lang) and
17
+ text = l10n_zh(text, script, options)
18
+ lang == "fr" and
19
+ text = l10n_fr(text, locale || "FR", options)
20
+ text&.gsub!(/<esc>|<\/esc>/, "") # Strip esc tags
11
21
  bidiwrap(text, lang, script)
12
22
  end
13
23
 
@@ -29,145 +39,110 @@ module IsoDoc
29
39
  .default_script(@lang))]
30
40
  end
31
41
 
32
- # CJK
33
- def l10n_zh(text, script = "Hans")
42
+ def l10n_prep(text, options)
34
43
  xml = Nokogiri::XML::DocumentFragment.parse(text)
35
- t = xml.xpath(".//text()")
36
- t.each_with_index do |n, i|
37
- prev, foll = l10n_context(t, i)
38
- text = cleanup_entities(n.text, is_xml: false)
39
- n.replace(l10_zh1(text, prev, foll, script))
44
+ t = xml.xpath(".//text()").reject { |node| node.text.empty? }
45
+ text_cache = build_text_cache(t, options[:prev], options[:foll])
46
+
47
+ # Identify which text nodes are within <esc> tags
48
+ esc_indices = Set.new
49
+ t.each_with_index do |node, i|
50
+ esc_indices.add(i) if node.ancestors("esc").any?
40
51
  end
41
- to_xml(xml).gsub(/<b>/, "").gsub("</b>", "")
42
- .gsub(/<\?[^>]+>/, "")
52
+
53
+ [t, text_cache, xml, options[:prev], options[:foll], esc_indices]
54
+ end
55
+
56
+ # Cache text content once per method call to avoid repeated .text calls
57
+ # Build text cache with optional prepended/appended context
58
+ # Also, reduce multiple spaces to single, to avoid misrecognition of space
59
+ def build_text_cache(text_nodes, prev_context = nil, foll_context = nil)
60
+ text_cache = text_nodes.map(&:text).map { |x| x.gsub(/\s+/, " ") }
61
+ text_cache.unshift(prev_context) if prev_context
62
+ text_cache.push(foll_context) if foll_context
63
+ text_cache
43
64
  end
44
65
 
45
66
  # previous, following context of current text node:
46
67
  # do not use just the immediately adjoining text tokens for context
47
68
  # deal with spaces and empty text by just concatenating entire context
69
+ # Optimized to avoid O(n²) complexity by using pre-cached text content
70
+ def l10n_context_cached(text_cache, idx)
71
+ prev = text_cache[0...idx].join
72
+ foll = text_cache[(idx + 1)...text_cache.size].join
73
+ [prev, foll]
74
+ end
75
+
76
+ # Fallback method for backward compatibility
48
77
  def l10n_context(nodes, idx)
49
78
  prev = nodes[0...idx].map(&:text).join
50
79
  foll = nodes[(idx + 1)...(nodes.size)].map(&:text).join
51
80
  [prev, foll]
52
81
  end
53
82
 
54
- def l10n_fr(text, locale)
55
- xml = Nokogiri::XML::DocumentFragment.parse(text)
56
- t = xml.xpath(".//text()")
83
+ def l10n_fr(text, locale, options)
84
+ t, text_cache, xml, prev, _foll, esc_indices = l10n_prep(text, options)
57
85
  t.each_with_index do |n, i|
58
- prev, foll = l10n_context(t, i)
86
+ next if esc_indices.include?(i) # Skip escaped nodes
87
+
88
+ prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
59
89
  text = cleanup_entities(n.text, is_xml: false)
60
- n.replace(l10n_fr1(text, prev, foll, locale))
90
+ n.replace(l10n_fr1(text, prev_ctx, foll_ctx, locale))
61
91
  end
62
92
  to_xml(xml)
63
93
  end
64
94
 
65
- ZH_CHAR = "(\\p{Han}|\\p{In CJK Symbols And Punctuation}|" \
66
- "\\p{In Halfwidth And Fullwidth Forms})".freeze
67
-
68
- # note: we can't differentiate comma from enumeration comma 、
69
- # def l10_zh1(text, _script)
70
- def l10_zh1(text, prev, foll, _script)
71
- # l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
72
- r = l10n_zh_punct(text, prev, foll)
73
- r = l10n_zh_remove_space(r, prev, foll)
74
- l10n_zh_dash(r, prev, foll)
75
- end
76
-
77
- ZH1_PUNCT = /(#{ZH_CHAR}|^) # CJK character, or start of string
78
- (\s*)$ # Latin spaces optional
79
- /xo.freeze
80
- ZH2_PUNCT = /^\s* # followed by ignorable Latin spaces
81
- [:,.()\[\];?!-]* # Latin punct which will also convert to CJK
82
- (#{ZH_CHAR}|$) # CJK character, or end of string
83
- /xo.freeze
84
-
85
- # CJK punct if (^|CJK).($|CJK)
86
- def l10n_zh_punct(text, prev, foll)
87
- ["::", ",,", "..", "))", "]]", ";;", "??", "!!", "((", "[["].each do |m|
88
- text = l10n_gsub(text, prev, foll, [m[0], m[1]],
89
- [ZH1_PUNCT, ZH2_PUNCT])
90
- end
91
- text
92
- end
93
-
94
- ZH1_DASH = /(#{ZH_CHAR}|^) # CJK character, or start of string
95
- (\d*) # optional digits
96
- $/xo.freeze
97
-
98
- ZH2_DASH = /^\d* # followed by optional digits
99
- (#{ZH_CHAR}|$) # CJK character, or end of string
100
- /xo.freeze
101
-
102
- def l10n_zh_dash(text, prev, foll)
103
- l10n_gsub(text, prev, foll, %w(– ~), [ZH1_DASH, ZH2_DASH])
104
- end
105
-
106
- def l10n_gsub(text, prev, foll, delim, regex)
95
+ # text: string we are scanning for instances of delim[0] to replace
96
+ # prev: string preceding text, as additional token of context
97
+ # foll: string following text, as additional token of context
98
+ # delim: delim[0] is the symbol we want to replace, delim[1] its replacement
99
+ # regexes: a list of regex pairs: the context before the found token,
100
+ # and the context after the found token, under which replacing it
101
+ # with delim[1] is permitted. If regexes is nil, always allow the replacement
102
+ def l10n_gsub(text, prev, foll, delim, regexes)
103
+ delim[1] or return text
107
104
  context = l10n_gsub_context(text, prev, foll, delim) or return text
108
105
  (1...(context.size - 1)).each do |i|
109
- l10_context_valid?(context, i, delim, regex) and
106
+ l10_context_valid?(context, i, delim, regexes) and
110
107
  context[i] = delim[1].gsub("\\0", context[i]) # Full-width equivalent
111
108
  end
112
109
  context[1...(context.size - 1)].join
113
110
  end
114
111
 
112
+ # split string being scanned, and its contextual tokens before and after,
113
+ # into array of tokens determining whether to replace instances of delim[0]
115
114
  def l10n_gsub_context(text, prev, foll, delim)
116
115
  d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
117
116
  context = text.split(/(#{d})/) # delim to replace
118
117
  context.size == 1 and return
119
- [prev, context, foll].flatten
118
+ [prev, context.reject(&:empty?), foll].flatten
120
119
  end
121
120
 
122
121
  def l10_context_valid?(context, idx, delim, regex)
123
- found_delim = if delim[0].is_a?(Regexp) # punct to convert
124
- delim[0].match?(context[idx])
125
- else
126
- context[idx] == delim[0]
127
- end
128
- found_delim &&
129
- regex[0].match?(context[0...idx].join) && # preceding context
130
- regex[1].match?(context[(idx + 1)..-1].join) # foll context
122
+ l10n_context_found_delimiter?(context[idx], delim) or return false
123
+ regex.nil? and return true
124
+ regex.detect do |r|
125
+ r[0].match?(context[0...idx].join) && # preceding context
126
+ r[1].match?(context[(idx + 1)..-1].join) # foll context
127
+ end
131
128
  end
132
129
 
133
- def l10n_zh_remove_space(text, prev, foll)
134
- text = l10n_gsub(text, prev, foll, [" ", ""],
135
- [/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o])
136
- l10n_gsub(text, prev, foll, [" ", ""],
137
- [/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o])
130
+ def l10n_context_found_delimiter?(token, delim)
131
+ if delim[0].is_a?(Regexp) # punct to convert
132
+ delim[0].match?(token)
133
+ else
134
+ token == delim[0]
135
+ end
138
136
  end
139
137
 
140
138
  def l10n_fr1(text, prev, foll, locale)
141
139
  text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
142
- [/\p{Alnum}$/, /^(\s|$)/])
143
- text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"], [/$/, /^(?!\p{Zs})./])
140
+ [[/\p{Alnum}$/, /^(\s|$)/]])
141
+ text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"],
142
+ [[/$/, /^(?!\p{Zs})./]])
144
143
  colonsp = locale == "CH" ? "\u202f" : "\u00a0"
145
144
  l10n_gsub(text, prev, foll, [":", "#{colonsp}\\0"],
146
- [/\p{Alnum}$/, /^(\s|$)/])
147
- end
148
-
149
- def self.cjk_extend(text)
150
- cjk_extend(text)
151
- end
152
-
153
- def cjk_extend(title)
154
- @c.decode(title).chars.map.with_index do |n, i|
155
- if i.zero? || !interleave_space_cjk?(title[i - 1] + title[i])
156
- n
157
- else "\u3000#{n}"
158
- end
159
- end.join
160
- end
161
-
162
- def interleave_space_cjk?(text)
163
- text.size == 2 or return
164
- ["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
165
- "\u22ef\u22ef"].include?(text) ||
166
- /\d\d|\p{Latin}\p{Latin}|[[:space:]]/.match?(text) ||
167
- /^[\u2018\u201c(\u3014\[{\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/.match?(text) ||
168
- /[\u2019\u201d)\u3015\]}\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/.match?(text) ||
169
- /[\u3002.\u3001,\u30fb:;\u2010\u301c\u30a0\u2013!?\u203c\u2047\u2048\u2049]/.match?(text) and return false
170
- true
145
+ [[/\p{Alnum}$/, /^(\s|$)/]])
171
146
  end
172
147
 
173
148
  def to_xml(node)
@@ -0,0 +1,165 @@
1
+ module IsoDoc
2
+ class I18n
3
+ # Use comprehensive CJK definition from metanorma-utils
4
+ # This includes Han, Katakana, Hiragana, Hangul, Bopomofo
5
+ # and all CJK extensions
6
+ ZH_CHAR = "(#{Metanorma::Utils::CJK})".freeze
7
+ LATIN_PUNCT = /[:,.()\[\];?!-]/.freeze
8
+ # CJK character which is not punctuation
9
+ ZH_NON_PUNCT = "(#{
10
+ [
11
+ Metanorma::Utils.singleton_class::HAN,
12
+ Metanorma::Utils.singleton_class::HAN_IDC,
13
+ Metanorma::Utils.singleton_class::KANBUN,
14
+ Metanorma::Utils.singleton_class::CJK_COMPAT_IDEOGRAPHS,
15
+ Metanorma::Utils.singleton_class::HAN_COMPAT_IDEOGRAPHS,
16
+ Metanorma::Utils.singleton_class::HANGUL,
17
+ Metanorma::Utils.singleton_class::HIRAGANA,
18
+ Metanorma::Utils.singleton_class::KATAKANA,
19
+ Metanorma::Utils.singleton_class::BOPOMOFO,
20
+ ].join("|")})".freeze
21
+
22
+ # Condition for converting punctuation to double width,
23
+ # in case of options[:proportional_mixed_cjk]
24
+ # 1. (Strict condition) CJK before, CJK after, modulo ignorable characters:
25
+ # 1a. CJK character, or start of string. Latin spaces optional.
26
+ ZH1_PUNCT = /(#{ZH_CHAR}|^)(\s*)$/xo.freeze
27
+ # 1b. Latin spaces optional, Latin punct which will also convert to CJK,
28
+ # CJK character, or end of string.
29
+ ZH2_PUNCT = /^\s*#{LATIN_PUNCT}*(#{ZH_CHAR}|$)/xo.freeze
30
+ # 2. CJK before, space after:
31
+ # 2a. CJK char, followed by optional Latin punct which will also convert to CJK
32
+ ZH1_NO_SPACE = /#{ZH_CHAR}#{LATIN_PUNCT}*$/xo.freeze
33
+ # 2b. optional Latin punct which will also convert to CJK, then space
34
+ OPT_PUNCT_SPACE = /^($|#{LATIN_PUNCT}*\s)/xo.freeze
35
+
36
+ # Chinese numerals (common + formal/financial forms)
37
+ # Explicit characters needed because Chinese numeral ideographs
38
+ # are not tagged with Unicode Number property
39
+ # Using alternation instead of character class to properly include \p{N}
40
+ ZH_NUMERALS = "(?:[零一二三四五六七八九十百千万亿壹贰叁肆伍陆柒捌玖拾佰仟萬億兆]|\\p{N})".freeze
41
+
42
+ # Contexts for converting en-dashes to full-width
43
+ # Before: CJK or start of string, no digits
44
+ ZH1_DASH = /(#{ZH_CHAR}|^)(?<!=#{ZH_NUMERALS})$/xo.freeze
45
+ # After: not a numeral, then CJK or end of string
46
+ ZH2_DASH = /^(?!#{ZH_NUMERALS})(#{ZH_CHAR}|$)/xo.freeze
47
+ # Before: preceding context ends in a numeral (Unicode number or Chinese numeral)
48
+ ZH1_NUM_DASH = /#{ZH_NUMERALS}$/xo.freeze
49
+ # After: following context starts with a numeral (Unicode number or Chinese numeral)
50
+ ZH2_NUM_DASH = /^#{ZH_NUMERALS}/xo.freeze
51
+
52
+ ZH_PUNCT_CONTEXTS =
53
+ [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE],
54
+ [/(\s|^)$/, /^#{ZH_CHAR}/o]].freeze
55
+
56
+ # map of YAML punct keys to auto-text Latin equivalents
57
+ ZH_PUNCT_AUTOTEXT = {
58
+ colon: ":",
59
+ comma: ",",
60
+ "enum-comma": ",",
61
+ semicolon: ";",
62
+ period: ".",
63
+ "close-paren": ")",
64
+ "open-paren": "(",
65
+ "close-bracket": "]",
66
+ "open-bracket": "[",
67
+ "question-mark": "?",
68
+ "exclamation-mark": "!",
69
+ "em-dash": "—",
70
+ "open-quote": "“",
71
+ "close-quote": "”",
72
+ "open-nested-quote": "’",
73
+ "close-nested-quote": "’",
74
+ ellipse: "…",
75
+ }.freeze
76
+
77
+ # Pre-defined punctuation mappings for efficiency
78
+ def init_zh_punct_map
79
+ ZH_PUNCT_AUTOTEXT.each_with_object([]) do |(k, v), m|
80
+ @labels.dig("punct", k.to_s) or next
81
+ m << [v, @labels["punct"][k.to_s], ZH_PUNCT_CONTEXTS]
82
+ end
83
+ end
84
+
85
+ def l10n_zh(text, script, options)
86
+ script ||= "Hans"
87
+ t, text_cache, xml, prev, _foll, esc_indices = l10n_prep(text, options)
88
+ t.each_with_index do |n, i|
89
+ next if esc_indices.include?(i) # Skip escaped nodes
90
+
91
+ # Adjust index if prev context prepended
92
+ prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
93
+ text = cleanup_entities(n.text, is_xml: false)
94
+ n.replace(l10_zh1(text, prev_ctx, foll_ctx, script, options))
95
+ end
96
+ to_xml(xml) #.gsub(/<\/?em>|<\/?strong>|<\/?i>|<\/?b>/, "")
97
+ end
98
+
99
+ # note: we can't differentiate comma from enumeration comma 、
100
+ # def l10_zh1(text, _script)
101
+ def l10_zh1(text, prev, foll, _script, options)
102
+ r = l10n_zh_punct(text, prev, foll, options)
103
+ r = l10n_zh_remove_space(r, prev, foll)
104
+ l10n_zh_dash(r, prev, foll)
105
+ end
106
+
107
+ def l10n_zh_punct(text, prev, foll, options)
108
+ # Use pre-defined mapping for better performance
109
+ @zh_punct_map ||= init_zh_punct_map
110
+ @zh_punct_map.each do |mapping|
111
+ punct_from, punct_to, regexes = mapping
112
+ options[:proportional_mixed_cjk] or regexes = nil
113
+ text = l10n_gsub(text, prev, foll, [punct_from, punct_to],
114
+ regexes)
115
+ end
116
+ text
117
+ end
118
+
119
+ def l10n_zh_dash(text, prev, foll)
120
+ text = l10n_gsub(text, prev, foll, ["–", @labels.dig("punct", "en-dash")],
121
+ [[ZH1_DASH, ZH2_DASH]])
122
+ l10n_gsub(text, prev, foll, ["–", @labels.dig("punct", "number-en-dash")],
123
+ [[ZH1_NUM_DASH, ZH2_NUM_DASH]])
124
+ end
125
+
126
+ def l10n_zh_remove_space(text, prev, foll)
127
+ text = l10n_gsub(text, prev, foll, [/\s+/, ""],
128
+ [[/(#{ZH_CHAR})$/o, /^#{ZH_CHAR}/o]])
129
+ if sep = @labels.dig("punct", "cjk-latin-separator")
130
+ # Skip over punctuation to find Latin letters/numbers
131
+ text = l10n_gsub(text, prev, foll, [/\s+/, sep],
132
+ [[/#{ZH_CHAR}$/o, /^\p{P}*[\p{Latin}\p{N}]/o]])
133
+ l10n_gsub(text, prev, foll, [/\s+/, sep],
134
+ [[/[\p{Latin}\p{N}]\p{P}*$/o, /^#{ZH_NON_PUNCT}/o]])
135
+ else
136
+ l10n_gsub(text, prev, foll, [/\s+/, ""],
137
+ [[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o]])
138
+ end
139
+ end
140
+
141
+ def self.cjk_extend(text)
142
+ cjk_extend(text)
143
+ end
144
+
145
+ def cjk_extend(title)
146
+ @c.decode(title).chars.map.with_index do |n, i|
147
+ if i.zero? || !interleave_space_cjk?(title[i - 1] + title[i])
148
+ n
149
+ else "\u3000#{n}"
150
+ end
151
+ end.join
152
+ end
153
+
154
+ def interleave_space_cjk?(text)
155
+ text.size == 2 or return
156
+ ["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
157
+ "\u22ef\u22ef"].include?(text) ||
158
+ /\d\d|\p{Latin}\p{Latin}|[[:space:]]/.match?(text) ||
159
+ /^[\u2018\u201c(\u3014\[{\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/.match?(text) ||
160
+ /[\u2019\u201d)\u3015\]}\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/.match?(text) ||
161
+ /[\u3002.\u3001,\u30fb:;\u2010\u301c\u30a0\u2013!?\u203c\u2047\u2048\u2049]/.match?(text) and return false
162
+ true
163
+ end
164
+ end
165
+ end
@@ -2,3 +2,24 @@ text: text
2
2
  at: at
3
3
  binary_and: "%1 <conn>and</conn> %2"
4
4
  multiple_and: "%1<conn>, and</conn> %2"
5
+ punct:
6
+ colon: ":"
7
+ comma: ","
8
+ enum_comma: ","
9
+ semicolon: ";"
10
+ period: "."
11
+ close_paren: ")"
12
+ open_paren: "("
13
+ close_bracket: "]"
14
+ open_bracket: "["
15
+ question_mark: "?"
16
+ exclamation_mark: "!"
17
+ em_dash: "—"
18
+ en_dash: "–"
19
+ number_en_dash: "–"
20
+ open_quote: "“"
21
+ close_quote: "”"
22
+ open_nested_quote: "’"
23
+ close_nested_quote: "’"
24
+ ellipse: …
25
+
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isodoc-i18n
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.2
4
+ version: 1.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-02-13 00:00:00.000000000 Z
11
+ date: 2025-10-13 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: base64
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: htmlentities
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -67,13 +81,13 @@ dependencies:
67
81
  - !ruby/object:Gem::Version
68
82
  version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
- name: base64
84
+ name: canon
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
87
  - - ">="
74
88
  - !ruby/object:Gem::Version
75
89
  version: '0'
76
- type: :runtime
90
+ type: :development
77
91
  prerelease: false
78
92
  version_requirements: !ruby/object:Gem::Requirement
79
93
  requirements:
@@ -170,58 +184,58 @@ dependencies:
170
184
  requirements:
171
185
  - - "~>"
172
186
  - !ruby/object:Gem::Version
173
- version: 1.5.2
187
+ version: '1'
174
188
  type: :development
175
189
  prerelease: false
176
190
  version_requirements: !ruby/object:Gem::Requirement
177
191
  requirements:
178
192
  - - "~>"
179
193
  - !ruby/object:Gem::Version
180
- version: 1.5.2
194
+ version: '1'
181
195
  - !ruby/object:Gem::Dependency
182
- name: simplecov
196
+ name: rubocop-performance
183
197
  requirement: !ruby/object:Gem::Requirement
184
198
  requirements:
185
- - - "~>"
199
+ - - ">="
186
200
  - !ruby/object:Gem::Version
187
- version: '0.15'
201
+ version: '0'
188
202
  type: :development
189
203
  prerelease: false
190
204
  version_requirements: !ruby/object:Gem::Requirement
191
205
  requirements:
192
- - - "~>"
206
+ - - ">="
193
207
  - !ruby/object:Gem::Version
194
- version: '0.15'
208
+ version: '0'
195
209
  - !ruby/object:Gem::Dependency
196
- name: timecop
210
+ name: simplecov
197
211
  requirement: !ruby/object:Gem::Requirement
198
212
  requirements:
199
213
  - - "~>"
200
214
  - !ruby/object:Gem::Version
201
- version: '0.9'
215
+ version: '0.15'
202
216
  type: :development
203
217
  prerelease: false
204
218
  version_requirements: !ruby/object:Gem::Requirement
205
219
  requirements:
206
220
  - - "~>"
207
221
  - !ruby/object:Gem::Version
208
- version: '0.9'
222
+ version: '0.15'
209
223
  - !ruby/object:Gem::Dependency
210
- name: webmock
224
+ name: timecop
211
225
  requirement: !ruby/object:Gem::Requirement
212
226
  requirements:
213
- - - ">="
227
+ - - "~>"
214
228
  - !ruby/object:Gem::Version
215
- version: '0'
229
+ version: '0.9'
216
230
  type: :development
217
231
  prerelease: false
218
232
  version_requirements: !ruby/object:Gem::Requirement
219
233
  requirements:
220
- - - ">="
234
+ - - "~>"
221
235
  - !ruby/object:Gem::Version
222
- version: '0'
236
+ version: '0.9'
223
237
  - !ruby/object:Gem::Dependency
224
- name: xml-c14n
238
+ name: webmock
225
239
  requirement: !ruby/object:Gem::Requirement
226
240
  requirements:
227
241
  - - ">="
@@ -257,6 +271,7 @@ files:
257
271
  - lib/isodoc/i18n.rb
258
272
  - lib/isodoc/i18n/version.rb
259
273
  - lib/isodoc/l10n.rb
274
+ - lib/isodoc/l10n_cjk.rb
260
275
  - lib/isodoc/liquid/liquid.rb
261
276
  homepage: https://github.com/metanorma/isodoc-i18n
262
277
  licenses:
@@ -277,7 +292,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
277
292
  - !ruby/object:Gem::Version
278
293
  version: '0'
279
294
  requirements: []
280
- rubygems_version: 3.3.27
295
+ rubygems_version: 3.5.22
281
296
  signing_key:
282
297
  specification_version: 4
283
298
  summary: isodoc-i18n