isodoc-i18n 1.3.2 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a242d0bc7609246ea502ffec35e515457e739c478b25478a01c7ff641dc4523
4
- data.tar.gz: 0f89c271766d244f037a0b6c6f40431888c2b6140814ac2b6e4d2906e142b1d5
3
+ metadata.gz: 134575d665c75368d3640ef87c9c63a3f6fdc2bf668e8d8df233a3cb139fce6b
4
+ data.tar.gz: ed1fc0e49c62f27b3199ba8357b2f1454309485efffbcb93ed30d51f01c36fe7
5
5
  SHA512:
6
- metadata.gz: cb3f3f3a28b1b8fddd35ff74290ccf62f905ef54d51d62efd4fd8000d2d38a389eb4c78ec02aefd83ab0d1d98aaec462a0138c51257f8548c07f4a4726854c38
7
- data.tar.gz: f6dcb2d4c05c630527d02888bf614a0e09396166e2f27437dedc0cc0a619149c6606fe2a8a78ce3c98948d4279ef3bfa224d9c1b18fbb1d38e74d92d280b74db
6
+ metadata.gz: f6e5ff44068372afc9ad75e1d6bff6483114a3424071c136a86f17745513ff5f31794c4555c9484eff556c50f638cb13c3ba712c4ae3e3b4d762685d1e888c2d
7
+ data.tar.gz: 9c5131cb3dbb8800304530629dfe5b47a3d7b15695ff0762011998c5aa3d3273b7730dde057dd2f2f15e3f997c9466704be80d7ef9cd47a5055e93afc0a5ba32
data/README.adoc CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  image:https://img.shields.io/gem/v/isodoc-i18n.svg["Gem Version", link="https://rubygems.org/gems/isodoc-i18n"]
4
4
  image:https://github.com/metanorma/isodoc-i18n/workflows/rake/badge.svg["Build Status", link="https://github.com/metanorma/isodoc-i18n/actions?query=workflow%3Arake"]
5
- image:https://codeclimate.com/github/metanorma/isodoc-i18n/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/isodoc-i18n"]
5
+ // image:https://codeclimate.com/github/metanorma/isodoc-i18n/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/isodoc-i18n"]
6
6
  image:https://img.shields.io/github/issues-pr-raw/metanorma/isodoc-i18n.svg["Pull Requests", link="https://github.com/metanorma/isodoc-i18n/pulls"]
7
7
  image:https://img.shields.io/github/commits-since/metanorma/isodoc-i18n/latest.svg["Commits since latest",link="https://github.com/metanorma/isodoc-i18n/releases"]
8
8
 
data/isodoc-i18n.gemspec CHANGED
@@ -34,10 +34,11 @@ Gem::Specification.new do |spec|
34
34
  spec.add_development_dependency "guard-rspec", "~> 4.7"
35
35
  spec.add_development_dependency "rake", "~> 13.0"
36
36
  spec.add_development_dependency "rspec", "~> 3.6"
37
- spec.add_development_dependency "rubocop", "~> 1.5.2"
37
+ spec.add_development_dependency "rubocop", "~> 1"
38
+ spec.add_development_dependency "rubocop-performance"
38
39
  spec.add_development_dependency "simplecov", "~> 0.15"
39
40
  spec.add_development_dependency "timecop", "~> 0.9"
40
41
  spec.add_development_dependency "webmock"
41
- spec.add_development_dependency "xml-c14n"
42
+ spec.add_development_dependency "canon"
42
43
  # spec.metadata["rubygems_mfa_required"] = "true"
43
44
  end
@@ -1,5 +1,5 @@
1
1
  module IsoDoc
2
2
  class I18n
3
- VERSION = "1.3.2".freeze
3
+ VERSION = "1.4.0".freeze
4
4
  end
5
5
  end
data/lib/isodoc/i18n.rb CHANGED
@@ -24,6 +24,8 @@ module IsoDoc
24
24
  self
25
25
  end
26
26
 
27
+ CJK_SCRIPTS = %w(Hans Hant Jpan Kore).freeze
28
+
27
29
  def liquid_init
28
30
  ::IsoDoc::I18n::Liquid.set(self)
29
31
  ::Liquid::Environment.default.register_filter(::IsoDoc::I18n::Liquid)
@@ -66,7 +68,7 @@ module IsoDoc
66
68
  end
67
69
 
68
70
  def enum_comma
69
- %w(Hans Hant).include?(@script) and return "<enum-comma>、</enum-comma>"
71
+ CJK_SCRIPTS.include?(@script) and return "<enum-comma>、</enum-comma>"
70
72
  "<enum-comma>,</enum-comma> "
71
73
  end
72
74
 
data/lib/isodoc/l10n.rb CHANGED
@@ -1,13 +1,55 @@
1
+ require "metanorma-utils"
2
+
1
3
  module IsoDoc
2
4
  class I18n
3
- def self.l10n(text, lang = @lang, script = @script, locale = @locale)
4
- l10n(text, lang, script, locale)
5
- end
6
-
7
- # function localising spaces and punctuation.
8
- def l10n(text, lang = @lang, script = @script, locale = @locale)
9
- %w(zh ja ko).include?(lang) and text = l10n_zh(text, script)
10
- lang == "fr" && text = l10n_fr(text, locale || "FR")
5
+ # Use comprehensive CJK definition from metanorma-utils
6
+ # This includes Han, Katakana, Hiragana, Hangul, Bopomofo and all CJK extensions
7
+ ZH_CHAR = "(#{Metanorma::Utils::CJK})".freeze
8
+ LATIN_PUNCT = /[:,.()\[\];?!-]/.freeze
9
+
10
+ # Condition for converting punctuation to double width:
11
+ # 1. (Strict condition) CJK before, CJK after, modulo ignorable characters:
12
+ # 1a. CJK character, or start of string. Latin spaces optional.
13
+ ZH1_PUNCT = /(#{ZH_CHAR}|^)(\s*)$/xo.freeze
14
+ # 1b. Latin spaces optional, Latin punct which will also convert to CJK,
15
+ # CJK character, or end of string.
16
+ ZH2_PUNCT = /^\s*#{LATIN_PUNCT}*(#{ZH_CHAR}|$)/xo.freeze
17
+ # 2. CJK before, space after:
18
+ # 2a. CJK char, followed by optional Latin punct which will also convert to CJK
19
+ ZH1_NO_SPACE = /#{ZH_CHAR}#{LATIN_PUNCT}*$/xo.freeze
20
+ # 2b. optional Latin punct which will also convert to CJK, then space
21
+ OPT_PUNCT_SPACE = /^($|#{LATIN_PUNCT}*\s)/xo.freeze
22
+
23
+ # Contexts for converting en-dashes to full-width
24
+ # Before: CJK or start of string, optional digits
25
+ ZH1_DASH = /(#{ZH_CHAR}|^)(\d*)$/xo.freeze
26
+ # After: optional digits, CJK or end of string
27
+ ZH2_DASH = /^\d*(#{ZH_CHAR}|$)/xo.freeze
28
+
29
+ # Pre-defined punctuation mappings for efficiency
30
+ ZH_PUNCT_MAP = [
31
+ ["::", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
32
+ [",,", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
33
+ [".。", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
34
+ ["))", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
35
+ ["]]", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
36
+ [";;", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
37
+ ["??", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
38
+ ["!!", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
39
+ ["((", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
40
+ ["[[", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]]
41
+ ].freeze
42
+
43
+ def self.l10n(text, lang = @lang, script = @script, options = {})
44
+ l10n(text, lang, script, options)
45
+ end
46
+
47
+ # function localising spaces and punctuation
48
+ # options[:prev] and options[:foll] are optional context strings
49
+ def l10n(text, lang = @lang, script = @script, options = {})
50
+ locale = options[:locale] || @locale
51
+ %w(zh ja ko).include?(lang) and text = l10n_zh(text, script, options[:prev], options[:foll])
52
+ lang == "fr" && text = l10n_fr(text, locale || "FR", options[:prev], options[:foll])
11
53
  bidiwrap(text, lang, script)
12
54
  end
13
55
 
@@ -30,88 +72,100 @@ module IsoDoc
30
72
  end
31
73
 
32
74
  # CJK
33
- def l10n_zh(text, script = "Hans")
34
- xml = Nokogiri::XML::DocumentFragment.parse(text)
35
- t = xml.xpath(".//text()")
75
+ def l10n_zh(text, script, prev, foll)
76
+ script ||= "Hans"
77
+ t, text_cache, xml = l10n_prep(text, prev, foll)
36
78
  t.each_with_index do |n, i|
37
- prev, foll = l10n_context(t, i)
79
+ # Adjust index if prev context prepended
80
+ prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
38
81
  text = cleanup_entities(n.text, is_xml: false)
39
- n.replace(l10_zh1(text, prev, foll, script))
82
+ n.replace(l10_zh1(text, prev_ctx, foll_ctx, script))
40
83
  end
41
- to_xml(xml).gsub(/<b>/, "").gsub("</b>", "")
42
- .gsub(/<\?[^>]+>/, "")
84
+ to_xml(xml).gsub(/<b>|<\/b>|<\?[^>]+>/, "")
85
+ end
86
+
87
+ def l10n_prep(text, prev, foll)
88
+ xml = Nokogiri::XML::DocumentFragment.parse(text)
89
+ t = xml.xpath(".//text()")
90
+ text_cache = build_text_cache(t, prev, foll)
91
+ [t, text_cache, xml]
92
+ end
93
+
94
+ # Cache text content once per method call to avoid repeated .text calls
95
+ # Build text cache with optional prepended/appended context
96
+ def build_text_cache(text_nodes, prev_context = nil, foll_context = nil)
97
+ text_cache = text_nodes.map(&:text)
98
+ text_cache.unshift(prev_context) if prev_context
99
+ text_cache.push(foll_context) if foll_context
100
+ text_cache
43
101
  end
44
102
 
45
103
  # previous, following context of current text node:
46
104
  # do not use just the immediately adjoining text tokens for context
47
105
  # deal with spaces and empty text by just concatenating entire context
106
+ # Uses pre-cached text content so node text is read once, not re-extracted per index
107
+ def l10n_context_cached(text_cache, idx)
108
+ prev = text_cache[0...idx].join
109
+ foll = text_cache[(idx + 1)...text_cache.size].join
110
+ [prev, foll]
111
+ end
112
+
113
+ # Fallback method for backward compatibility
48
114
  def l10n_context(nodes, idx)
49
115
  prev = nodes[0...idx].map(&:text).join
50
116
  foll = nodes[(idx + 1)...(nodes.size)].map(&:text).join
51
117
  [prev, foll]
52
118
  end
53
119
 
54
- def l10n_fr(text, locale)
55
- xml = Nokogiri::XML::DocumentFragment.parse(text)
56
- t = xml.xpath(".//text()")
120
+ def l10n_fr(text, locale, prev, foll)
121
+ t, text_cache, xml = l10n_prep(text, prev, foll)
57
122
  t.each_with_index do |n, i|
58
- prev, foll = l10n_context(t, i)
123
+ prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
59
124
  text = cleanup_entities(n.text, is_xml: false)
60
- n.replace(l10n_fr1(text, prev, foll, locale))
125
+ n.replace(l10n_fr1(text, prev_ctx, foll_ctx, locale))
61
126
  end
62
127
  to_xml(xml)
63
128
  end
64
129
 
65
- ZH_CHAR = "(\\p{Han}|\\p{In CJK Symbols And Punctuation}|" \
66
- "\\p{In Halfwidth And Fullwidth Forms})".freeze
67
-
68
130
  # note: we can't differentiate comma from enumeration comma 、
69
131
  # def l10_zh1(text, _script)
70
132
  def l10_zh1(text, prev, foll, _script)
71
- # l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
72
133
  r = l10n_zh_punct(text, prev, foll)
73
134
  r = l10n_zh_remove_space(r, prev, foll)
74
135
  l10n_zh_dash(r, prev, foll)
75
136
  end
76
137
 
77
- ZH1_PUNCT = /(#{ZH_CHAR}|^) # CJK character, or start of string
78
- (\s*)$ # Latin spaces optional
79
- /xo.freeze
80
- ZH2_PUNCT = /^\s* # followed by ignorable Latin spaces
81
- [:,.()\[\];?!-]* # Latin punct which will also convert to CJK
82
- (#{ZH_CHAR}|$) # CJK character, or end of string
83
- /xo.freeze
84
-
85
- # CJK punct if (^|CJK).($|CJK)
86
138
  def l10n_zh_punct(text, prev, foll)
87
- ["::", ",,", "..", "))", "]]", ";;", "??", "!!", "((", "[["].each do |m|
88
- text = l10n_gsub(text, prev, foll, [m[0], m[1]],
89
- [ZH1_PUNCT, ZH2_PUNCT])
139
+ # Use pre-defined mapping for better performance
140
+ ZH_PUNCT_MAP.each do |mapping|
141
+ punct_pair, regexes = mapping
142
+ text = l10n_gsub(text, prev, foll, [punct_pair[0], punct_pair[1]], regexes)
90
143
  end
91
144
  text
92
145
  end
93
146
 
94
- ZH1_DASH = /(#{ZH_CHAR}|^) # CJK character, or start of string
95
- (\d*) # optional digits
96
- $/xo.freeze
97
-
98
- ZH2_DASH = /^\d* # followed by optional digits
99
- (#{ZH_CHAR}|$) # CJK character, or end of string
100
- /xo.freeze
101
-
102
147
  def l10n_zh_dash(text, prev, foll)
103
- l10n_gsub(text, prev, foll, %w(– ~), [ZH1_DASH, ZH2_DASH])
148
+ l10n_gsub(text, prev, foll, %w(– ~), [[ZH1_DASH, ZH2_DASH]])
104
149
  end
105
150
 
106
- def l10n_gsub(text, prev, foll, delim, regex)
151
+ # text: string we are scanning for instances of delim[0] to replace
152
+ # prev: string preceding text, as additional token of context
153
+ # foll: string following text, as additional token of context
154
+ # delim: delim[0] is the symbol we want to replace, delim[1] its replacement
155
+ # regexes: a list of regex pairs: the context before the found token,
156
+ # and the context after the found token, under which replacing it
157
+ # with delim[1] is permitted
158
+ def l10n_gsub(text, prev, foll, delim, regexes)
107
159
  context = l10n_gsub_context(text, prev, foll, delim) or return text
108
160
  (1...(context.size - 1)).each do |i|
109
- l10_context_valid?(context, i, delim, regex) and
161
+ l10_context_valid?(context, i, delim, regexes) and
110
162
  context[i] = delim[1].gsub("\\0", context[i]) # Full-width equivalent
111
163
  end
112
164
  context[1...(context.size - 1)].join
113
165
  end
114
166
 
167
+ # split string being scanned, and its contextual tokens before and after,
168
+ # into array of tokens determining whether to replace instances of delim[0]
115
169
  def l10n_gsub_context(text, prev, foll, delim)
116
170
  d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
117
171
  context = text.split(/(#{d})/) # delim to replace
@@ -120,30 +174,36 @@ module IsoDoc
120
174
  end
121
175
 
122
176
  def l10_context_valid?(context, idx, delim, regex)
123
- found_delim = if delim[0].is_a?(Regexp) # punct to convert
124
- delim[0].match?(context[idx])
125
- else
126
- context[idx] == delim[0]
127
- end
128
- found_delim &&
129
- regex[0].match?(context[0...idx].join) && # preceding context
130
- regex[1].match?(context[(idx + 1)..-1].join) # foll context
177
+ l10n_context_found_delimiter?(context[idx], delim) or return false
178
+ regex.detect do |r|
179
+ r[0].match?(context[0...idx].join) && # preceding context
180
+ r[1].match?(context[(idx + 1)..-1].join) # foll context
181
+ end
182
+ end
183
+
184
+ def l10n_context_found_delimiter?(token, delim)
185
+ if delim[0].is_a?(Regexp) # punct to convert
186
+ delim[0].match?(token)
187
+ else
188
+ token == delim[0]
189
+ end
131
190
  end
132
191
 
133
192
  def l10n_zh_remove_space(text, prev, foll)
134
193
  text = l10n_gsub(text, prev, foll, [" ", ""],
135
- [/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o])
194
+ [[/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o]])
136
195
  l10n_gsub(text, prev, foll, [" ", ""],
137
- [/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o])
196
+ [[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o]])
138
197
  end
139
198
 
140
199
  def l10n_fr1(text, prev, foll, locale)
141
200
  text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
142
- [/\p{Alnum}$/, /^(\s|$)/])
143
- text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"], [/$/, /^(?!\p{Zs})./])
201
+ [[/\p{Alnum}$/, /^(\s|$)/]])
202
+ text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"],
203
+ [[/$/, /^(?!\p{Zs})./]])
144
204
  colonsp = locale == "CH" ? "\u202f" : "\u00a0"
145
205
  l10n_gsub(text, prev, foll, [":", "#{colonsp}\\0"],
146
- [/\p{Alnum}$/, /^(\s|$)/])
206
+ [[/\p{Alnum}$/, /^(\s|$)/]])
147
207
  end
148
208
 
149
209
  def self.cjk_extend(text)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isodoc-i18n
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.2
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-02-13 00:00:00.000000000 Z
11
+ date: 2025-09-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -170,14 +170,28 @@ dependencies:
170
170
  requirements:
171
171
  - - "~>"
172
172
  - !ruby/object:Gem::Version
173
- version: 1.5.2
173
+ version: '1'
174
174
  type: :development
175
175
  prerelease: false
176
176
  version_requirements: !ruby/object:Gem::Requirement
177
177
  requirements:
178
178
  - - "~>"
179
179
  - !ruby/object:Gem::Version
180
- version: 1.5.2
180
+ version: '1'
181
+ - !ruby/object:Gem::Dependency
182
+ name: rubocop-performance
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - ">="
186
+ - !ruby/object:Gem::Version
187
+ version: '0'
188
+ type: :development
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - ">="
193
+ - !ruby/object:Gem::Version
194
+ version: '0'
181
195
  - !ruby/object:Gem::Dependency
182
196
  name: simplecov
183
197
  requirement: !ruby/object:Gem::Requirement
@@ -221,7 +235,7 @@ dependencies:
221
235
  - !ruby/object:Gem::Version
222
236
  version: '0'
223
237
  - !ruby/object:Gem::Dependency
224
- name: xml-c14n
238
+ name: canon
225
239
  requirement: !ruby/object:Gem::Requirement
226
240
  requirements:
227
241
  - - ">="
@@ -277,7 +291,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
277
291
  - !ruby/object:Gem::Version
278
292
  version: '0'
279
293
  requirements: []
280
- rubygems_version: 3.3.27
294
+ rubygems_version: 3.5.22
281
295
  signing_key:
282
296
  specification_version: 4
283
297
  summary: isodoc-i18n