isodoc-i18n 1.3.2 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +1 -1
- data/isodoc-i18n.gemspec +3 -2
- data/lib/isodoc/i18n/version.rb +1 -1
- data/lib/isodoc/i18n.rb +3 -1
- data/lib/isodoc/l10n.rb +120 -60
- metadata +20 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 134575d665c75368d3640ef87c9c63a3f6fdc2bf668e8d8df233a3cb139fce6b
|
4
|
+
data.tar.gz: ed1fc0e49c62f27b3199ba8357b2f1454309485efffbcb93ed30d51f01c36fe7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f6e5ff44068372afc9ad75e1d6bff6483114a3424071c136a86f17745513ff5f31794c4555c9484eff556c50f638cb13c3ba712c4ae3e3b4d762685d1e888c2d
|
7
|
+
data.tar.gz: 9c5131cb3dbb8800304530629dfe5b47a3d7b15695ff0762011998c5aa3d3273b7730dde057dd2f2f15e3f997c9466704be80d7ef9cd47a5055e93afc0a5ba32
|
data/README.adoc
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
image:https://img.shields.io/gem/v/isodoc-i18n.svg["Gem Version", link="https://rubygems.org/gems/isodoc-i18n"]
|
4
4
|
image:https://github.com/metanorma/isodoc-i18n/workflows/rake/badge.svg["Build Status", link="https://github.com/metanorma/isodoc-i18n/actions?query=workflow%3Arake"]
|
5
|
-
image:https://codeclimate.com/github/metanorma/isodoc-i18n/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/isodoc-i18n"]
|
5
|
+
// image:https://codeclimate.com/github/metanorma/isodoc-i18n/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/isodoc-i18n"]
|
6
6
|
image:https://img.shields.io/github/issues-pr-raw/metanorma/isodoc-i18n.svg["Pull Requests", link="https://github.com/metanorma/isodoc-i18n/pulls"]
|
7
7
|
image:https://img.shields.io/github/commits-since/metanorma/isodoc-i18n/latest.svg["Commits since latest",link="https://github.com/metanorma/isodoc-i18n/releases"]
|
8
8
|
|
data/isodoc-i18n.gemspec
CHANGED
@@ -34,10 +34,11 @@ Gem::Specification.new do |spec|
|
|
34
34
|
spec.add_development_dependency "guard-rspec", "~> 4.7"
|
35
35
|
spec.add_development_dependency "rake", "~> 13.0"
|
36
36
|
spec.add_development_dependency "rspec", "~> 3.6"
|
37
|
-
spec.add_development_dependency "rubocop", "~> 1
|
37
|
+
spec.add_development_dependency "rubocop", "~> 1"
|
38
|
+
spec.add_development_dependency "rubocop-performance"
|
38
39
|
spec.add_development_dependency "simplecov", "~> 0.15"
|
39
40
|
spec.add_development_dependency "timecop", "~> 0.9"
|
40
41
|
spec.add_development_dependency "webmock"
|
41
|
-
spec.add_development_dependency "
|
42
|
+
spec.add_development_dependency "canon"
|
42
43
|
# spec.metadata["rubygems_mfa_required"] = "true"
|
43
44
|
end
|
data/lib/isodoc/i18n/version.rb
CHANGED
data/lib/isodoc/i18n.rb
CHANGED
@@ -24,6 +24,8 @@ module IsoDoc
|
|
24
24
|
self
|
25
25
|
end
|
26
26
|
|
27
|
+
CJK_SCRIPTS = %w(Hans Hant Jpan Kore).freeze
|
28
|
+
|
27
29
|
def liquid_init
|
28
30
|
::IsoDoc::I18n::Liquid.set(self)
|
29
31
|
::Liquid::Environment.default.register_filter(::IsoDoc::I18n::Liquid)
|
@@ -66,7 +68,7 @@ module IsoDoc
|
|
66
68
|
end
|
67
69
|
|
68
70
|
def enum_comma
|
69
|
-
|
71
|
+
CJK_SCRIPTS.include?(@script) and return "<enum-comma>、</enum-comma>"
|
70
72
|
"<enum-comma>,</enum-comma> "
|
71
73
|
end
|
72
74
|
|
data/lib/isodoc/l10n.rb
CHANGED
@@ -1,13 +1,55 @@
|
|
1
|
+
require "metanorma-utils"
|
2
|
+
|
1
3
|
module IsoDoc
|
2
4
|
class I18n
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
5
|
+
# Use comprehensive CJK definition from metanorma-utils
|
6
|
+
# This includes Han, Katakana, Hiragana, Hangul, Bopomofo and all CJK extensions
|
7
|
+
ZH_CHAR = "(#{Metanorma::Utils::CJK})".freeze
|
8
|
+
LATIN_PUNCT = /[:,.()\[\];?!-]/.freeze
|
9
|
+
|
10
|
+
# Condition for converting punctuation to double width:
|
11
|
+
# 1. (Strict condition) CJK before, CJK after, modulo ignorable characters:
|
12
|
+
# 1a. CJK character, or start of string. Latin spaces optional.
|
13
|
+
ZH1_PUNCT = /(#{ZH_CHAR}|^)(\s*)$/xo.freeze
|
14
|
+
# 1b. Latin spaces optional, Latin punct which will also convert to CJK,
|
15
|
+
# CJK character, or end of string.
|
16
|
+
ZH2_PUNCT = /^\s*#{LATIN_PUNCT}*(#{ZH_CHAR}|$)/xo.freeze
|
17
|
+
# 2. CJK before, space after:
|
18
|
+
# 2a. CJK char, followed by optional Latin punct which will also convert to CJK
|
19
|
+
ZH1_NO_SPACE = /#{ZH_CHAR}#{LATIN_PUNCT}*$/xo.freeze
|
20
|
+
# 2b. optional Latin punct which will also convert to CJK, then space
|
21
|
+
OPT_PUNCT_SPACE = /^($|#{LATIN_PUNCT}*\s)/xo.freeze
|
22
|
+
|
23
|
+
# Contexts for converting en-dashes to full-width
|
24
|
+
# Before: CJK or start of string, optional digits
|
25
|
+
ZH1_DASH = /(#{ZH_CHAR}|^)(\d*)$/xo.freeze
|
26
|
+
# After: optional digits, CJK or end of string
|
27
|
+
ZH2_DASH = /^\d*(#{ZH_CHAR}|$)/xo.freeze
|
28
|
+
|
29
|
+
# Pre-defined punctuation mappings for efficiency
|
30
|
+
ZH_PUNCT_MAP = [
|
31
|
+
["::", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
32
|
+
[",,", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
33
|
+
[".。", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
34
|
+
["))", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
35
|
+
["]]", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
36
|
+
[";;", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
37
|
+
["??", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
38
|
+
["!!", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
39
|
+
["((", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
40
|
+
["[[", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]]
|
41
|
+
].freeze
|
42
|
+
|
43
|
+
def self.l10n(text, lang = @lang, script = @script, options = {})
|
44
|
+
l10n(text, lang, script, options)
|
45
|
+
end
|
46
|
+
|
47
|
+
# function localising spaces and punctuation
|
48
|
+
# options[:prev] and options[:foll] are optional context strings
|
49
|
+
def l10n(text, lang = @lang, script = @script, options = {})
|
50
|
+
locale = options[:locale] || @locale
|
51
|
+
%w(zh ja ko).include?(lang) and text = l10n_zh(text, script, options[:prev], options[:foll])
|
52
|
+
lang == "fr" && text = l10n_fr(text, locale || "FR", options[:prev], options[:foll])
|
11
53
|
bidiwrap(text, lang, script)
|
12
54
|
end
|
13
55
|
|
@@ -30,88 +72,100 @@ module IsoDoc
|
|
30
72
|
end
|
31
73
|
|
32
74
|
# CJK
|
33
|
-
def l10n_zh(text, script
|
34
|
-
|
35
|
-
t =
|
75
|
+
def l10n_zh(text, script, prev, foll)
|
76
|
+
script ||= "Hans"
|
77
|
+
t, text_cache, xml = l10n_prep(text, prev, foll)
|
36
78
|
t.each_with_index do |n, i|
|
37
|
-
|
79
|
+
# Adjust index if prev context prepended
|
80
|
+
prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
|
38
81
|
text = cleanup_entities(n.text, is_xml: false)
|
39
|
-
n.replace(l10_zh1(text,
|
82
|
+
n.replace(l10_zh1(text, prev_ctx, foll_ctx, script))
|
40
83
|
end
|
41
|
-
to_xml(xml).gsub(/<b
|
42
|
-
|
84
|
+
to_xml(xml).gsub(/<b>|<\/b>|<\?[^>]+>/, "")
|
85
|
+
end
|
86
|
+
|
87
|
+
def l10n_prep(text, prev, foll)
|
88
|
+
xml = Nokogiri::XML::DocumentFragment.parse(text)
|
89
|
+
t = xml.xpath(".//text()")
|
90
|
+
text_cache = build_text_cache(t, prev, foll)
|
91
|
+
[t, text_cache, xml]
|
92
|
+
end
|
93
|
+
|
94
|
+
# Cache text content once per method call to avoid repeated .text calls
|
95
|
+
# Build text cache with optional prepended/appended context
|
96
|
+
def build_text_cache(text_nodes, prev_context = nil, foll_context = nil)
|
97
|
+
text_cache = text_nodes.map(&:text)
|
98
|
+
text_cache.unshift(prev_context) if prev_context
|
99
|
+
text_cache.push(foll_context) if foll_context
|
100
|
+
text_cache
|
43
101
|
end
|
44
102
|
|
45
103
|
# previous, following context of current text node:
|
46
104
|
# do not use just the immediately adjoining text tokens for context
|
47
105
|
# deal with spaces and empty text by just concatenating entire context
|
106
|
+
# Optimized to avoid O(n²) complexity by using pre-cached text content
|
107
|
+
def l10n_context_cached(text_cache, idx)
|
108
|
+
prev = text_cache[0...idx].join
|
109
|
+
foll = text_cache[(idx + 1)...text_cache.size].join
|
110
|
+
[prev, foll]
|
111
|
+
end
|
112
|
+
|
113
|
+
# Fallback method for backward compatibility
|
48
114
|
def l10n_context(nodes, idx)
|
49
115
|
prev = nodes[0...idx].map(&:text).join
|
50
116
|
foll = nodes[(idx + 1)...(nodes.size)].map(&:text).join
|
51
117
|
[prev, foll]
|
52
118
|
end
|
53
119
|
|
54
|
-
def l10n_fr(text, locale)
|
55
|
-
xml =
|
56
|
-
t = xml.xpath(".//text()")
|
120
|
+
def l10n_fr(text, locale, prev, foll)
|
121
|
+
t, text_cache, xml = l10n_prep(text, prev, foll)
|
57
122
|
t.each_with_index do |n, i|
|
58
|
-
|
123
|
+
prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
|
59
124
|
text = cleanup_entities(n.text, is_xml: false)
|
60
|
-
n.replace(l10n_fr1(text,
|
125
|
+
n.replace(l10n_fr1(text, prev_ctx, foll_ctx, locale))
|
61
126
|
end
|
62
127
|
to_xml(xml)
|
63
128
|
end
|
64
129
|
|
65
|
-
ZH_CHAR = "(\\p{Han}|\\p{In CJK Symbols And Punctuation}|" \
|
66
|
-
"\\p{In Halfwidth And Fullwidth Forms})".freeze
|
67
|
-
|
68
130
|
# note: we can't differentiate comma from enumeration comma 、
|
69
131
|
# def l10_zh1(text, _script)
|
70
132
|
def l10_zh1(text, prev, foll, _script)
|
71
|
-
# l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
|
72
133
|
r = l10n_zh_punct(text, prev, foll)
|
73
134
|
r = l10n_zh_remove_space(r, prev, foll)
|
74
135
|
l10n_zh_dash(r, prev, foll)
|
75
136
|
end
|
76
137
|
|
77
|
-
ZH1_PUNCT = /(#{ZH_CHAR}|^) # CJK character, or start of string
|
78
|
-
(\s*)$ # Latin spaces optional
|
79
|
-
/xo.freeze
|
80
|
-
ZH2_PUNCT = /^\s* # followed by ignorable Latin spaces
|
81
|
-
[:,.()\[\];?!-]* # Latin punct which will also convert to CJK
|
82
|
-
(#{ZH_CHAR}|$) # CJK character, or end of string
|
83
|
-
/xo.freeze
|
84
|
-
|
85
|
-
# CJK punct if (^|CJK).($|CJK)
|
86
138
|
def l10n_zh_punct(text, prev, foll)
|
87
|
-
|
88
|
-
|
89
|
-
|
139
|
+
# Use pre-defined mapping for better performance
|
140
|
+
ZH_PUNCT_MAP.each do |mapping|
|
141
|
+
punct_pair, regexes = mapping
|
142
|
+
text = l10n_gsub(text, prev, foll, [punct_pair[0], punct_pair[1]], regexes)
|
90
143
|
end
|
91
144
|
text
|
92
145
|
end
|
93
146
|
|
94
|
-
ZH1_DASH = /(#{ZH_CHAR}|^) # CJK character, or start of string
|
95
|
-
(\d*) # optional digits
|
96
|
-
$/xo.freeze
|
97
|
-
|
98
|
-
ZH2_DASH = /^\d* # followed by optional digits
|
99
|
-
(#{ZH_CHAR}|$) # CJK character, or end of string
|
100
|
-
/xo.freeze
|
101
|
-
|
102
147
|
def l10n_zh_dash(text, prev, foll)
|
103
|
-
l10n_gsub(text, prev, foll, %w(– ~), [ZH1_DASH, ZH2_DASH])
|
148
|
+
l10n_gsub(text, prev, foll, %w(– ~), [[ZH1_DASH, ZH2_DASH]])
|
104
149
|
end
|
105
150
|
|
106
|
-
|
151
|
+
# text: string we are scanning for instances of delim[0] to replace
|
152
|
+
# prev: string preceding text, as additional token of context
|
153
|
+
# foll: string following text, as additional token of context
|
154
|
+
# delim: delim[0] is the symbol we want to replace, delim[1] its replacement
|
155
|
+
# regexes: a list of regex pairs: the context before the found token,
|
156
|
+
# and the context after the found token, under which replacing it
|
157
|
+
# with delim[1] is permitted
|
158
|
+
def l10n_gsub(text, prev, foll, delim, regexes)
|
107
159
|
context = l10n_gsub_context(text, prev, foll, delim) or return text
|
108
160
|
(1...(context.size - 1)).each do |i|
|
109
|
-
l10_context_valid?(context, i, delim,
|
161
|
+
l10_context_valid?(context, i, delim, regexes) and
|
110
162
|
context[i] = delim[1].gsub("\\0", context[i]) # Full-width equivalent
|
111
163
|
end
|
112
164
|
context[1...(context.size - 1)].join
|
113
165
|
end
|
114
166
|
|
167
|
+
# split string being scanned, and its contextual tokens before and after,
|
168
|
+
# into array of tokens determining whether to replace instances of delim[0]
|
115
169
|
def l10n_gsub_context(text, prev, foll, delim)
|
116
170
|
d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
|
117
171
|
context = text.split(/(#{d})/) # delim to replace
|
@@ -120,30 +174,36 @@ module IsoDoc
|
|
120
174
|
end
|
121
175
|
|
122
176
|
def l10_context_valid?(context, idx, delim, regex)
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
177
|
+
l10n_context_found_delimiter?(context[idx], delim) or return false
|
178
|
+
regex.detect do |r|
|
179
|
+
r[0].match?(context[0...idx].join) && # preceding context
|
180
|
+
r[1].match?(context[(idx + 1)..-1].join) # foll context
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
184
|
+
def l10n_context_found_delimiter?(token, delim)
|
185
|
+
if delim[0].is_a?(Regexp) # punct to convert
|
186
|
+
delim[0].match?(token)
|
187
|
+
else
|
188
|
+
token == delim[0]
|
189
|
+
end
|
131
190
|
end
|
132
191
|
|
133
192
|
def l10n_zh_remove_space(text, prev, foll)
|
134
193
|
text = l10n_gsub(text, prev, foll, [" ", ""],
|
135
|
-
[/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o])
|
194
|
+
[[/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o]])
|
136
195
|
l10n_gsub(text, prev, foll, [" ", ""],
|
137
|
-
[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o])
|
196
|
+
[[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o]])
|
138
197
|
end
|
139
198
|
|
140
199
|
def l10n_fr1(text, prev, foll, locale)
|
141
200
|
text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
|
142
|
-
[/\p{Alnum}$/, /^(\s|$)/])
|
143
|
-
text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"],
|
201
|
+
[[/\p{Alnum}$/, /^(\s|$)/]])
|
202
|
+
text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"],
|
203
|
+
[[/$/, /^(?!\p{Zs})./]])
|
144
204
|
colonsp = locale == "CH" ? "\u202f" : "\u00a0"
|
145
205
|
l10n_gsub(text, prev, foll, [":", "#{colonsp}\\0"],
|
146
|
-
[/\p{Alnum}$/, /^(\s|$)/])
|
206
|
+
[[/\p{Alnum}$/, /^(\s|$)/]])
|
147
207
|
end
|
148
208
|
|
149
209
|
def self.cjk_extend(text)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: isodoc-i18n
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-09-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: htmlentities
|
@@ -170,14 +170,28 @@ dependencies:
|
|
170
170
|
requirements:
|
171
171
|
- - "~>"
|
172
172
|
- !ruby/object:Gem::Version
|
173
|
-
version: 1
|
173
|
+
version: '1'
|
174
174
|
type: :development
|
175
175
|
prerelease: false
|
176
176
|
version_requirements: !ruby/object:Gem::Requirement
|
177
177
|
requirements:
|
178
178
|
- - "~>"
|
179
179
|
- !ruby/object:Gem::Version
|
180
|
-
version: 1
|
180
|
+
version: '1'
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: rubocop-performance
|
183
|
+
requirement: !ruby/object:Gem::Requirement
|
184
|
+
requirements:
|
185
|
+
- - ">="
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
version: '0'
|
188
|
+
type: :development
|
189
|
+
prerelease: false
|
190
|
+
version_requirements: !ruby/object:Gem::Requirement
|
191
|
+
requirements:
|
192
|
+
- - ">="
|
193
|
+
- !ruby/object:Gem::Version
|
194
|
+
version: '0'
|
181
195
|
- !ruby/object:Gem::Dependency
|
182
196
|
name: simplecov
|
183
197
|
requirement: !ruby/object:Gem::Requirement
|
@@ -221,7 +235,7 @@ dependencies:
|
|
221
235
|
- !ruby/object:Gem::Version
|
222
236
|
version: '0'
|
223
237
|
- !ruby/object:Gem::Dependency
|
224
|
-
name:
|
238
|
+
name: canon
|
225
239
|
requirement: !ruby/object:Gem::Requirement
|
226
240
|
requirements:
|
227
241
|
- - ">="
|
@@ -277,7 +291,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
277
291
|
- !ruby/object:Gem::Version
|
278
292
|
version: '0'
|
279
293
|
requirements: []
|
280
|
-
rubygems_version: 3.
|
294
|
+
rubygems_version: 3.5.22
|
281
295
|
signing_key:
|
282
296
|
specification_version: 4
|
283
297
|
summary: isodoc-i18n
|