isodoc-i18n 1.3.2 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +1 -1
- data/isodoc-i18n.gemspec +4 -3
- data/lib/isodoc/i18n/version.rb +1 -1
- data/lib/isodoc/i18n-yaml.rb +74 -5
- data/lib/isodoc/i18n.rb +5 -1
- data/lib/isodoc/l10n.rb +81 -106
- data/lib/isodoc/l10n_cjk.rb +165 -0
- data/lib/isodoc-yaml/i18n-en.yaml +21 -0
- metadata +36 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e08e5e4a6c9b89a5f628ee426ce22f8bcdbae97e7cccd4411f4962844d29b231
|
4
|
+
data.tar.gz: 9d8cbc5526c1d8aabe9db00c25deeb581a3604358b50b18e6f5a9bb7faab6909
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3880d68a1f094ab500840e74767f970af57fca01a12ae751da68e9cdc72ba00df9139516df3c74dda5059114fe723b800c057c665efdd485e44aa9a247cdde22
|
7
|
+
data.tar.gz: 80643511d523a3535293a2b7b82e60fe81a8910dbe9fc336cf8a4ab95a9763b1adf1375f802f5b36177a6e4570c51688b803576f7d2d39b8fc7c29e894d2f235
|
data/README.adoc
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
image:https://img.shields.io/gem/v/isodoc-i18n.svg["Gem Version", link="https://rubygems.org/gems/isodoc-i18n"]
|
4
4
|
image:https://github.com/metanorma/isodoc-i18n/workflows/rake/badge.svg["Build Status", link="https://github.com/metanorma/isodoc-i18n/actions?query=workflow%3Arake"]
|
5
|
-
image:https://codeclimate.com/github/metanorma/isodoc-i18n/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/isodoc-i18n"]
|
5
|
+
// image:https://codeclimate.com/github/metanorma/isodoc-i18n/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/isodoc-i18n"]
|
6
6
|
image:https://img.shields.io/github/issues-pr-raw/metanorma/isodoc-i18n.svg["Pull Requests", link="https://github.com/metanorma/isodoc-i18n/pulls"]
|
7
7
|
image:https://img.shields.io/github/commits-since/metanorma/isodoc-i18n/latest.svg["Commits since latest",link="https://github.com/metanorma/isodoc-i18n/releases"]
|
8
8
|
|
data/isodoc-i18n.gemspec
CHANGED
@@ -22,22 +22,23 @@ Gem::Specification.new do |spec|
|
|
22
22
|
end
|
23
23
|
spec.required_ruby_version = Gem::Requirement.new(">= 2.7.0")
|
24
24
|
|
25
|
+
spec.add_dependency "base64"
|
25
26
|
spec.add_dependency "htmlentities", "~> 4.3.4"
|
26
27
|
spec.add_dependency "liquid", "~> 5"
|
27
28
|
spec.add_dependency "metanorma-utils", ">= 1.7.0"
|
28
29
|
spec.add_dependency "twitter_cldr"
|
29
|
-
spec.add_dependency "base64"
|
30
30
|
|
31
|
+
spec.add_development_dependency "canon"
|
31
32
|
spec.add_development_dependency "debug"
|
32
33
|
spec.add_development_dependency "equivalent-xml", "~> 0.6"
|
33
34
|
spec.add_development_dependency "guard", "~> 2.14"
|
34
35
|
spec.add_development_dependency "guard-rspec", "~> 4.7"
|
35
36
|
spec.add_development_dependency "rake", "~> 13.0"
|
36
37
|
spec.add_development_dependency "rspec", "~> 3.6"
|
37
|
-
spec.add_development_dependency "rubocop", "~> 1
|
38
|
+
spec.add_development_dependency "rubocop", "~> 1"
|
39
|
+
spec.add_development_dependency "rubocop-performance"
|
38
40
|
spec.add_development_dependency "simplecov", "~> 0.15"
|
39
41
|
spec.add_development_dependency "timecop", "~> 0.9"
|
40
42
|
spec.add_development_dependency "webmock"
|
41
|
-
spec.add_development_dependency "xml-c14n"
|
42
43
|
# spec.metadata["rubygems_mfa_required"] = "true"
|
43
44
|
end
|
data/lib/isodoc/i18n/version.rb
CHANGED
data/lib/isodoc/i18n-yaml.rb
CHANGED
@@ -8,10 +8,79 @@ module IsoDoc
|
|
8
8
|
def load_yaml(lang, script, i18nyaml = nil, i18nhash = nil)
|
9
9
|
ret = load_yaml1(lang, script)
|
10
10
|
i18nyaml and
|
11
|
-
return
|
12
|
-
i18nhash and return
|
11
|
+
return postprocess(ret.deep_merge(YAML.load_file(i18nyaml)))
|
12
|
+
i18nhash and return postprocess(ret.deep_merge(i18nhash))
|
13
13
|
|
14
|
-
|
14
|
+
postprocess(ret)
|
15
|
+
end
|
16
|
+
|
17
|
+
def postprocess(labels)
|
18
|
+
self_reference_resolve(normalise_hash(labels))
|
19
|
+
end
|
20
|
+
|
21
|
+
def self_reference_resolve(labels)
|
22
|
+
resolve_references(labels, labels)
|
23
|
+
end
|
24
|
+
|
25
|
+
def resolve_references(obj, labels)
|
26
|
+
case obj
|
27
|
+
when Hash
|
28
|
+
obj.transform_values { |v| resolve_references(v, labels) }
|
29
|
+
when Array
|
30
|
+
obj.map { |item| resolve_references(item, labels) }
|
31
|
+
when String
|
32
|
+
resolve_string_references(obj, labels)
|
33
|
+
else
|
34
|
+
obj
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def resolve_string_references(str, labels)
|
39
|
+
# Match patterns like #{self["key"]["subkey"]} or #{self.key.subkey}
|
40
|
+
# Allow spaces around the self expression
|
41
|
+
str.gsub(/\#\{\s*self([^\}]+?)\s*\}/) do |match|
|
42
|
+
path_expr = Regexp.last_match(1)
|
43
|
+
resolve_path(path_expr, labels, match)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def resolve_path(path_expr, labels, original_expr)
|
48
|
+
segments = parse_path(path_expr)
|
49
|
+
current = labels
|
50
|
+
|
51
|
+
segments.each do |segment|
|
52
|
+
case current
|
53
|
+
when Hash
|
54
|
+
current.key?(segment) or
|
55
|
+
raise "Self-reference error: Path '#{original_expr}' not found - key '#{segment}' does not exist"
|
56
|
+
current = current[segment]
|
57
|
+
when Array
|
58
|
+
index = segment.to_i
|
59
|
+
segment =~ /^\d+$/ && index >= 0 && index < current.length or
|
60
|
+
raise "Self-reference error: Path '#{original_expr}' not found - invalid array index '#{segment}'"
|
61
|
+
current = current[index]
|
62
|
+
else
|
63
|
+
raise "Self-reference error: Path '#{original_expr}' not found - cannot navigate through non-collection type"
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
current.to_s
|
68
|
+
end
|
69
|
+
|
70
|
+
def parse_path(path_expr)
|
71
|
+
segments = []
|
72
|
+
path_expr = path_expr.sub(/^\./, "")
|
73
|
+
# Split by dots and brackets while preserving the content
|
74
|
+
parts = path_expr.scan(/\.?([\w-]+)|\[([^\]]+)\]/)
|
75
|
+
parts.each do |dot_part, bracket_part|
|
76
|
+
if dot_part
|
77
|
+
segments << dot_part
|
78
|
+
elsif bracket_part
|
79
|
+
segment = bracket_part.strip.gsub(/^["']|["']$/, "")
|
80
|
+
segments << segment
|
81
|
+
end
|
82
|
+
end
|
83
|
+
segments
|
15
84
|
end
|
16
85
|
|
17
86
|
def normalise_hash(ret)
|
@@ -30,8 +99,8 @@ module IsoDoc
|
|
30
99
|
def load_yaml1(lang, script)
|
31
100
|
case lang
|
32
101
|
when "zh"
|
33
|
-
if script
|
34
|
-
else load_yaml2("
|
102
|
+
if script then load_yaml2("zh-#{script}")
|
103
|
+
else load_yaml2("zh-Hans")
|
35
104
|
end
|
36
105
|
else
|
37
106
|
load_yaml2(lang)
|
data/lib/isodoc/i18n.rb
CHANGED
@@ -24,6 +24,8 @@ module IsoDoc
|
|
24
24
|
self
|
25
25
|
end
|
26
26
|
|
27
|
+
CJK_SCRIPTS = %w(Hans Hant Jpan Kore).freeze
|
28
|
+
|
27
29
|
def liquid_init
|
28
30
|
::IsoDoc::I18n::Liquid.set(self)
|
29
31
|
::Liquid::Environment.default.register_filter(::IsoDoc::I18n::Liquid)
|
@@ -66,7 +68,9 @@ module IsoDoc
|
|
66
68
|
end
|
67
69
|
|
68
70
|
def enum_comma
|
69
|
-
|
71
|
+
c = @labels.dig("punct", "enum-comma")
|
72
|
+
c && CJK_SCRIPTS.include?(@script) and
|
73
|
+
return "<enum-comma>#{c}</enum-comma>"
|
70
74
|
"<enum-comma>,</enum-comma> "
|
71
75
|
end
|
72
76
|
|
data/lib/isodoc/l10n.rb
CHANGED
@@ -1,13 +1,23 @@
|
|
1
|
+
require "metanorma-utils"
|
2
|
+
require_relative "l10n_cjk"
|
3
|
+
|
1
4
|
module IsoDoc
|
2
5
|
class I18n
|
3
|
-
def self.l10n(text, lang = @lang, script = @script,
|
4
|
-
l10n(text, lang, script,
|
5
|
-
end
|
6
|
-
|
7
|
-
# function localising spaces and punctuation
|
8
|
-
|
9
|
-
|
10
|
-
|
6
|
+
def self.l10n(text, lang = @lang, script = @script, options = {})
|
7
|
+
l10n(text, lang, script, options)
|
8
|
+
end
|
9
|
+
|
10
|
+
# function localising spaces and punctuation
|
11
|
+
# options[:prev] and options[:foll] are optional context strings
|
12
|
+
# options[:proportional_mixed_cjk] allows contextual full-width vs
|
13
|
+
# half-width punctuation
|
14
|
+
# Localise spacing and punctuation in text, then hand the result to bidiwrap.
# text: string to localise (may contain <esc> tags, which are removed); may be nil
# lang, script: language and script codes, defaulting to @lang and @script
# options: optional hash; options[:locale] overrides @locale, and the whole
# hash is forwarded to l10n_zh / l10n_fr
# Returns the value of bidiwrap(text, lang, script) for the processed text.
def l10n(text, lang = @lang, script = @script, options = {})
  locale = options[:locale] || @locale
  %w(zh ja ko).include?(lang) and
    text = l10n_zh(text, script, options)
  lang == "fr" and
    text = l10n_fr(text, locale || "FR", options)
  # Strip esc tags. For any other language, text is still the caller's own
  # string at this point, so use non-destructive gsub: gsub! would modify the
  # caller's object in place and raise FrozenError on a frozen string.
  text = text&.gsub(/<esc>|<\/esc>/, "")
  bidiwrap(text, lang, script)
end
|
13
23
|
|
@@ -29,145 +39,110 @@ module IsoDoc
|
|
29
39
|
.default_script(@lang))]
|
30
40
|
end
|
31
41
|
|
32
|
-
|
33
|
-
def l10n_zh(text, script = "Hans")
|
42
|
+
def l10n_prep(text, options)
|
34
43
|
xml = Nokogiri::XML::DocumentFragment.parse(text)
|
35
|
-
t = xml.xpath(".//text()")
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
44
|
+
t = xml.xpath(".//text()").reject { |node| node.text.empty? }
|
45
|
+
text_cache = build_text_cache(t, options[:prev], options[:foll])
|
46
|
+
|
47
|
+
# Identify which text nodes are within <esc> tags
|
48
|
+
esc_indices = Set.new
|
49
|
+
t.each_with_index do |node, i|
|
50
|
+
esc_indices.add(i) if node.ancestors("esc").any?
|
40
51
|
end
|
41
|
-
|
42
|
-
|
52
|
+
|
53
|
+
[t, text_cache, xml, options[:prev], options[:foll], esc_indices]
|
54
|
+
end
|
55
|
+
|
56
|
+
# Cache text content once per method call to avoid repeated .text calls
|
57
|
+
# Build text cache with optional prepended/appended context
|
58
|
+
# Also, reduce multiple spaces to single, to avoid misrecognition of space
|
59
|
+
def build_text_cache(text_nodes, prev_context = nil, foll_context = nil)
|
60
|
+
text_cache = text_nodes.map(&:text).map { |x| x.gsub(/\s+/, " ") }
|
61
|
+
text_cache.unshift(prev_context) if prev_context
|
62
|
+
text_cache.push(foll_context) if foll_context
|
63
|
+
text_cache
|
43
64
|
end
|
44
65
|
|
45
66
|
# previous, following context of current text node:
|
46
67
|
# do not use just the immediately adjoining text tokens for context
|
47
68
|
# deal with spaces and empty text by just concatenating entire context
|
69
|
+
# Uses pre-cached text content rather than calling .text on every node for each lookup
|
70
|
+
def l10n_context_cached(text_cache, idx)
|
71
|
+
prev = text_cache[0...idx].join
|
72
|
+
foll = text_cache[(idx + 1)...text_cache.size].join
|
73
|
+
[prev, foll]
|
74
|
+
end
|
75
|
+
|
76
|
+
# Fallback method for backward compatibility
|
48
77
|
def l10n_context(nodes, idx)
|
49
78
|
prev = nodes[0...idx].map(&:text).join
|
50
79
|
foll = nodes[(idx + 1)...(nodes.size)].map(&:text).join
|
51
80
|
[prev, foll]
|
52
81
|
end
|
53
82
|
|
54
|
-
def l10n_fr(text, locale)
|
55
|
-
xml =
|
56
|
-
t = xml.xpath(".//text()")
|
83
|
+
def l10n_fr(text, locale, options)
|
84
|
+
t, text_cache, xml, prev, _foll, esc_indices = l10n_prep(text, options)
|
57
85
|
t.each_with_index do |n, i|
|
58
|
-
|
86
|
+
next if esc_indices.include?(i) # Skip escaped nodes
|
87
|
+
|
88
|
+
prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
|
59
89
|
text = cleanup_entities(n.text, is_xml: false)
|
60
|
-
n.replace(l10n_fr1(text,
|
90
|
+
n.replace(l10n_fr1(text, prev_ctx, foll_ctx, locale))
|
61
91
|
end
|
62
92
|
to_xml(xml)
|
63
93
|
end
|
64
94
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
#
|
69
|
-
#
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
l10n_zh_dash(r, prev, foll)
|
75
|
-
end
|
76
|
-
|
77
|
-
ZH1_PUNCT = /(#{ZH_CHAR}|^) # CJK character, or start of string
|
78
|
-
(\s*)$ # Latin spaces optional
|
79
|
-
/xo.freeze
|
80
|
-
ZH2_PUNCT = /^\s* # followed by ignorable Latin spaces
|
81
|
-
[:,.()\[\];?!-]* # Latin punct which will also convert to CJK
|
82
|
-
(#{ZH_CHAR}|$) # CJK character, or end of string
|
83
|
-
/xo.freeze
|
84
|
-
|
85
|
-
# CJK punct if (^|CJK).($|CJK)
|
86
|
-
def l10n_zh_punct(text, prev, foll)
|
87
|
-
["::", ",,", "..", "))", "]]", ";;", "??", "!!", "((", "[["].each do |m|
|
88
|
-
text = l10n_gsub(text, prev, foll, [m[0], m[1]],
|
89
|
-
[ZH1_PUNCT, ZH2_PUNCT])
|
90
|
-
end
|
91
|
-
text
|
92
|
-
end
|
93
|
-
|
94
|
-
ZH1_DASH = /(#{ZH_CHAR}|^) # CJK character, or start of string
|
95
|
-
(\d*) # optional digits
|
96
|
-
$/xo.freeze
|
97
|
-
|
98
|
-
ZH2_DASH = /^\d* # followed by optional digits
|
99
|
-
(#{ZH_CHAR}|$) # CJK character, or end of string
|
100
|
-
/xo.freeze
|
101
|
-
|
102
|
-
def l10n_zh_dash(text, prev, foll)
|
103
|
-
l10n_gsub(text, prev, foll, %w(– ~), [ZH1_DASH, ZH2_DASH])
|
104
|
-
end
|
105
|
-
|
106
|
-
def l10n_gsub(text, prev, foll, delim, regex)
|
95
|
+
# text: string we are scanning for instances of delim[0] to replace
|
96
|
+
# prev: string preceding text, as additional token of context
|
97
|
+
# foll: string following text, as additional token of context
|
98
|
+
# delim: delim[0] is the symbol we want to replace, delim[1] its replacement
|
99
|
+
# regexes: a list of regex pairs: the context before the found token,
|
100
|
+
# and the context after the found token, under which replacing it
|
101
|
+
# with delim[1] is permitted. If regex is nil, always allow the replacement
|
102
|
+
def l10n_gsub(text, prev, foll, delim, regexes)
|
103
|
+
delim[1] or return text
|
107
104
|
context = l10n_gsub_context(text, prev, foll, delim) or return text
|
108
105
|
(1...(context.size - 1)).each do |i|
|
109
|
-
l10_context_valid?(context, i, delim,
|
106
|
+
l10_context_valid?(context, i, delim, regexes) and
|
110
107
|
context[i] = delim[1].gsub("\\0", context[i]) # Full-width equivalent
|
111
108
|
end
|
112
109
|
context[1...(context.size - 1)].join
|
113
110
|
end
|
114
111
|
|
112
|
+
# split string being scanned, and its contextual tokens before and after,
|
113
|
+
# into array of tokens determining whether to replace instances of delim[0]
|
115
114
|
def l10n_gsub_context(text, prev, foll, delim)
|
116
115
|
d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
|
117
116
|
context = text.split(/(#{d})/) # delim to replace
|
118
117
|
context.size == 1 and return
|
119
|
-
[prev, context, foll].flatten
|
118
|
+
[prev, context.reject(&:empty?), foll].flatten
|
120
119
|
end
|
121
120
|
|
122
121
|
def l10_context_valid?(context, idx, delim, regex)
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
regex[0].match?(context[0...idx].join) && # preceding context
|
130
|
-
regex[1].match?(context[(idx + 1)..-1].join) # foll context
|
122
|
+
l10n_context_found_delimiter?(context[idx], delim) or return false
|
123
|
+
regex.nil? and return true
|
124
|
+
regex.detect do |r|
|
125
|
+
r[0].match?(context[0...idx].join) && # preceding context
|
126
|
+
r[1].match?(context[(idx + 1)..-1].join) # foll context
|
127
|
+
end
|
131
128
|
end
|
132
129
|
|
133
|
-
def
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
130
|
+
def l10n_context_found_delimiter?(token, delim)
|
131
|
+
if delim[0].is_a?(Regexp) # punct to convert
|
132
|
+
delim[0].match?(token)
|
133
|
+
else
|
134
|
+
token == delim[0]
|
135
|
+
end
|
138
136
|
end
|
139
137
|
|
140
138
|
def l10n_fr1(text, prev, foll, locale)
|
141
139
|
text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
|
142
|
-
[/\p{Alnum}$/, /^(\s|$)/])
|
143
|
-
text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"],
|
140
|
+
[[/\p{Alnum}$/, /^(\s|$)/]])
|
141
|
+
text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"],
|
142
|
+
[[/$/, /^(?!\p{Zs})./]])
|
144
143
|
colonsp = locale == "CH" ? "\u202f" : "\u00a0"
|
145
144
|
l10n_gsub(text, prev, foll, [":", "#{colonsp}\\0"],
|
146
|
-
[/\p{Alnum}$/, /^(\s|$)/])
|
147
|
-
end
|
148
|
-
|
149
|
-
def self.cjk_extend(text)
|
150
|
-
cjk_extend(text)
|
151
|
-
end
|
152
|
-
|
153
|
-
def cjk_extend(title)
|
154
|
-
@c.decode(title).chars.map.with_index do |n, i|
|
155
|
-
if i.zero? || !interleave_space_cjk?(title[i - 1] + title[i])
|
156
|
-
n
|
157
|
-
else "\u3000#{n}"
|
158
|
-
end
|
159
|
-
end.join
|
160
|
-
end
|
161
|
-
|
162
|
-
def interleave_space_cjk?(text)
|
163
|
-
text.size == 2 or return
|
164
|
-
["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
|
165
|
-
"\u22ef\u22ef"].include?(text) ||
|
166
|
-
/\d\d|\p{Latin}\p{Latin}|[[:space:]]/.match?(text) ||
|
167
|
-
/^[\u2018\u201c(\u3014\[{\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/.match?(text) ||
|
168
|
-
/[\u2019\u201d)\u3015\]}\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/.match?(text) ||
|
169
|
-
/[\u3002.\u3001,\u30fb:;\u2010\u301c\u30a0\u2013!?\u203c\u2047\u2048\u2049]/.match?(text) and return false
|
170
|
-
true
|
145
|
+
[[/\p{Alnum}$/, /^(\s|$)/]])
|
171
146
|
end
|
172
147
|
|
173
148
|
def to_xml(node)
|
@@ -0,0 +1,165 @@
|
|
1
|
+
module IsoDoc
|
2
|
+
class I18n
|
3
|
+
# Use comprehensive CJK definition from metanorma-utils
|
4
|
+
# This includes Han, Katakana, Hiragana, Hangul, Bopomofo
|
5
|
+
# and all CJK extensions
|
6
|
+
ZH_CHAR = "(#{Metanorma::Utils::CJK})".freeze
|
7
|
+
LATIN_PUNCT = /[:,.()\[\];?!-]/.freeze
|
8
|
+
# CJK character which is not punctuation
|
9
|
+
ZH_NON_PUNCT = "(#{
|
10
|
+
[
|
11
|
+
Metanorma::Utils.singleton_class::HAN,
|
12
|
+
Metanorma::Utils.singleton_class::HAN_IDC,
|
13
|
+
Metanorma::Utils.singleton_class::KANBUN,
|
14
|
+
Metanorma::Utils.singleton_class::CJK_COMPAT_IDEOGRAPHS,
|
15
|
+
Metanorma::Utils.singleton_class::HAN_COMPAT_IDEOGRAPHS,
|
16
|
+
Metanorma::Utils.singleton_class::HANGUL,
|
17
|
+
Metanorma::Utils.singleton_class::HIRAGANA,
|
18
|
+
Metanorma::Utils.singleton_class::KATAKANA,
|
19
|
+
Metanorma::Utils.singleton_class::BOPOMOFO,
|
20
|
+
].join("|")})".freeze
|
21
|
+
|
22
|
+
# Condition for converting punctuation to double width,
|
23
|
+
# in case of options[:proportional_mixed_cjk]
|
24
|
+
# 1. (Strict condition) CJK before, CJK after, modulo ignorable characters:
|
25
|
+
# 1a. CJK character, or start of string. Latin spaces optional.
|
26
|
+
ZH1_PUNCT = /(#{ZH_CHAR}|^)(\s*)$/xo.freeze
|
27
|
+
# 1b. Latin spaces optional, Latin punct which will also convert to CJK,
|
28
|
+
# CJK character, or end of string.
|
29
|
+
ZH2_PUNCT = /^\s*#{LATIN_PUNCT}*(#{ZH_CHAR}|$)/xo.freeze
|
30
|
+
# 2. CJK before, space after:
|
31
|
+
# 2a. CJK char, followed by optional Latin punct which will also convert to CJK
|
32
|
+
ZH1_NO_SPACE = /#{ZH_CHAR}#{LATIN_PUNCT}*$/xo.freeze
|
33
|
+
# 2b. optional Latin punct which will also convert to CJK, then space
|
34
|
+
OPT_PUNCT_SPACE = /^($|#{LATIN_PUNCT}*\s)/xo.freeze
|
35
|
+
|
36
|
+
# Chinese numerals (common + formal/financial forms)
|
37
|
+
# Explicit characters needed because Chinese numeral ideographs
|
38
|
+
# are not tagged with Unicode Number property
|
39
|
+
# Using alternation instead of character class to properly include \p{N}
|
40
|
+
ZH_NUMERALS = "(?:[零一二三四五六七八九十百千万亿壹贰叁肆伍陆柒捌玖拾佰仟萬億兆]|\\p{N})".freeze
|
41
|
+
|
42
|
+
# Contexts for converting en-dashes to full-width
|
43
|
+
# Before: CJK or start of string, no digits
|
44
|
+
# Preceding context ends in a CJK character that is not a numeral (negative lookbehind on ZH_NUMERALS), or nothing precedes the dash
ZH1_DASH = /(#{ZH_CHAR}|^)(?<!#{ZH_NUMERALS})$/xo.freeze
|
45
|
+
# After: not a numeral; CJK or end of string
|
46
|
+
ZH2_DASH = /^(?!#{ZH_NUMERALS})(#{ZH_CHAR}|$)/xo.freeze
|
47
|
+
# Before: context ends in a numeral (digit or Chinese numeral)
|
48
|
+
ZH1_NUM_DASH = /#{ZH_NUMERALS}$/xo.freeze
|
49
|
+
# After: context begins with a numeral (digit or Chinese numeral)
|
50
|
+
ZH2_NUM_DASH = /^#{ZH_NUMERALS}/xo.freeze
|
51
|
+
|
52
|
+
ZH_PUNCT_CONTEXTS =
|
53
|
+
[[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE],
|
54
|
+
[/(\s|^)$/, /^#{ZH_CHAR}/o]].freeze
|
55
|
+
|
56
|
+
# map of YAML punct keys to auto-text Latin equivalents
|
57
|
+
# Each key names an entry under "punct" in the labels; each value is the
# auto-text character that init_zh_punct_map pairs with that label as its
# replacement. Entries are applied one after another in this order, so two
# keys sharing a source character means the later one never sees it.
ZH_PUNCT_AUTOTEXT = {
  colon: ":",
  comma: ",",
  "enum-comma": ",",
  semicolon: ";",
  period: ".",
  "close-paren": ")",
  "open-paren": "(",
  "close-bracket": "]",
  "open-bracket": "[",
  "question-mark": "?",
  "exclamation-mark": "!",
  "em-dash": "—",
  "open-quote": "“",
  "close-quote": "”",
  "open-nested-quote": "‘", # U+2018, the opening mark; U+2019 belongs to close-nested-quote
  "close-nested-quote": "’",
  ellipse: "…",
}.freeze
|
76
|
+
|
77
|
+
# Pre-defined punctuation mappings for efficiency
|
78
|
+
def init_zh_punct_map
|
79
|
+
ZH_PUNCT_AUTOTEXT.each_with_object([]) do |(k, v), m|
|
80
|
+
@labels.dig("punct", k.to_s) or next
|
81
|
+
m << [v, @labels["punct"][k.to_s], ZH_PUNCT_CONTEXTS]
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
def l10n_zh(text, script, options)
|
86
|
+
script ||= "Hans"
|
87
|
+
t, text_cache, xml, prev, _foll, esc_indices = l10n_prep(text, options)
|
88
|
+
t.each_with_index do |n, i|
|
89
|
+
next if esc_indices.include?(i) # Skip escaped nodes
|
90
|
+
|
91
|
+
# Adjust index if prev context prepended
|
92
|
+
prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
|
93
|
+
text = cleanup_entities(n.text, is_xml: false)
|
94
|
+
n.replace(l10_zh1(text, prev_ctx, foll_ctx, script, options))
|
95
|
+
end
|
96
|
+
to_xml(xml) #.gsub(/<\/?em>|<\/?strong>|<\/?i>|<\/?b>/, "")
|
97
|
+
end
|
98
|
+
|
99
|
+
# note: we can't differentiate comma from enumeration comma 、
|
100
|
+
# def l10_zh1(text, _script)
|
101
|
+
def l10_zh1(text, prev, foll, _script, options)
|
102
|
+
r = l10n_zh_punct(text, prev, foll, options)
|
103
|
+
r = l10n_zh_remove_space(r, prev, foll)
|
104
|
+
l10n_zh_dash(r, prev, foll)
|
105
|
+
end
|
106
|
+
|
107
|
+
def l10n_zh_punct(text, prev, foll, options)
|
108
|
+
# Use pre-defined mapping for better performance
|
109
|
+
@zh_punct_map ||= init_zh_punct_map
|
110
|
+
@zh_punct_map.each do |mapping|
|
111
|
+
punct_from, punct_to, regexes = mapping
|
112
|
+
options[:proportional_mixed_cjk] or regexes = nil
|
113
|
+
text = l10n_gsub(text, prev, foll, [punct_from, punct_to],
|
114
|
+
regexes)
|
115
|
+
end
|
116
|
+
text
|
117
|
+
end
|
118
|
+
|
119
|
+
def l10n_zh_dash(text, prev, foll)
|
120
|
+
text = l10n_gsub(text, prev, foll, ["–", @labels.dig("punct", "en-dash")],
|
121
|
+
[[ZH1_DASH, ZH2_DASH]])
|
122
|
+
l10n_gsub(text, prev, foll, ["–", @labels.dig("punct", "number-en-dash")],
|
123
|
+
[[ZH1_NUM_DASH, ZH2_NUM_DASH]])
|
124
|
+
end
|
125
|
+
|
126
|
+
def l10n_zh_remove_space(text, prev, foll)
|
127
|
+
text = l10n_gsub(text, prev, foll, [/\s+/, ""],
|
128
|
+
[[/(#{ZH_CHAR})$/o, /^#{ZH_CHAR}/o]])
|
129
|
+
if sep = @labels.dig("punct", "cjk-latin-separator")
|
130
|
+
# Skip over punctuation to find Latin letters/numbers
|
131
|
+
text = l10n_gsub(text, prev, foll, [/\s+/, sep],
|
132
|
+
[[/#{ZH_CHAR}$/o, /^\p{P}*[\p{Latin}\p{N}]/o]])
|
133
|
+
l10n_gsub(text, prev, foll, [/\s+/, sep],
|
134
|
+
[[/[\p{Latin}\p{N}]\p{P}*$/o, /^#{ZH_NON_PUNCT}/o]])
|
135
|
+
else
|
136
|
+
l10n_gsub(text, prev, foll, [/\s+/, ""],
|
137
|
+
[[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o]])
|
138
|
+
end
|
139
|
+
end
|
140
|
+
|
141
|
+
def self.cjk_extend(text)
|
142
|
+
cjk_extend(text)
|
143
|
+
end
|
144
|
+
|
145
|
+
# Decode entities in title via @c, then prefix an ideographic space (U+3000)
# to every character after the first whose pairing with the preceding
# character makes interleave_space_cjk? return true. Returns the joined string.
def cjk_extend(title)
  chars = @c.decode(title).chars
  chars.each_with_index.map do |char, i|
    # Pair adjacent *decoded* characters: indexing the raw title instead
    # drifts out of step as soon as the title contains an entity
    if i.zero? || !interleave_space_cjk?(chars[i - 1] + char)
      char
    else "\u3000#{char}"
    end
  end.join
end
|
153
|
+
|
154
|
+
def interleave_space_cjk?(text)
|
155
|
+
text.size == 2 or return
|
156
|
+
["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
|
157
|
+
"\u22ef\u22ef"].include?(text) ||
|
158
|
+
/\d\d|\p{Latin}\p{Latin}|[[:space:]]/.match?(text) ||
|
159
|
+
/^[\u2018\u201c(\u3014\[{\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/.match?(text) ||
|
160
|
+
/[\u2019\u201d)\u3015\]}\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/.match?(text) ||
|
161
|
+
/[\u3002.\u3001,\u30fb:;\u2010\u301c\u30a0\u2013!?\u203c\u2047\u2048\u2049]/.match?(text) and return false
|
162
|
+
true
|
163
|
+
end
|
164
|
+
end
|
165
|
+
end
|
@@ -2,3 +2,24 @@ text: text
|
|
2
2
|
at: at
|
3
3
|
binary_and: "%1 <conn>and</conn> %2"
|
4
4
|
multiple_and: "%1<conn>, and</conn> %2"
|
5
|
+
punct:
|
6
|
+
colon: ":"
|
7
|
+
comma: ","
|
8
|
+
enum_comma: ","
|
9
|
+
semicolon: ";"
|
10
|
+
period: "."
|
11
|
+
close_paren: ")"
|
12
|
+
open_paren: "("
|
13
|
+
close_bracket: "]"
|
14
|
+
open_bracket: "["
|
15
|
+
question_mark: "?"
|
16
|
+
exclamation_mark: "!"
|
17
|
+
em_dash: "—"
|
18
|
+
en_dash: "–"
|
19
|
+
number_en_dash: "–"
|
20
|
+
open_quote: "“"
|
21
|
+
close_quote: "”"
|
22
|
+
open_nested_quote: "‘" # U+2018 (opening mark); U+2019 is close_nested_quote
|
23
|
+
close_nested_quote: "’"
|
24
|
+
ellipse: …
|
25
|
+
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: isodoc-i18n
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-10-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: base64
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: htmlentities
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -67,13 +81,13 @@ dependencies:
|
|
67
81
|
- !ruby/object:Gem::Version
|
68
82
|
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
84
|
+
name: canon
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
72
86
|
requirements:
|
73
87
|
- - ">="
|
74
88
|
- !ruby/object:Gem::Version
|
75
89
|
version: '0'
|
76
|
-
type: :
|
90
|
+
type: :development
|
77
91
|
prerelease: false
|
78
92
|
version_requirements: !ruby/object:Gem::Requirement
|
79
93
|
requirements:
|
@@ -170,58 +184,58 @@ dependencies:
|
|
170
184
|
requirements:
|
171
185
|
- - "~>"
|
172
186
|
- !ruby/object:Gem::Version
|
173
|
-
version: 1
|
187
|
+
version: '1'
|
174
188
|
type: :development
|
175
189
|
prerelease: false
|
176
190
|
version_requirements: !ruby/object:Gem::Requirement
|
177
191
|
requirements:
|
178
192
|
- - "~>"
|
179
193
|
- !ruby/object:Gem::Version
|
180
|
-
version: 1
|
194
|
+
version: '1'
|
181
195
|
- !ruby/object:Gem::Dependency
|
182
|
-
name:
|
196
|
+
name: rubocop-performance
|
183
197
|
requirement: !ruby/object:Gem::Requirement
|
184
198
|
requirements:
|
185
|
-
- - "
|
199
|
+
- - ">="
|
186
200
|
- !ruby/object:Gem::Version
|
187
|
-
version: '0
|
201
|
+
version: '0'
|
188
202
|
type: :development
|
189
203
|
prerelease: false
|
190
204
|
version_requirements: !ruby/object:Gem::Requirement
|
191
205
|
requirements:
|
192
|
-
- - "
|
206
|
+
- - ">="
|
193
207
|
- !ruby/object:Gem::Version
|
194
|
-
version: '0
|
208
|
+
version: '0'
|
195
209
|
- !ruby/object:Gem::Dependency
|
196
|
-
name:
|
210
|
+
name: simplecov
|
197
211
|
requirement: !ruby/object:Gem::Requirement
|
198
212
|
requirements:
|
199
213
|
- - "~>"
|
200
214
|
- !ruby/object:Gem::Version
|
201
|
-
version: '0.
|
215
|
+
version: '0.15'
|
202
216
|
type: :development
|
203
217
|
prerelease: false
|
204
218
|
version_requirements: !ruby/object:Gem::Requirement
|
205
219
|
requirements:
|
206
220
|
- - "~>"
|
207
221
|
- !ruby/object:Gem::Version
|
208
|
-
version: '0.
|
222
|
+
version: '0.15'
|
209
223
|
- !ruby/object:Gem::Dependency
|
210
|
-
name:
|
224
|
+
name: timecop
|
211
225
|
requirement: !ruby/object:Gem::Requirement
|
212
226
|
requirements:
|
213
|
-
- - "
|
227
|
+
- - "~>"
|
214
228
|
- !ruby/object:Gem::Version
|
215
|
-
version: '0'
|
229
|
+
version: '0.9'
|
216
230
|
type: :development
|
217
231
|
prerelease: false
|
218
232
|
version_requirements: !ruby/object:Gem::Requirement
|
219
233
|
requirements:
|
220
|
-
- - "
|
234
|
+
- - "~>"
|
221
235
|
- !ruby/object:Gem::Version
|
222
|
-
version: '0'
|
236
|
+
version: '0.9'
|
223
237
|
- !ruby/object:Gem::Dependency
|
224
|
-
name:
|
238
|
+
name: webmock
|
225
239
|
requirement: !ruby/object:Gem::Requirement
|
226
240
|
requirements:
|
227
241
|
- - ">="
|
@@ -257,6 +271,7 @@ files:
|
|
257
271
|
- lib/isodoc/i18n.rb
|
258
272
|
- lib/isodoc/i18n/version.rb
|
259
273
|
- lib/isodoc/l10n.rb
|
274
|
+
- lib/isodoc/l10n_cjk.rb
|
260
275
|
- lib/isodoc/liquid/liquid.rb
|
261
276
|
homepage: https://github.com/metanorma/isodoc-i18n
|
262
277
|
licenses:
|
@@ -277,7 +292,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
277
292
|
- !ruby/object:Gem::Version
|
278
293
|
version: '0'
|
279
294
|
requirements: []
|
280
|
-
rubygems_version: 3.
|
295
|
+
rubygems_version: 3.5.22
|
281
296
|
signing_key:
|
282
297
|
specification_version: 4
|
283
298
|
summary: isodoc-i18n
|