isodoc-i18n 1.4.0 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/isodoc-i18n.gemspec +3 -3
- data/lib/isodoc/i18n/version.rb +1 -1
- data/lib/isodoc/i18n-yaml.rb +79 -6
- data/lib/isodoc/i18n.rb +4 -2
- data/lib/isodoc/l10n.rb +39 -113
- data/lib/isodoc/l10n_cjk.rb +165 -0
- data/lib/isodoc-yaml/i18n-en.yaml +21 -0
- metadata +23 -22
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 02e65f54740c83c1d9698623c18711219680d6541f88e2cd0aaa62d3595c103d
|
|
4
|
+
data.tar.gz: f57b0e7bf29a2d02576dda04b3d7b34ded97a42ec382ebdbc9caefb12a1aa17b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b0a869f0df1ef445e5f18d336d0d3f18968e7055444e81535d7da4219a7c24f4e370ea45e0033d1ed2766d8fc5ea2c577f74ea07c232fc6217ff236aacc65e8e
|
|
7
|
+
data.tar.gz: 801f9ccb9c81c372c77834ad3667b88491f0aa1429c588e1a8c1854fc5ca8f1a817607de4bd0c44ed1d02f2a3984d5c336714692cff31c860d6c14a658502e9b
|
data/isodoc-i18n.gemspec
CHANGED
|
@@ -22,12 +22,13 @@ Gem::Specification.new do |spec|
|
|
|
22
22
|
end
|
|
23
23
|
spec.required_ruby_version = Gem::Requirement.new(">= 2.7.0")
|
|
24
24
|
|
|
25
|
+
spec.add_dependency "base64"
|
|
25
26
|
spec.add_dependency "htmlentities", "~> 4.3.4"
|
|
26
27
|
spec.add_dependency "liquid", "~> 5"
|
|
27
28
|
spec.add_dependency "metanorma-utils", ">= 1.7.0"
|
|
28
29
|
spec.add_dependency "twitter_cldr"
|
|
29
|
-
spec.add_dependency "base64"
|
|
30
30
|
|
|
31
|
+
spec.add_development_dependency "canon", "= 0.1.3"
|
|
31
32
|
spec.add_development_dependency "debug"
|
|
32
33
|
spec.add_development_dependency "equivalent-xml", "~> 0.6"
|
|
33
34
|
spec.add_development_dependency "guard", "~> 2.14"
|
|
@@ -35,10 +36,9 @@ Gem::Specification.new do |spec|
|
|
|
35
36
|
spec.add_development_dependency "rake", "~> 13.0"
|
|
36
37
|
spec.add_development_dependency "rspec", "~> 3.6"
|
|
37
38
|
spec.add_development_dependency "rubocop", "~> 1"
|
|
38
|
-
spec.add_development_dependency "rubocop-performance"
|
|
39
|
+
spec.add_development_dependency "rubocop-performance"
|
|
39
40
|
spec.add_development_dependency "simplecov", "~> 0.15"
|
|
40
41
|
spec.add_development_dependency "timecop", "~> 0.9"
|
|
41
42
|
spec.add_development_dependency "webmock"
|
|
42
|
-
spec.add_development_dependency "canon"
|
|
43
43
|
# spec.metadata["rubygems_mfa_required"] = "true"
|
|
44
44
|
end
|
data/lib/isodoc/i18n/version.rb
CHANGED
data/lib/isodoc/i18n-yaml.rb
CHANGED
|
@@ -7,11 +7,80 @@ module IsoDoc
|
|
|
7
7
|
|
|
8
8
|
def load_yaml(lang, script, i18nyaml = nil, i18nhash = nil)
|
|
9
9
|
ret = load_yaml1(lang, script)
|
|
10
|
-
i18nyaml
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
if i18nyaml
|
|
11
|
+
Array(i18nyaml).compact.each do |y|
|
|
12
|
+
ret = ret.deep_merge(YAML.load_file(y))
|
|
13
|
+
end
|
|
14
|
+
return postprocess(ret)
|
|
15
|
+
end
|
|
16
|
+
i18nhash and return postprocess(ret.deep_merge(i18nhash))
|
|
17
|
+
postprocess(ret)
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def postprocess(labels)
|
|
21
|
+
self_reference_resolve(normalise_hash(labels))
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def self_reference_resolve(labels)
|
|
25
|
+
resolve_references(labels, labels)
|
|
26
|
+
end
|
|
13
27
|
|
|
14
|
-
|
|
28
|
+
def resolve_references(obj, labels)
|
|
29
|
+
case obj
|
|
30
|
+
when Hash
|
|
31
|
+
obj.transform_values { |v| resolve_references(v, labels) }
|
|
32
|
+
when Array
|
|
33
|
+
obj.map { |item| resolve_references(item, labels) }
|
|
34
|
+
when String
|
|
35
|
+
resolve_string_references(obj, labels)
|
|
36
|
+
else
|
|
37
|
+
obj
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def resolve_string_references(str, labels)
|
|
42
|
+
# Match patterns like #{self["key"]["subkey"]} or #{self.key.subkey}
|
|
43
|
+
# Allow spaces around the self expression
|
|
44
|
+
str.gsub(/\#\{\s*self([^}]+?)\s*\}/) do |match|
|
|
45
|
+
path_expr = Regexp.last_match(1)
|
|
46
|
+
resolve_path(path_expr, labels, match)
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def resolve_path(path_expr, labels, original_expr)
|
|
51
|
+
segments = parse_path(path_expr)
|
|
52
|
+
current = labels
|
|
53
|
+
|
|
54
|
+
segments.each do |segment|
|
|
55
|
+
case current
|
|
56
|
+
when Hash
|
|
57
|
+
current.key?(segment) or
|
|
58
|
+
raise "Self-reference error: Path '#{original_expr}' not found - key '#{segment}' does not exist"
|
|
59
|
+
current = current[segment]
|
|
60
|
+
when Array
|
|
61
|
+
index = segment.to_i
|
|
62
|
+
segment =~ /^\d+$/ && index >= 0 && index < current.length or
|
|
63
|
+
raise "Self-reference error: Path '#{original_expr}' not found - invalid array index '#{segment}'"
|
|
64
|
+
current = current[index]
|
|
65
|
+
else
|
|
66
|
+
raise "Self-reference error: Path '#{original_expr}' not found - cannot navigate through non-collection type"
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
current.to_s
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def parse_path(path_expr)
|
|
74
|
+
# Split by dots and brackets while preserving the content
|
|
75
|
+
parts = path_expr.sub(/^\./, "").scan(/\.?([\w-]+)|\[([^\]]+)\]/)
|
|
76
|
+
parts.each_with_object([]) do |(dot_part, bracket_part), segments|
|
|
77
|
+
if dot_part
|
|
78
|
+
segments << dot_part
|
|
79
|
+
elsif bracket_part
|
|
80
|
+
segment = bracket_part.strip.gsub(/^["']|["']$/, "")
|
|
81
|
+
segments << segment
|
|
82
|
+
end
|
|
83
|
+
end
|
|
15
84
|
end
|
|
16
85
|
|
|
17
86
|
def normalise_hash(ret)
|
|
@@ -30,8 +99,8 @@ module IsoDoc
|
|
|
30
99
|
def load_yaml1(lang, script)
|
|
31
100
|
case lang
|
|
32
101
|
when "zh"
|
|
33
|
-
if script
|
|
34
|
-
else load_yaml2("
|
|
102
|
+
if script then load_yaml2("zh-#{script}")
|
|
103
|
+
else load_yaml2("zh-Hans")
|
|
35
104
|
end
|
|
36
105
|
else
|
|
37
106
|
load_yaml2(lang)
|
|
@@ -54,5 +123,9 @@ module IsoDoc
|
|
|
54
123
|
def set(key, val)
|
|
55
124
|
@labels[key] = val
|
|
56
125
|
end
|
|
126
|
+
|
|
127
|
+
def merge(new_labels)
|
|
128
|
+
@labels = @labels.deep_merge(new_labels)
|
|
129
|
+
end
|
|
57
130
|
end
|
|
58
131
|
end
|
data/lib/isodoc/i18n.rb
CHANGED
|
@@ -27,7 +27,6 @@ module IsoDoc
|
|
|
27
27
|
CJK_SCRIPTS = %w(Hans Hant Jpan Kore).freeze
|
|
28
28
|
|
|
29
29
|
def liquid_init
|
|
30
|
-
::IsoDoc::I18n::Liquid.set(self)
|
|
31
30
|
::Liquid::Environment.default.register_filter(::IsoDoc::I18n::Liquid)
|
|
32
31
|
end
|
|
33
32
|
|
|
@@ -50,6 +49,7 @@ module IsoDoc
|
|
|
50
49
|
|
|
51
50
|
# populate with variables, Liquid, inflections, ordinals/spellout
|
|
52
51
|
def populate(keys, vars = {})
|
|
52
|
+
::IsoDoc::I18n::Liquid.set(self)
|
|
53
53
|
::Liquid::Template.parse(@labels.dig(*Array(keys)))
|
|
54
54
|
.render(vars.merge("labels" => @labels))
|
|
55
55
|
end
|
|
@@ -68,7 +68,9 @@ module IsoDoc
|
|
|
68
68
|
end
|
|
69
69
|
|
|
70
70
|
def enum_comma
|
|
71
|
-
|
|
71
|
+
c = @labels.dig("punct", "enum-comma")
|
|
72
|
+
c && CJK_SCRIPTS.include?(@script) and
|
|
73
|
+
return "<enum-comma>#{c}</enum-comma>"
|
|
72
74
|
"<enum-comma>,</enum-comma> "
|
|
73
75
|
end
|
|
74
76
|
|
data/lib/isodoc/l10n.rb
CHANGED
|
@@ -1,55 +1,23 @@
|
|
|
1
1
|
require "metanorma-utils"
|
|
2
|
+
require_relative "l10n_cjk"
|
|
2
3
|
|
|
3
4
|
module IsoDoc
|
|
4
5
|
class I18n
|
|
5
|
-
# Use comprehensive CJK definition from metanorma-utils
|
|
6
|
-
# This includes Han, Katakana, Hiragana, Hangul, Bopomofo and all CJK extensions
|
|
7
|
-
ZH_CHAR = "(#{Metanorma::Utils::CJK})".freeze
|
|
8
|
-
LATIN_PUNCT = /[:,.()\[\];?!-]/.freeze
|
|
9
|
-
|
|
10
|
-
# Condition for converting punctuation to double width:
|
|
11
|
-
# 1. (Strict condition) CJK before, CJK after, modulo ignorable characters:
|
|
12
|
-
# 1a. CJK character, or start of string. Latin spaces optional.
|
|
13
|
-
ZH1_PUNCT = /(#{ZH_CHAR}|^)(\s*)$/xo.freeze
|
|
14
|
-
# 1b. Latin spaces optional, Latin punct which will also convert to CJK,
|
|
15
|
-
# CJK character, or end of string.
|
|
16
|
-
ZH2_PUNCT = /^\s*#{LATIN_PUNCT}*(#{ZH_CHAR}|$)/xo.freeze
|
|
17
|
-
# 2. CJK before, space after:
|
|
18
|
-
# 2a. CJK char, followed by optional Latin punct which will also convert to CJK
|
|
19
|
-
ZH1_NO_SPACE = /#{ZH_CHAR}#{LATIN_PUNCT}*$/xo.freeze
|
|
20
|
-
# 2b. optional Latin punct which will also convert to CJK, then space
|
|
21
|
-
OPT_PUNCT_SPACE = /^($|#{LATIN_PUNCT}*\s)/xo.freeze
|
|
22
|
-
|
|
23
|
-
# Contexts for converting en-dashes to full-width
|
|
24
|
-
# Before: CJK or start of string, optional digits
|
|
25
|
-
ZH1_DASH = /(#{ZH_CHAR}|^)(\d*)$/xo.freeze
|
|
26
|
-
# After: optional digits, CJK or end of string
|
|
27
|
-
ZH2_DASH = /^\d*(#{ZH_CHAR}|$)/xo.freeze
|
|
28
|
-
|
|
29
|
-
# Pre-defined punctuation mappings for efficiency
|
|
30
|
-
ZH_PUNCT_MAP = [
|
|
31
|
-
["::", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
|
32
|
-
[",,", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
|
33
|
-
[".。", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
|
34
|
-
["))", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
|
35
|
-
["]]", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
|
36
|
-
[";;", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
|
37
|
-
["??", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
|
38
|
-
["!!", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
|
39
|
-
["((", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]],
|
|
40
|
-
["[[", [[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE], [/(\s|^)$/, /^#{ZH_CHAR}/o]]]
|
|
41
|
-
].freeze
|
|
42
|
-
|
|
43
6
|
def self.l10n(text, lang = @lang, script = @script, options = {})
|
|
44
7
|
l10n(text, lang, script, options)
|
|
45
8
|
end
|
|
46
9
|
|
|
47
10
|
# function localising spaces and punctuation
|
|
48
11
|
# options[:prev] and options[:foll] are optional context strings
|
|
12
|
+
# options[:proportional_mixed_cjk] allows contextual full-width vs
|
|
13
|
+
# half-width punctuation
|
|
49
14
|
def l10n(text, lang = @lang, script = @script, options = {})
|
|
50
15
|
locale = options[:locale] || @locale
|
|
51
|
-
%w(zh ja ko).include?(lang) and
|
|
52
|
-
|
|
16
|
+
%w(zh ja ko).include?(lang) and
|
|
17
|
+
text = l10n_zh(text, script, options)
|
|
18
|
+
lang == "fr" and
|
|
19
|
+
text = l10n_fr(text, locale || "FR", options)
|
|
20
|
+
text&.gsub!(/<esc>|<\/esc>/, "") # Strip esc tags
|
|
53
21
|
bidiwrap(text, lang, script)
|
|
54
22
|
end
|
|
55
23
|
|
|
@@ -71,30 +39,36 @@ module IsoDoc
|
|
|
71
39
|
.default_script(@lang))]
|
|
72
40
|
end
|
|
73
41
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
to_xml(xml).gsub(/<b>|<\/b>|<\?[^>]+>/, "")
|
|
42
|
+
def l10n_prep(text, options)
|
|
43
|
+
xml = Nokogiri::XML::DocumentFragment.parse(text)
|
|
44
|
+
t = xml.xpath(".//text()").reject { |node| node.text.empty? }
|
|
45
|
+
text_cache = build_text_cache(t, options[:prev], options[:foll])
|
|
46
|
+
|
|
47
|
+
# Find all text nodes within <esc> tags in one XPath query
|
|
48
|
+
# This is O(n) instead of O(n*m) where m is tree depth
|
|
49
|
+
esc_indices = build_esc_indices(xml, t)
|
|
50
|
+
|
|
51
|
+
[t, text_cache, xml, options[:prev], options[:foll], esc_indices]
|
|
85
52
|
end
|
|
86
53
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
54
|
+
# Build set of indices for text nodes within <esc> tags
|
|
55
|
+
# Handles both namespaced and non-namespaced <esc> elements
|
|
56
|
+
def build_esc_indices(xml, text_nodes)
|
|
57
|
+
# Try both non-namespaced and namespace-agnostic queries
|
|
58
|
+
esc_text_nodes = Set.new(xml.xpath(".//esc//text()") +
|
|
59
|
+
xml.xpath(".//*[local-name()='esc']//text()"))
|
|
60
|
+
Set.new.tap do |indices|
|
|
61
|
+
text_nodes.each_with_index do |node, i|
|
|
62
|
+
indices.add(i) if esc_text_nodes.include?(node)
|
|
63
|
+
end
|
|
64
|
+
end
|
|
92
65
|
end
|
|
93
66
|
|
|
94
67
|
# Cache text content once per method call to avoid repeated .text calls
|
|
95
68
|
# Build text cache with optional prepended/appended context
|
|
69
|
+
# Also, reduce multiple spaces to single, to avoid misrecognition of space
|
|
96
70
|
def build_text_cache(text_nodes, prev_context = nil, foll_context = nil)
|
|
97
|
-
text_cache = text_nodes.map(&:text)
|
|
71
|
+
text_cache = text_nodes.map(&:text).map { |x| x.gsub(/\s+/, " ") }
|
|
98
72
|
text_cache.unshift(prev_context) if prev_context
|
|
99
73
|
text_cache.push(foll_context) if foll_context
|
|
100
74
|
text_cache
|
|
@@ -117,9 +91,11 @@ module IsoDoc
|
|
|
117
91
|
[prev, foll]
|
|
118
92
|
end
|
|
119
93
|
|
|
120
|
-
def l10n_fr(text, locale,
|
|
121
|
-
t, text_cache, xml = l10n_prep(text,
|
|
94
|
+
def l10n_fr(text, locale, options)
|
|
95
|
+
t, text_cache, xml, prev, _foll, esc_indices = l10n_prep(text, options)
|
|
122
96
|
t.each_with_index do |n, i|
|
|
97
|
+
next if esc_indices.include?(i) # Skip escaped nodes
|
|
98
|
+
|
|
123
99
|
prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
|
|
124
100
|
text = cleanup_entities(n.text, is_xml: false)
|
|
125
101
|
n.replace(l10n_fr1(text, prev_ctx, foll_ctx, locale))
|
|
@@ -127,35 +103,15 @@ module IsoDoc
|
|
|
127
103
|
to_xml(xml)
|
|
128
104
|
end
|
|
129
105
|
|
|
130
|
-
# note: we can't differentiate comma from enumeration comma 、
|
|
131
|
-
# def l10_zh1(text, _script)
|
|
132
|
-
def l10_zh1(text, prev, foll, _script)
|
|
133
|
-
r = l10n_zh_punct(text, prev, foll)
|
|
134
|
-
r = l10n_zh_remove_space(r, prev, foll)
|
|
135
|
-
l10n_zh_dash(r, prev, foll)
|
|
136
|
-
end
|
|
137
|
-
|
|
138
|
-
def l10n_zh_punct(text, prev, foll)
|
|
139
|
-
# Use pre-defined mapping for better performance
|
|
140
|
-
ZH_PUNCT_MAP.each do |mapping|
|
|
141
|
-
punct_pair, regexes = mapping
|
|
142
|
-
text = l10n_gsub(text, prev, foll, [punct_pair[0], punct_pair[1]], regexes)
|
|
143
|
-
end
|
|
144
|
-
text
|
|
145
|
-
end
|
|
146
|
-
|
|
147
|
-
def l10n_zh_dash(text, prev, foll)
|
|
148
|
-
l10n_gsub(text, prev, foll, %w(– ~), [[ZH1_DASH, ZH2_DASH]])
|
|
149
|
-
end
|
|
150
|
-
|
|
151
106
|
# text: string we are scanning for instances of delim[0] to replace
|
|
152
107
|
# prev: string preceding text, as additional token of context
|
|
153
108
|
# foll: string following text, as additional token of context
|
|
154
109
|
# delim: delim[0] is the symbol we want to replace, delim[1] its replacement
|
|
155
110
|
# regexes: a list of regex pairs: the context before the found token,
|
|
156
111
|
# and the context after the found token, under which replacing it
|
|
157
|
-
# with delim[1] is permitted
|
|
112
|
+
# with delim[1] is permitted. If regex is nil, always allow the replacement
|
|
158
113
|
def l10n_gsub(text, prev, foll, delim, regexes)
|
|
114
|
+
delim[1] or return text
|
|
159
115
|
context = l10n_gsub_context(text, prev, foll, delim) or return text
|
|
160
116
|
(1...(context.size - 1)).each do |i|
|
|
161
117
|
l10_context_valid?(context, i, delim, regexes) and
|
|
@@ -170,11 +126,12 @@ module IsoDoc
|
|
|
170
126
|
d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
|
|
171
127
|
context = text.split(/(#{d})/) # delim to replace
|
|
172
128
|
context.size == 1 and return
|
|
173
|
-
[prev, context, foll].flatten
|
|
129
|
+
[prev, context.reject(&:empty?), foll].flatten
|
|
174
130
|
end
|
|
175
131
|
|
|
176
132
|
def l10_context_valid?(context, idx, delim, regex)
|
|
177
133
|
l10n_context_found_delimiter?(context[idx], delim) or return false
|
|
134
|
+
regex.nil? and return true
|
|
178
135
|
regex.detect do |r|
|
|
179
136
|
r[0].match?(context[0...idx].join) && # preceding context
|
|
180
137
|
r[1].match?(context[(idx + 1)..-1].join) # foll context
|
|
@@ -189,13 +146,6 @@ module IsoDoc
|
|
|
189
146
|
end
|
|
190
147
|
end
|
|
191
148
|
|
|
192
|
-
def l10n_zh_remove_space(text, prev, foll)
|
|
193
|
-
text = l10n_gsub(text, prev, foll, [" ", ""],
|
|
194
|
-
[[/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o]])
|
|
195
|
-
l10n_gsub(text, prev, foll, [" ", ""],
|
|
196
|
-
[[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o]])
|
|
197
|
-
end
|
|
198
|
-
|
|
199
149
|
def l10n_fr1(text, prev, foll, locale)
|
|
200
150
|
text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
|
|
201
151
|
[[/\p{Alnum}$/, /^(\s|$)/]])
|
|
@@ -206,30 +156,6 @@ module IsoDoc
|
|
|
206
156
|
[[/\p{Alnum}$/, /^(\s|$)/]])
|
|
207
157
|
end
|
|
208
158
|
|
|
209
|
-
def self.cjk_extend(text)
|
|
210
|
-
cjk_extend(text)
|
|
211
|
-
end
|
|
212
|
-
|
|
213
|
-
def cjk_extend(title)
|
|
214
|
-
@c.decode(title).chars.map.with_index do |n, i|
|
|
215
|
-
if i.zero? || !interleave_space_cjk?(title[i - 1] + title[i])
|
|
216
|
-
n
|
|
217
|
-
else "\u3000#{n}"
|
|
218
|
-
end
|
|
219
|
-
end.join
|
|
220
|
-
end
|
|
221
|
-
|
|
222
|
-
def interleave_space_cjk?(text)
|
|
223
|
-
text.size == 2 or return
|
|
224
|
-
["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
|
|
225
|
-
"\u22ef\u22ef"].include?(text) ||
|
|
226
|
-
/\d\d|\p{Latin}\p{Latin}|[[:space:]]/.match?(text) ||
|
|
227
|
-
/^[\u2018\u201c(\u3014\[{\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/.match?(text) ||
|
|
228
|
-
/[\u2019\u201d)\u3015\]}\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/.match?(text) ||
|
|
229
|
-
/[\u3002.\u3001,\u30fb:;\u2010\u301c\u30a0\u2013!?\u203c\u2047\u2048\u2049]/.match?(text) and return false
|
|
230
|
-
true
|
|
231
|
-
end
|
|
232
|
-
|
|
233
159
|
def to_xml(node)
|
|
234
160
|
node&.to_xml(encoding: "UTF-8", indent: 0,
|
|
235
161
|
save_with: Nokogiri::XML::Node::SaveOptions::AS_XML)
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
module IsoDoc
|
|
2
|
+
class I18n
|
|
3
|
+
# Use comprehensive CJK definition from metanorma-utils
|
|
4
|
+
# This includes Han, Katakana, Hiragana, Hangul, Bopomofo
|
|
5
|
+
# and all CJK extensions
|
|
6
|
+
ZH_CHAR = "(#{Metanorma::Utils::CJK})".freeze
|
|
7
|
+
LATIN_PUNCT = /[:,.()\[\];?!-]/.freeze
|
|
8
|
+
# CJK character which is not punctuation
|
|
9
|
+
ZH_NON_PUNCT = "(#{
|
|
10
|
+
[
|
|
11
|
+
Metanorma::Utils.singleton_class::HAN,
|
|
12
|
+
Metanorma::Utils.singleton_class::HAN_IDC,
|
|
13
|
+
Metanorma::Utils.singleton_class::KANBUN,
|
|
14
|
+
Metanorma::Utils.singleton_class::CJK_COMPAT_IDEOGRAPHS,
|
|
15
|
+
Metanorma::Utils.singleton_class::HAN_COMPAT_IDEOGRAPHS,
|
|
16
|
+
Metanorma::Utils.singleton_class::HANGUL,
|
|
17
|
+
Metanorma::Utils.singleton_class::HIRAGANA,
|
|
18
|
+
Metanorma::Utils.singleton_class::KATAKANA,
|
|
19
|
+
Metanorma::Utils.singleton_class::BOPOMOFO,
|
|
20
|
+
].join("|")})".freeze
|
|
21
|
+
|
|
22
|
+
# Condition for converting punctuation to double width,
|
|
23
|
+
# in case of options[:proportional_mixed_cjk]
|
|
24
|
+
# 1. (Strict condition) CJK before, CJK after, modulo ignorable characters:
|
|
25
|
+
# 1a. CJK character, or start of string. Latin spaces optional.
|
|
26
|
+
ZH1_PUNCT = /(#{ZH_CHAR}|^)(\s*)$/xo.freeze
|
|
27
|
+
# 1b. Latin spaces optional, Latin punct which will also convert to CJK,
|
|
28
|
+
# CJK character, or end of string.
|
|
29
|
+
ZH2_PUNCT = /^\s*#{LATIN_PUNCT}*(#{ZH_CHAR}|$)/xo.freeze
|
|
30
|
+
# 2. CJK before, space after:
|
|
31
|
+
# 2a. CJK char, followed by optional Latin punct which will also convert to CJK
|
|
32
|
+
ZH1_NO_SPACE = /#{ZH_CHAR}#{LATIN_PUNCT}*$/xo.freeze
|
|
33
|
+
# 2b. optional Latin punct which will also convert to CJK, then space
|
|
34
|
+
OPT_PUNCT_SPACE = /^($|#{LATIN_PUNCT}*\s)/xo.freeze
|
|
35
|
+
|
|
36
|
+
# Chinese numerals (common + formal/financial forms)
|
|
37
|
+
# Explicit characters needed because Chinese numeral ideographs
|
|
38
|
+
# are not tagged with Unicode Number property
|
|
39
|
+
# Using alternation instead of character class to properly include \p{N}
|
|
40
|
+
ZH_NUMERALS = "(?:[零一二三四五六七八九十百千万亿壹贰叁肆伍陆柒捌玖拾佰仟萬億兆]|\\p{N})".freeze
|
|
41
|
+
|
|
42
|
+
# Contexts for converting en-dashes to full-width
|
|
43
|
+
# Before: CJK or start of string, no digits
|
|
44
|
+
ZH1_DASH = /(#{ZH_CHAR}|^)(?<!=#{ZH_NUMERALS})$/xo.freeze
|
|
45
|
+
# After: not starting with a numeral; CJK or end of string
|
|
46
|
+
ZH2_DASH = /^(?!#{ZH_NUMERALS})(#{ZH_CHAR}|$)/xo.freeze
|
|
47
|
+
# Before: context ends with a numeral (Chinese numeral or \p{N})
|
|
48
|
+
ZH1_NUM_DASH = /#{ZH_NUMERALS}$/xo.freeze
|
|
49
|
+
# After: context starts with a numeral (Chinese numeral or \p{N})
|
|
50
|
+
ZH2_NUM_DASH = /^#{ZH_NUMERALS}/xo.freeze
|
|
51
|
+
|
|
52
|
+
ZH_PUNCT_CONTEXTS =
|
|
53
|
+
[[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE],
|
|
54
|
+
[/(\s|^)$/, /^#{ZH_CHAR}/o]].freeze
|
|
55
|
+
|
|
56
|
+
# map of YAML punct keys to auto-text Latin equivalents
|
|
57
|
+
ZH_PUNCT_AUTOTEXT = {
|
|
58
|
+
colon: ":",
|
|
59
|
+
comma: ",",
|
|
60
|
+
# "enum-comma": ",", # enum-comma is ambiguous with comma
|
|
61
|
+
semicolon: ";",
|
|
62
|
+
period: ".",
|
|
63
|
+
"close-paren": ")",
|
|
64
|
+
"open-paren": "(",
|
|
65
|
+
"close-bracket": "]",
|
|
66
|
+
"open-bracket": "[",
|
|
67
|
+
"question-mark": "?",
|
|
68
|
+
"exclamation-mark": "!",
|
|
69
|
+
"em-dash": "—",
|
|
70
|
+
"open-quote": "“",
|
|
71
|
+
"close-quote": "”",
|
|
72
|
+
"open-nested-quote": "’",
|
|
73
|
+
"close-nested-quote": "’",
|
|
74
|
+
ellipse: "…",
|
|
75
|
+
}.freeze
|
|
76
|
+
|
|
77
|
+
# Pre-defined punctuation mappings for efficiency
|
|
78
|
+
def init_zh_punct_map
|
|
79
|
+
ZH_PUNCT_AUTOTEXT.each_with_object([]) do |(k, v), m|
|
|
80
|
+
@labels.dig("punct", k.to_s) or next
|
|
81
|
+
m << [v, @labels["punct"][k.to_s], ZH_PUNCT_CONTEXTS]
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def l10n_zh(text, script, options)
|
|
86
|
+
script ||= "Hans"
|
|
87
|
+
t, text_cache, xml, prev, _foll, esc_indices = l10n_prep(text, options)
|
|
88
|
+
t.each_with_index do |n, i|
|
|
89
|
+
next if esc_indices.include?(i) # Skip escaped nodes
|
|
90
|
+
|
|
91
|
+
# Adjust index if prev context prepended
|
|
92
|
+
prev_ctx, foll_ctx = l10n_context_cached(text_cache, prev ? i + 1 : i)
|
|
93
|
+
text = cleanup_entities(n.text, is_xml: false)
|
|
94
|
+
n.replace(l10_zh1(text, prev_ctx, foll_ctx, script, options))
|
|
95
|
+
end
|
|
96
|
+
to_xml(xml) #.gsub(/<\/?em>|<\/?strong>|<\/?i>|<\/?b>/, "")
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
# note: we can't differentiate comma from enumeration comma 、
|
|
100
|
+
# def l10_zh1(text, _script)
|
|
101
|
+
def l10_zh1(text, prev, foll, _script, options)
|
|
102
|
+
r = l10n_zh_punct(text, prev, foll, options)
|
|
103
|
+
r = l10n_zh_remove_space(r, prev, foll)
|
|
104
|
+
l10n_zh_dash(r, prev, foll)
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
def l10n_zh_punct(text, prev, foll, options)
|
|
108
|
+
# Use pre-defined mapping for better performance
|
|
109
|
+
@zh_punct_map ||= init_zh_punct_map
|
|
110
|
+
@zh_punct_map.each do |mapping|
|
|
111
|
+
punct_from, punct_to, regexes = mapping
|
|
112
|
+
options[:proportional_mixed_cjk] or regexes = nil
|
|
113
|
+
text = l10n_gsub(text, prev, foll, [punct_from, punct_to],
|
|
114
|
+
regexes)
|
|
115
|
+
end
|
|
116
|
+
text
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
def l10n_zh_dash(text, prev, foll)
|
|
120
|
+
text = l10n_gsub(text, prev, foll, ["–", @labels.dig("punct", "en-dash")],
|
|
121
|
+
[[ZH1_DASH, ZH2_DASH]])
|
|
122
|
+
l10n_gsub(text, prev, foll, ["–", @labels.dig("punct", "number-en-dash")],
|
|
123
|
+
[[ZH1_NUM_DASH, ZH2_NUM_DASH]])
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
def l10n_zh_remove_space(text, prev, foll)
|
|
127
|
+
text = l10n_gsub(text, prev, foll, [/\s+/, ""],
|
|
128
|
+
[[/(#{ZH_CHAR})$/o, /^#{ZH_CHAR}/o]])
|
|
129
|
+
if sep = @labels.dig("punct", "cjk-latin-separator")
|
|
130
|
+
# Skip over punctuation to find Latin letters/numbers
|
|
131
|
+
text = l10n_gsub(text, prev, foll, [/\s+/, sep],
|
|
132
|
+
[[/#{ZH_CHAR}$/o, /^\p{P}*[\p{Latin}\p{N}]/o]])
|
|
133
|
+
l10n_gsub(text, prev, foll, [/\s+/, sep],
|
|
134
|
+
[[/[\p{Latin}\p{N}]\p{P}*$/o, /^#{ZH_NON_PUNCT}/o]])
|
|
135
|
+
else
|
|
136
|
+
l10n_gsub(text, prev, foll, [/\s+/, ""],
|
|
137
|
+
[[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o]])
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
def self.cjk_extend(text)
|
|
142
|
+
cjk_extend(text)
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
def cjk_extend(title)
|
|
146
|
+
@c.decode(title).chars.map.with_index do |n, i|
|
|
147
|
+
if i.zero? || !interleave_space_cjk?(title[i - 1] + title[i])
|
|
148
|
+
n
|
|
149
|
+
else "\u3000#{n}"
|
|
150
|
+
end
|
|
151
|
+
end.join
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
def interleave_space_cjk?(text)
|
|
155
|
+
text.size == 2 or return
|
|
156
|
+
["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
|
|
157
|
+
"\u22ef\u22ef"].include?(text) ||
|
|
158
|
+
/\d\d|\p{Latin}\p{Latin}|[[:space:]]/.match?(text) ||
|
|
159
|
+
/^[\u2018\u201c(\u3014\[{\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/.match?(text) ||
|
|
160
|
+
/[\u2019\u201d)\u3015\]}\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/.match?(text) ||
|
|
161
|
+
/[\u3002.\u3001,\u30fb:;\u2010\u301c\u30a0\u2013!?\u203c\u2047\u2048\u2049]/.match?(text) and return false
|
|
162
|
+
true
|
|
163
|
+
end
|
|
164
|
+
end
|
|
165
|
+
end
|
|
@@ -2,3 +2,24 @@ text: text
|
|
|
2
2
|
at: at
|
|
3
3
|
binary_and: "%1 <conn>and</conn> %2"
|
|
4
4
|
multiple_and: "%1<conn>, and</conn> %2"
|
|
5
|
+
punct:
|
|
6
|
+
colon: ":"
|
|
7
|
+
comma: ","
|
|
8
|
+
enum_comma: ","
|
|
9
|
+
semicolon: ";"
|
|
10
|
+
period: "."
|
|
11
|
+
close_paren: ")"
|
|
12
|
+
open_paren: "("
|
|
13
|
+
close_bracket: "]"
|
|
14
|
+
open_bracket: "["
|
|
15
|
+
question_mark: "?"
|
|
16
|
+
exclamation_mark: "!"
|
|
17
|
+
em_dash: "—"
|
|
18
|
+
en_dash: "–"
|
|
19
|
+
number_en_dash: "–"
|
|
20
|
+
open_quote: "“"
|
|
21
|
+
close_quote: "”"
|
|
22
|
+
open_nested_quote: "’"
|
|
23
|
+
close_nested_quote: "’"
|
|
24
|
+
ellipse: …
|
|
25
|
+
|
metadata
CHANGED
|
@@ -1,15 +1,29 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: isodoc-i18n
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.4.
|
|
4
|
+
version: 1.4.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-
|
|
11
|
+
date: 2025-11-03 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: base64
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ">="
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ">="
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0'
|
|
13
27
|
- !ruby/object:Gem::Dependency
|
|
14
28
|
name: htmlentities
|
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -67,19 +81,19 @@ dependencies:
|
|
|
67
81
|
- !ruby/object:Gem::Version
|
|
68
82
|
version: '0'
|
|
69
83
|
- !ruby/object:Gem::Dependency
|
|
70
|
-
name:
|
|
84
|
+
name: canon
|
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
|
72
86
|
requirements:
|
|
73
|
-
- -
|
|
87
|
+
- - '='
|
|
74
88
|
- !ruby/object:Gem::Version
|
|
75
|
-
version:
|
|
76
|
-
type: :
|
|
89
|
+
version: 0.1.3
|
|
90
|
+
type: :development
|
|
77
91
|
prerelease: false
|
|
78
92
|
version_requirements: !ruby/object:Gem::Requirement
|
|
79
93
|
requirements:
|
|
80
|
-
- -
|
|
94
|
+
- - '='
|
|
81
95
|
- !ruby/object:Gem::Version
|
|
82
|
-
version:
|
|
96
|
+
version: 0.1.3
|
|
83
97
|
- !ruby/object:Gem::Dependency
|
|
84
98
|
name: debug
|
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -234,20 +248,6 @@ dependencies:
|
|
|
234
248
|
- - ">="
|
|
235
249
|
- !ruby/object:Gem::Version
|
|
236
250
|
version: '0'
|
|
237
|
-
- !ruby/object:Gem::Dependency
|
|
238
|
-
name: canon
|
|
239
|
-
requirement: !ruby/object:Gem::Requirement
|
|
240
|
-
requirements:
|
|
241
|
-
- - ">="
|
|
242
|
-
- !ruby/object:Gem::Version
|
|
243
|
-
version: '0'
|
|
244
|
-
type: :development
|
|
245
|
-
prerelease: false
|
|
246
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
247
|
-
requirements:
|
|
248
|
-
- - ">="
|
|
249
|
-
- !ruby/object:Gem::Version
|
|
250
|
-
version: '0'
|
|
251
251
|
description: 'Internationalisation for Metanorma rendering
|
|
252
252
|
|
|
253
253
|
'
|
|
@@ -271,6 +271,7 @@ files:
|
|
|
271
271
|
- lib/isodoc/i18n.rb
|
|
272
272
|
- lib/isodoc/i18n/version.rb
|
|
273
273
|
- lib/isodoc/l10n.rb
|
|
274
|
+
- lib/isodoc/l10n_cjk.rb
|
|
274
275
|
- lib/isodoc/liquid/liquid.rb
|
|
275
276
|
homepage: https://github.com/metanorma/isodoc-i18n
|
|
276
277
|
licenses:
|