isodoc-i18n 1.2.1 → 1.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/isodoc-i18n.gemspec +2 -0
- data/lib/isodoc/i18n/version.rb +1 -1
- data/lib/isodoc/i18n.rb +1 -0
- data/lib/isodoc/l10n.rb +92 -32
- metadata +31 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d1cb9e9bc5f9e053a31ca971936606b5248577c9b001d3c858d8ecaed201ced8
|
4
|
+
data.tar.gz: 0ea1ee1c8b6913c3a708d63eee6f6f0af5f6ba171a65b04a7792063bd117a339
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a40fba88473c09f93b3eaf3da323c22c6974e9a50a2a287d4235ba5a757852a38a553765e9fdf6289d0dd97721e4c03660e390d06ece34a6f082b154377c041b
|
7
|
+
data.tar.gz: c8e0127577dda72be74e84f9e211169b682817f12ba38507d98c81ac5da649b5ae92d3a4d29f1d63b38ec244c4dbc44a23cf95eb0043773d4e6ae3b59309bd1c
|
data/isodoc-i18n.gemspec
CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_dependency "liquid", "~> 5"
|
27
27
|
spec.add_dependency "metanorma-utils", ">= 1.7.0"
|
28
28
|
spec.add_dependency "twitter_cldr"
|
29
|
+
spec.add_dependency "base64"
|
29
30
|
|
30
31
|
spec.add_development_dependency "debug"
|
31
32
|
spec.add_development_dependency "equivalent-xml", "~> 0.6"
|
@@ -37,5 +38,6 @@ Gem::Specification.new do |spec|
|
|
37
38
|
spec.add_development_dependency "simplecov", "~> 0.15"
|
38
39
|
spec.add_development_dependency "timecop", "~> 0.9"
|
39
40
|
spec.add_development_dependency "webmock"
|
41
|
+
spec.add_development_dependency "xml-c14n"
|
40
42
|
# spec.metadata["rubygems_mfa_required"] = "true"
|
41
43
|
end
|
data/lib/isodoc/i18n/version.rb
CHANGED
data/lib/isodoc/i18n.rb
CHANGED
data/lib/isodoc/l10n.rb
CHANGED
@@ -5,9 +5,8 @@ module IsoDoc
|
|
5
5
|
end
|
6
6
|
|
7
7
|
# function localising spaces and punctuation.
|
8
|
-
# Not clear if period needs to be localised for zh
|
9
8
|
def l10n(text, lang = @lang, script = @script, locale = @locale)
|
10
|
-
|
9
|
+
%w(zh ja ko).include?(lang) and text = l10n_zh(text, script)
|
11
10
|
lang == "fr" && text = l10n_fr(text, locale || "FR")
|
12
11
|
bidiwrap(text, lang, script)
|
13
12
|
end
|
@@ -30,23 +29,35 @@ module IsoDoc
|
|
30
29
|
.default_script(@lang))]
|
31
30
|
end
|
32
31
|
|
32
|
+
# CJK
|
33
33
|
def l10n_zh(text, script = "Hans")
|
34
34
|
xml = Nokogiri::XML::DocumentFragment.parse(text)
|
35
|
-
xml.
|
36
|
-
|
37
|
-
|
38
|
-
|
35
|
+
t = xml.xpath(".//text()")
|
36
|
+
t.each_with_index do |n, i|
|
37
|
+
prev, foll = l10n_context(t, i)
|
38
|
+
text = cleanup_entities(n.text, is_xml: false)
|
39
|
+
n.replace(l10_zh1(text, prev, foll, script))
|
39
40
|
end
|
40
41
|
xml.to_xml(encoding: "UTF-8").gsub(/<b>/, "").gsub("</b>", "")
|
41
42
|
.gsub(/<\?[^>]+>/, "")
|
42
43
|
end
|
43
44
|
|
45
|
+
# previous, following context of current text node:
|
46
|
+
# do not use just the immediately adjoining text tokens for context
|
47
|
+
# deal with spaces and empty text by just concatenating entire context
|
48
|
+
def l10n_context(nodes, idx)
|
49
|
+
prev = nodes[0...idx].map(&:text).join
|
50
|
+
foll = nodes[(idx + 1)...(nodes.size)].map(&:text).join
|
51
|
+
[prev, foll]
|
52
|
+
end
|
53
|
+
|
44
54
|
def l10n_fr(text, locale)
|
45
55
|
xml = Nokogiri::XML::DocumentFragment.parse(text)
|
46
|
-
xml.
|
47
|
-
|
48
|
-
|
49
|
-
|
56
|
+
t = xml.xpath(".//text()")
|
57
|
+
t.each_with_index do |n, i|
|
58
|
+
prev, foll = l10n_context(t, i)
|
59
|
+
text = cleanup_entities(n.text, is_xml: false)
|
60
|
+
n.replace(l10n_fr1(text, prev, foll, locale))
|
50
61
|
end
|
51
62
|
xml.to_xml(encoding: "UTF-8")
|
52
63
|
end
|
@@ -55,36 +66,84 @@ module IsoDoc
|
|
55
66
|
"\\p{In Halfwidth And Fullwidth Forms}".freeze
|
56
67
|
|
57
68
|
# note: we can't differentiate comma from enumeration comma 、
|
58
|
-
def l10_zh1(text, _script)
|
59
|
-
|
69
|
+
# def l10_zh1(text, _script)
|
70
|
+
def l10_zh1(text, prev, foll, _script)
|
71
|
+
# l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
|
72
|
+
r = l10n_zh_punct(text, prev, foll)
|
73
|
+
r = l10n_zh_remove_space(r, prev, foll)
|
74
|
+
l10n_zh_dash(r, prev, foll)
|
60
75
|
end
|
61
76
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
77
|
+
ZH1_PUNCT = /(#{ZH_CHAR}|^) # CJK character, or start of string
|
78
|
+
(\s*)$ # Latin spaces optional
|
79
|
+
/xo.freeze
|
80
|
+
ZH2_PUNCT = /^\s* # followed by ignorable Latin spaces
|
81
|
+
[:,.()\[\];?!-]* # Latin punct which will also convert to CJK
|
82
|
+
(#{ZH_CHAR}|$) # CJK character, or end of string
|
83
|
+
/xo.freeze
|
84
|
+
|
85
|
+
# CJK punct if (^|CJK).($|CJK)
|
86
|
+
def l10n_zh_punct(text, prev, foll)
|
87
|
+
["::", ",,", "..", "))", "]]", ";;", "??", "!!", "((", "[["].each do |m|
|
88
|
+
text = l10n_gsub(text, prev, foll, [m[0], m[1]],
|
89
|
+
[ZH1_PUNCT, ZH2_PUNCT])
|
68
90
|
end
|
69
91
|
text
|
70
92
|
end
|
71
93
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
94
|
+
ZH1_DASH = /(#{ZH_CHAR}|^) # CJK character, or start of string
|
95
|
+
(\d*) # optional digits
|
96
|
+
$/xo.freeze
|
97
|
+
|
98
|
+
ZH2_DASH = /^\d* # followed by optional digits
|
99
|
+
(#{ZH_CHAR}|$) # CJK character, or end of string
|
100
|
+
/xo.freeze
|
101
|
+
|
102
|
+
def l10n_zh_dash(text, prev, foll)
|
103
|
+
l10n_gsub(text, prev, foll, %w(– ~), [ZH1_DASH, ZH2_DASH])
|
104
|
+
end
|
105
|
+
|
106
|
+
def l10n_gsub(text, prev, foll, delim, regex)
|
107
|
+
context = l10n_gsub_context(text, prev, foll, delim) or return text
|
108
|
+
(1...(context.size - 1)).each do |i|
|
109
|
+
l10_context_valid?(context, i, delim, regex) and
|
110
|
+
context[i] = delim[1].gsub("\\0", context[i]) # Full-width equivalent
|
111
|
+
end
|
112
|
+
context[1...(context.size - 1)].join
|
113
|
+
end
|
114
|
+
|
115
|
+
def l10n_gsub_context(text, prev, foll, delim)
|
116
|
+
d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
|
117
|
+
context = text.split(/(#{d})/) # delim to replace
|
118
|
+
context.size == 1 and return
|
119
|
+
[prev, context, foll].flatten
|
120
|
+
end
|
121
|
+
|
122
|
+
def l10_context_valid?(context, idx, delim, regex)
|
123
|
+
found_delim = if delim[0].is_a?(Regexp) # punct to convert
|
124
|
+
delim[0].match?(context[idx])
|
125
|
+
else
|
126
|
+
context[idx] == delim[0]
|
127
|
+
end
|
128
|
+
found_delim &&
|
129
|
+
regex[0].match?(context[0...idx].join) && # preceding context
|
130
|
+
regex[1].match?(context[(idx + 1)..-1].join) # foll context
|
131
|
+
end
|
132
|
+
|
133
|
+
def l10n_zh_remove_space(text, prev, foll)
|
134
|
+
text = l10n_gsub(text, prev, foll, [" ", ""],
|
135
|
+
[/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o])
|
136
|
+
l10n_gsub(text, prev, foll, [" ", ""],
|
137
|
+
[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o])
|
77
138
|
end
|
78
139
|
|
79
|
-
def l10n_fr1(text, locale)
|
80
|
-
text = text
|
81
|
-
|
82
|
-
text = text
|
83
|
-
text = text.gsub(/([«‹])/, "\\1\u202f")
|
140
|
+
def l10n_fr1(text, prev, foll, locale)
|
141
|
+
text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
|
142
|
+
[/\p{Alnum}$/, /^(\s|$)/])
|
143
|
+
text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"], [/$/, /^./])
|
84
144
|
colonsp = locale == "CH" ? "\u202f" : "\u00a0"
|
85
|
-
text
|
86
|
-
|
87
|
-
text.gsub(/^(:\s)/, "#{colonsp}\\1")
|
145
|
+
l10n_gsub(text, prev, foll, [":", "#{colonsp}\\0"],
|
146
|
+
[/\p{Alnum}$/, /^(\s|$)/])
|
88
147
|
end
|
89
148
|
|
90
149
|
def self.cjk_extend(text)
|
@@ -102,7 +161,8 @@ module IsoDoc
|
|
102
161
|
|
103
162
|
def interleave_space_cjk?(text)
|
104
163
|
text.size == 2 or return
|
105
|
-
["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
|
164
|
+
["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
|
165
|
+
"\u22ef\u22ef"].include?(text) ||
|
106
166
|
/\d\d|\p{Latin}\p{Latin}|[[:space:]]/.match?(text) ||
|
107
167
|
/^[\u2018\u201c(\u3014\[{\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/.match?(text) ||
|
108
168
|
/[\u2019\u201d)\u3015\]}\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/.match?(text) ||
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: isodoc-i18n
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.1
|
4
|
+
version: 1.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-11-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: htmlentities
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: base64
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: debug
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -206,6 +220,20 @@ dependencies:
|
|
206
220
|
- - ">="
|
207
221
|
- !ruby/object:Gem::Version
|
208
222
|
version: '0'
|
223
|
+
- !ruby/object:Gem::Dependency
|
224
|
+
name: xml-c14n
|
225
|
+
requirement: !ruby/object:Gem::Requirement
|
226
|
+
requirements:
|
227
|
+
- - ">="
|
228
|
+
- !ruby/object:Gem::Version
|
229
|
+
version: '0'
|
230
|
+
type: :development
|
231
|
+
prerelease: false
|
232
|
+
version_requirements: !ruby/object:Gem::Requirement
|
233
|
+
requirements:
|
234
|
+
- - ">="
|
235
|
+
- !ruby/object:Gem::Version
|
236
|
+
version: '0'
|
209
237
|
description: 'Internationalisation for Metanorma rendering
|
210
238
|
|
211
239
|
'
|
@@ -249,7 +277,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
249
277
|
- !ruby/object:Gem::Version
|
250
278
|
version: '0'
|
251
279
|
requirements: []
|
252
|
-
rubygems_version: 3.3.
|
280
|
+
rubygems_version: 3.3.27
|
253
281
|
signing_key:
|
254
282
|
specification_version: 4
|
255
283
|
summary: isodoc-i18n
|