isodoc-i18n 1.2.2 → 1.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/isodoc-i18n.gemspec +1 -0
- data/lib/isodoc/i18n/version.rb +1 -1
- data/lib/isodoc/i18n.rb +1 -0
- data/lib/isodoc/l10n.rb +86 -39
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d5373a1bd009370c01efe97c54aa373ad71eb520628aabc98207257b84b7a52a
|
4
|
+
data.tar.gz: 9c97a3e50fa228d20f890aaa16f94ae90efb722ccf9a6e96a62e36f819759745
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e7abebde7c6d4630a4f6f1916d746b5833c3f02c894d5401d9ccba720aae1c699639563066ed493107b6554ae78433c046a1e5ecce496b021cb6613242ef81c4
|
7
|
+
data.tar.gz: 40d2f238053553382f1ff63eac7d579402e5e296fa18f4f38fe20cae94854ac1d58e658b5d41fe05f039375d6dd4f25cf3bfbedb1c548c699d90dc4a880d7d15
|
data/isodoc-i18n.gemspec
CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_dependency "liquid", "~> 5"
|
27
27
|
spec.add_dependency "metanorma-utils", ">= 1.7.0"
|
28
28
|
spec.add_dependency "twitter_cldr"
|
29
|
+
spec.add_dependency "base64"
|
29
30
|
|
30
31
|
spec.add_development_dependency "debug"
|
31
32
|
spec.add_development_dependency "equivalent-xml", "~> 0.6"
|
data/lib/isodoc/i18n/version.rb
CHANGED
data/lib/isodoc/i18n.rb
CHANGED
data/lib/isodoc/l10n.rb
CHANGED
@@ -32,71 +32,118 @@ module IsoDoc
|
|
32
32
|
# CJK
|
33
33
|
def l10n_zh(text, script = "Hans")
|
34
34
|
xml = Nokogiri::XML::DocumentFragment.parse(text)
|
35
|
-
xml.
|
36
|
-
|
37
|
-
|
35
|
+
t = xml.xpath(".//text()")
|
36
|
+
t.each_with_index do |n, i|
|
37
|
+
prev, foll = l10n_context(t, i)
|
38
|
+
text = cleanup_entities(n.text, is_xml: false)
|
39
|
+
n.replace(l10_zh1(text, prev, foll, script))
|
38
40
|
end
|
39
41
|
xml.to_xml(encoding: "UTF-8").gsub(/<b>/, "").gsub("</b>", "")
|
40
42
|
.gsub(/<\?[^>]+>/, "")
|
41
43
|
end
|
42
44
|
|
45
|
+
# previous, following context of current text node:
|
46
|
+
# do not use just the immediately adjoining text tokens for context
|
47
|
+
# deal with spaces and empty text by just concatenating entire context
|
48
|
+
def l10n_context(nodes, idx)
|
49
|
+
prev = nodes[0...idx].map(&:text).join
|
50
|
+
foll = nodes[(idx + 1)...(nodes.size)].map(&:text).join
|
51
|
+
[prev, foll]
|
52
|
+
end
|
53
|
+
|
43
54
|
def l10n_fr(text, locale)
|
44
55
|
xml = Nokogiri::XML::DocumentFragment.parse(text)
|
45
|
-
xml.
|
46
|
-
|
47
|
-
|
48
|
-
|
56
|
+
t = xml.xpath(".//text()")
|
57
|
+
t.each_with_index do |n, i|
|
58
|
+
prev, foll = l10n_context(t, i)
|
59
|
+
text = cleanup_entities(n.text, is_xml: false)
|
60
|
+
n.replace(l10n_fr1(text, prev, foll, locale))
|
49
61
|
end
|
50
62
|
xml.to_xml(encoding: "UTF-8")
|
51
63
|
end
|
52
64
|
|
53
|
-
ZH_CHAR = "\\p{Han}|\\p{In CJK Symbols And Punctuation}|" \
|
54
|
-
"\\p{In Halfwidth And Fullwidth Forms}".freeze
|
65
|
+
ZH_CHAR = "(\\p{Han}|\\p{In CJK Symbols And Punctuation}|" \
|
66
|
+
"\\p{In Halfwidth And Fullwidth Forms})".freeze
|
55
67
|
|
56
68
|
# note: we can't differentiate comma from enumeration comma 、
|
57
|
-
def l10_zh1(text, _script)
|
58
|
-
|
69
|
+
# def l10_zh1(text, _script)
|
70
|
+
def l10_zh1(text, prev, foll, _script)
|
71
|
+
# l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
|
72
|
+
r = l10n_zh_punct(text, prev, foll)
|
73
|
+
r = l10n_zh_remove_space(r, prev, foll)
|
74
|
+
l10n_zh_dash(r, prev, foll)
|
59
75
|
end
|
60
76
|
|
77
|
+
ZH1_PUNCT = /(#{ZH_CHAR}|^) # CJK character, or start of string
|
78
|
+
(\s*)$ # Latin spaces optional
|
79
|
+
/xo.freeze
|
80
|
+
ZH2_PUNCT = /^\s* # followed by ignorable Latin spaces
|
81
|
+
[:,.()\[\];?!-]* # Latin punct which will also convert to CJK
|
82
|
+
(#{ZH_CHAR}|$) # CJK character, or end of string
|
83
|
+
/xo.freeze
|
84
|
+
|
61
85
|
# CJK punct if (^|CJK).($|CJK)
|
62
|
-
def l10n_zh_punct(text)
|
86
|
+
def l10n_zh_punct(text, prev, foll)
|
63
87
|
["::", ",,", "..", "))", "]]", ";;", "??", "!!", "((", "[["].each do |m|
|
64
|
-
text = text
|
65
|
-
|
66
|
-
#{Regexp.quote(m[0])} # Latin punctuation we want to convert to CJK
|
67
|
-
(?= \s* # followed (lookahead) by ignorable Latin spaces
|
68
|
-
[:,.()\[\];?!-]* # Latin punctuation which we will also convert to CJK
|
69
|
-
(#{ZH_CHAR}|$) # CJK character, or end of string
|
70
|
-
) /x, "\\1#{m[1]}")
|
88
|
+
text = l10n_gsub(text, prev, foll, [m[0], m[1]],
|
89
|
+
[ZH1_PUNCT, ZH2_PUNCT])
|
71
90
|
end
|
72
91
|
text
|
73
92
|
end
|
74
93
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
(#{ZH_CHAR}|$)
|
81
|
-
|
94
|
+
ZH1_DASH = /(#{ZH_CHAR}|^) # CJK character, or start of string
|
95
|
+
(\d*) # optional digits
|
96
|
+
$/xo.freeze
|
97
|
+
|
98
|
+
ZH2_DASH = /^\d* # followed by optional digits
|
99
|
+
(#{ZH_CHAR}|$) # CJK character, or end of string
|
100
|
+
/xo.freeze
|
101
|
+
|
102
|
+
def l10n_zh_dash(text, prev, foll)
|
103
|
+
l10n_gsub(text, prev, foll, %w(– ~), [ZH1_DASH, ZH2_DASH])
|
104
|
+
end
|
105
|
+
|
106
|
+
def l10n_gsub(text, prev, foll, delim, regex)
|
107
|
+
context = l10n_gsub_context(text, prev, foll, delim) or return text
|
108
|
+
(1...(context.size - 1)).each do |i|
|
109
|
+
l10_context_valid?(context, i, delim, regex) and
|
110
|
+
context[i] = delim[1].gsub("\\0", context[i]) # Full-width equivalent
|
111
|
+
end
|
112
|
+
context[1...(context.size - 1)].join
|
113
|
+
end
|
114
|
+
|
115
|
+
def l10n_gsub_context(text, prev, foll, delim)
|
116
|
+
d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
|
117
|
+
context = text.split(/(#{d})/) # delim to replace
|
118
|
+
context.size == 1 and return
|
119
|
+
[prev, context, foll].flatten
|
120
|
+
end
|
121
|
+
|
122
|
+
def l10_context_valid?(context, idx, delim, regex)
|
123
|
+
found_delim = if delim[0].is_a?(Regexp) # punct to convert
|
124
|
+
delim[0].match?(context[idx])
|
125
|
+
else
|
126
|
+
context[idx] == delim[0]
|
127
|
+
end
|
128
|
+
found_delim &&
|
129
|
+
regex[0].match?(context[0...idx].join) && # preceding context
|
130
|
+
regex[1].match?(context[(idx + 1)..-1].join) # foll context
|
82
131
|
end
|
83
132
|
|
84
|
-
def l10n_zh_remove_space(text)
|
85
|
-
text
|
86
|
-
|
87
|
-
|
88
|
-
|
133
|
+
def l10n_zh_remove_space(text, prev, foll)
|
134
|
+
text = l10n_gsub(text, prev, foll, [" ", ""],
|
135
|
+
[/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o])
|
136
|
+
l10n_gsub(text, prev, foll, [" ", ""],
|
137
|
+
[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o])
|
89
138
|
end
|
90
139
|
|
91
|
-
def l10n_fr1(text, locale)
|
92
|
-
text = text
|
93
|
-
|
94
|
-
text = text
|
95
|
-
text = text.gsub(/([«‹])/, "\\1\u202f")
|
140
|
+
def l10n_fr1(text, prev, foll, locale)
|
141
|
+
text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
|
142
|
+
[/\p{Alnum}$/, /^(\s|$)/])
|
143
|
+
text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"], [/$/, /^./])
|
96
144
|
colonsp = locale == "CH" ? "\u202f" : "\u00a0"
|
97
|
-
text
|
98
|
-
|
99
|
-
text.gsub(/^(:\s)/, "#{colonsp}\\1")
|
145
|
+
l10n_gsub(text, prev, foll, [":", "#{colonsp}\\0"],
|
146
|
+
[/\p{Alnum}$/, /^(\s|$)/])
|
100
147
|
end
|
101
148
|
|
102
149
|
def self.cjk_extend(text)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: isodoc-i18n
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.2
|
4
|
+
version: 1.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-11-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: htmlentities
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: base64
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: debug
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|