isodoc-i18n 1.2.2 → 1.2.3
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/isodoc-i18n.gemspec +1 -0
- data/lib/isodoc/i18n/version.rb +1 -1
- data/lib/isodoc/i18n.rb +1 -0
- data/lib/isodoc/l10n.rb +84 -37
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d1cb9e9bc5f9e053a31ca971936606b5248577c9b001d3c858d8ecaed201ced8
|
4
|
+
data.tar.gz: 0ea1ee1c8b6913c3a708d63eee6f6f0af5f6ba171a65b04a7792063bd117a339
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a40fba88473c09f93b3eaf3da323c22c6974e9a50a2a287d4235ba5a757852a38a553765e9fdf6289d0dd97721e4c03660e390d06ece34a6f082b154377c041b
|
7
|
+
data.tar.gz: c8e0127577dda72be74e84f9e211169b682817f12ba38507d98c81ac5da649b5ae92d3a4d29f1d63b38ec244c4dbc44a23cf95eb0043773d4e6ae3b59309bd1c
|
data/isodoc-i18n.gemspec
CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_dependency "liquid", "~> 5"
|
27
27
|
spec.add_dependency "metanorma-utils", ">= 1.7.0"
|
28
28
|
spec.add_dependency "twitter_cldr"
|
29
|
+
spec.add_dependency "base64"
|
29
30
|
|
30
31
|
spec.add_development_dependency "debug"
|
31
32
|
spec.add_development_dependency "equivalent-xml", "~> 0.6"
|
data/lib/isodoc/i18n/version.rb
CHANGED
data/lib/isodoc/i18n.rb
CHANGED
data/lib/isodoc/l10n.rb
CHANGED
@@ -32,20 +32,32 @@ module IsoDoc
|
|
32
32
|
# CJK
|
33
33
|
def l10n_zh(text, script = "Hans")
|
34
34
|
xml = Nokogiri::XML::DocumentFragment.parse(text)
|
35
|
-
xml.
|
36
|
-
|
37
|
-
|
35
|
+
t = xml.xpath(".//text()")
|
36
|
+
t.each_with_index do |n, i|
|
37
|
+
prev, foll = l10n_context(t, i)
|
38
|
+
text = cleanup_entities(n.text, is_xml: false)
|
39
|
+
n.replace(l10_zh1(text, prev, foll, script))
|
38
40
|
end
|
39
41
|
xml.to_xml(encoding: "UTF-8").gsub(/<b>/, "").gsub("</b>", "")
|
40
42
|
.gsub(/<\?[^>]+>/, "")
|
41
43
|
end
|
42
44
|
|
45
|
+
# previous, following context of current text node:
|
46
|
+
# do not use just the immediately adjoining text tokens for context
|
47
|
+
# deal with spaces and empty text by just concatenating entire context
|
48
|
+
def l10n_context(nodes, idx)
|
49
|
+
prev = nodes[0...idx].map(&:text).join
|
50
|
+
foll = nodes[(idx + 1)...(nodes.size)].map(&:text).join
|
51
|
+
[prev, foll]
|
52
|
+
end
|
53
|
+
|
43
54
|
def l10n_fr(text, locale)
|
44
55
|
xml = Nokogiri::XML::DocumentFragment.parse(text)
|
45
|
-
xml.
|
46
|
-
|
47
|
-
|
48
|
-
|
56
|
+
t = xml.xpath(".//text()")
|
57
|
+
t.each_with_index do |n, i|
|
58
|
+
prev, foll = l10n_context(t, i)
|
59
|
+
text = cleanup_entities(n.text, is_xml: false)
|
60
|
+
n.replace(l10n_fr1(text, prev, foll, locale))
|
49
61
|
end
|
50
62
|
xml.to_xml(encoding: "UTF-8")
|
51
63
|
end
|
@@ -54,49 +66,84 @@ module IsoDoc
|
|
54
66
|
"\\p{In Halfwidth And Fullwidth Forms}".freeze
|
55
67
|
|
56
68
|
# note: we can't differentiate comma from enumeration comma 、
|
57
|
-
def l10_zh1(text, _script)
|
58
|
-
|
69
|
+
# def l10_zh1(text, _script)
|
70
|
+
def l10_zh1(text, prev, foll, _script)
|
71
|
+
# l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
|
72
|
+
r = l10n_zh_punct(text, prev, foll)
|
73
|
+
r = l10n_zh_remove_space(r, prev, foll)
|
74
|
+
l10n_zh_dash(r, prev, foll)
|
59
75
|
end
|
60
76
|
|
77
|
+
ZH1_PUNCT = /(#{ZH_CHAR}|^) # CJK character, or start of string
|
78
|
+
(\s*)$ # Latin spaces optional
|
79
|
+
/xo.freeze
|
80
|
+
ZH2_PUNCT = /^\s* # followed by ignorable Latin spaces
|
81
|
+
[:,.()\[\];?!-]* # Latin punct which will also convert to CJK
|
82
|
+
(#{ZH_CHAR}|$) # CJK character, or end of string
|
83
|
+
/xo.freeze
|
84
|
+
|
61
85
|
# CJK punct if (^|CJK).($|CJK)
|
62
|
-
def l10n_zh_punct(text)
|
86
|
+
def l10n_zh_punct(text, prev, foll)
|
63
87
|
["::", ",,", "..", "))", "]]", ";;", "??", "!!", "((", "[["].each do |m|
|
64
|
-
text = text
|
65
|
-
|
66
|
-
#{Regexp.quote(m[0])} # Latin punctuation we want to convert to CJK
|
67
|
-
(?= \s* # followed (lookahead) by ignorable Latin spaces
|
68
|
-
[:,.()\[\];?!-]* # Latin punctuation which we will also convert to CJK
|
69
|
-
(#{ZH_CHAR}|$) # CJK character, or end of string
|
70
|
-
) /x, "\\1#{m[1]}")
|
88
|
+
text = l10n_gsub(text, prev, foll, [m[0], m[1]],
|
89
|
+
[ZH1_PUNCT, ZH2_PUNCT])
|
71
90
|
end
|
72
91
|
text
|
73
92
|
end
|
74
93
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
(#{ZH_CHAR}|$)
|
81
|
-
|
94
|
+
ZH1_DASH = /(#{ZH_CHAR}|^) # CJK character, or start of string
|
95
|
+
(\d*) # optional digits
|
96
|
+
$/xo.freeze
|
97
|
+
|
98
|
+
ZH2_DASH = /^\d* # followed by optional digits
|
99
|
+
(#{ZH_CHAR}|$) # CJK character, or end of string
|
100
|
+
/xo.freeze
|
101
|
+
|
102
|
+
def l10n_zh_dash(text, prev, foll)
|
103
|
+
l10n_gsub(text, prev, foll, %w(– ~), [ZH1_DASH, ZH2_DASH])
|
104
|
+
end
|
105
|
+
|
106
|
+
def l10n_gsub(text, prev, foll, delim, regex)
|
107
|
+
context = l10n_gsub_context(text, prev, foll, delim) or return text
|
108
|
+
(1...(context.size - 1)).each do |i|
|
109
|
+
l10_context_valid?(context, i, delim, regex) and
|
110
|
+
context[i] = delim[1].gsub("\\0", context[i]) # Full-width equivalent
|
111
|
+
end
|
112
|
+
context[1...(context.size - 1)].join
|
113
|
+
end
|
114
|
+
|
115
|
+
def l10n_gsub_context(text, prev, foll, delim)
|
116
|
+
d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
|
117
|
+
context = text.split(/(#{d})/) # delim to replace
|
118
|
+
context.size == 1 and return
|
119
|
+
[prev, context, foll].flatten
|
120
|
+
end
|
121
|
+
|
122
|
+
def l10_context_valid?(context, idx, delim, regex)
|
123
|
+
found_delim = if delim[0].is_a?(Regexp) # punct to convert
|
124
|
+
delim[0].match?(context[idx])
|
125
|
+
else
|
126
|
+
context[idx] == delim[0]
|
127
|
+
end
|
128
|
+
found_delim &&
|
129
|
+
regex[0].match?(context[0...idx].join) && # preceding context
|
130
|
+
regex[1].match?(context[(idx + 1)..-1].join) # foll context
|
82
131
|
end
|
83
132
|
|
84
|
-
def l10n_zh_remove_space(text)
|
85
|
-
text
|
86
|
-
|
87
|
-
|
88
|
-
|
133
|
+
def l10n_zh_remove_space(text, prev, foll)
|
134
|
+
text = l10n_gsub(text, prev, foll, [" ", ""],
|
135
|
+
[/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o])
|
136
|
+
l10n_gsub(text, prev, foll, [" ", ""],
|
137
|
+
[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o])
|
89
138
|
end
|
90
139
|
|
91
|
-
def l10n_fr1(text, locale)
|
92
|
-
text = text
|
93
|
-
|
94
|
-
text = text
|
95
|
-
text = text.gsub(/([«‹])/, "\\1\u202f")
|
140
|
+
def l10n_fr1(text, prev, foll, locale)
|
141
|
+
text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
|
142
|
+
[/\p{Alnum}$/, /^(\s|$)/])
|
143
|
+
text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"], [/$/, /^./])
|
96
144
|
colonsp = locale == "CH" ? "\u202f" : "\u00a0"
|
97
|
-
text
|
98
|
-
|
99
|
-
text.gsub(/^(:\s)/, "#{colonsp}\\1")
|
145
|
+
l10n_gsub(text, prev, foll, [":", "#{colonsp}\\0"],
|
146
|
+
[/\p{Alnum}$/, /^(\s|$)/])
|
100
147
|
end
|
101
148
|
|
102
149
|
def self.cjk_extend(text)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: isodoc-i18n
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.2
|
4
|
+
version: 1.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-11-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: htmlentities
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: base64
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: debug
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|