isodoc-i18n 1.2.1 → 1.2.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7552e0a62364e47c4f8ea7b2810e1ba5d11e39d2a3711792c826412b1aef661f
4
- data.tar.gz: 29d4eef4f2a07bcf6ad1de7ddc229cf7ff9b0ffa8e55c81652d2174165cf2214
3
+ metadata.gz: d1cb9e9bc5f9e053a31ca971936606b5248577c9b001d3c858d8ecaed201ced8
4
+ data.tar.gz: 0ea1ee1c8b6913c3a708d63eee6f6f0af5f6ba171a65b04a7792063bd117a339
5
5
  SHA512:
6
- metadata.gz: 220fddc821f5f00d4f1eecca1f7583b0236254bb0a5141c50d7583f4c725be2b4aca949ffc4f89ed390c01850f2912d86b9562959773efabf1a3df4838e8ec60
7
- data.tar.gz: 59373ef3a3688bf4f382da1e2a5df55b3b7ae1615385bc04504faf9421e9c5e66333ed1859243f0e8a86d4f42724692ab1cd9b8f9f0bd62d4c4e70cceb7e93cd
6
+ metadata.gz: a40fba88473c09f93b3eaf3da323c22c6974e9a50a2a287d4235ba5a757852a38a553765e9fdf6289d0dd97721e4c03660e390d06ece34a6f082b154377c041b
7
+ data.tar.gz: c8e0127577dda72be74e84f9e211169b682817f12ba38507d98c81ac5da649b5ae92d3a4d29f1d63b38ec244c4dbc44a23cf95eb0043773d4e6ae3b59309bd1c
data/isodoc-i18n.gemspec CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
26
26
  spec.add_dependency "liquid", "~> 5"
27
27
  spec.add_dependency "metanorma-utils", ">= 1.7.0"
28
28
  spec.add_dependency "twitter_cldr"
29
+ spec.add_dependency "base64"
29
30
 
30
31
  spec.add_development_dependency "debug"
31
32
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
@@ -37,5 +38,6 @@ Gem::Specification.new do |spec|
37
38
  spec.add_development_dependency "simplecov", "~> 0.15"
38
39
  spec.add_development_dependency "timecop", "~> 0.9"
39
40
  spec.add_development_dependency "webmock"
41
+ spec.add_development_dependency "xml-c14n"
40
42
  # spec.metadata["rubygems_mfa_required"] = "true"
41
43
  end
@@ -1,5 +1,5 @@
1
1
  module IsoDoc
2
2
  class I18n
3
- VERSION = "1.2.1".freeze
3
+ VERSION = "1.2.3".freeze
4
4
  end
5
5
  end
data/lib/isodoc/i18n.rb CHANGED
@@ -6,6 +6,7 @@ require_relative "l10n"
6
6
  require_relative "liquid/liquid"
7
7
  require "liquid"
8
8
  require_relative "i18n/version"
9
+ require "base64"
9
10
 
10
11
  module IsoDoc
11
12
  class I18n
data/lib/isodoc/l10n.rb CHANGED
@@ -5,9 +5,8 @@ module IsoDoc
5
5
  end
6
6
 
7
7
  # function localising spaces and punctuation.
8
- # Not clear if period needs to be localised for zh
9
8
  def l10n(text, lang = @lang, script = @script, locale = @locale)
10
- lang == "zh" and text = l10n_zh(text, script)
9
+ %w(zh ja ko).include?(lang) and text = l10n_zh(text, script)
11
10
  lang == "fr" && text = l10n_fr(text, locale || "FR")
12
11
  bidiwrap(text, lang, script)
13
12
  end
@@ -30,23 +29,35 @@ module IsoDoc
30
29
  .default_script(@lang))]
31
30
  end
32
31
 
32
+ # CJK
33
33
  def l10n_zh(text, script = "Hans")
34
34
  xml = Nokogiri::XML::DocumentFragment.parse(text)
35
- xml.traverse do |n|
36
- next unless n.text?
37
-
38
- n.replace(l10_zh1(cleanup_entities(n.text, is_xml: false), script))
35
+ t = xml.xpath(".//text()")
36
+ t.each_with_index do |n, i|
37
+ prev, foll = l10n_context(t, i)
38
+ text = cleanup_entities(n.text, is_xml: false)
39
+ n.replace(l10_zh1(text, prev, foll, script))
39
40
  end
40
41
  xml.to_xml(encoding: "UTF-8").gsub(/<b>/, "").gsub("</b>", "")
41
42
  .gsub(/<\?[^>]+>/, "")
42
43
  end
43
44
 
45
+ # previous, following context of current text node:
46
+ # do not use just the immediately adjoining text tokens for context
47
+ # deal with spaces and empty text by just concatenating entire context
48
+ def l10n_context(nodes, idx)
49
+ prev = nodes[0...idx].map(&:text).join
50
+ foll = nodes[(idx + 1)...(nodes.size)].map(&:text).join
51
+ [prev, foll]
52
+ end
53
+
44
54
  def l10n_fr(text, locale)
45
55
  xml = Nokogiri::XML::DocumentFragment.parse(text)
46
- xml.traverse do |n|
47
- next unless n.text?
48
-
49
- n.replace(l10n_fr1(cleanup_entities(n.text, is_xml: false), locale))
56
+ t = xml.xpath(".//text()")
57
+ t.each_with_index do |n, i|
58
+ prev, foll = l10n_context(t, i)
59
+ text = cleanup_entities(n.text, is_xml: false)
60
+ n.replace(l10n_fr1(text, prev, foll, locale))
50
61
  end
51
62
  xml.to_xml(encoding: "UTF-8")
52
63
  end
@@ -55,36 +66,84 @@ module IsoDoc
55
66
  "\\p{In Halfwidth And Fullwidth Forms}".freeze
56
67
 
57
68
  # note: we can't differentiate comma from enumeration comma 、
58
- def l10_zh1(text, _script)
59
- l10n_zh_remove_space(l10n_zh_punct(text))
69
+ # def l10_zh1(text, _script)
70
+ def l10_zh1(text, prev, foll, _script)
71
+ # l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
72
+ r = l10n_zh_punct(text, prev, foll)
73
+ r = l10n_zh_remove_space(r, prev, foll)
74
+ l10n_zh_dash(r, prev, foll)
60
75
  end
61
76
 
62
- def l10n_zh_punct(text)
63
- ["::", ",,", "..", "))", "]]", "::", ";;", "??", "!!", "–~"].each do |m|
64
- text = text.gsub(/#{Regexp.quote m[0]}/, m[1])
65
- end
66
- ["((", "[["].each do |m|
67
- text = text.gsub(/#{Regexp.quote m[0]}/, m[1])
77
+ ZH1_PUNCT = /(#{ZH_CHAR}|^) # CJK character, or start of string
78
+ (\s*)$ # Latin spaces optional
79
+ /xo.freeze
80
+ ZH2_PUNCT = /^\s* # followed by ignorable Latin spaces
81
+ [:,.()\[\];?!-]* # Latin punct which will also convert to CJK
82
+ (#{ZH_CHAR}|$) # CJK character, or end of string
83
+ /xo.freeze
84
+
85
+ # CJK punct if (^|CJK).($|CJK)
86
+ def l10n_zh_punct(text, prev, foll)
87
+ ["::", ",,", "..", "))", "]]", ";;", "??", "!!", "((", "[["].each do |m|
88
+ text = l10n_gsub(text, prev, foll, [m[0], m[1]],
89
+ [ZH1_PUNCT, ZH2_PUNCT])
68
90
  end
69
91
  text
70
92
  end
71
93
 
72
- def l10n_zh_remove_space(text)
73
- text.gsub(/(?<=#{ZH_CHAR}) (?=#{ZH_CHAR})/o, "")
74
- .gsub(/(?<=\d) (?=#{ZH_CHAR})/o, "")
75
- .gsub(/(?<=#{ZH_CHAR}) (?=\d)/o, "")
76
- .gsub(/(?<=#{ZH_CHAR}) (?=[A-Za-z](#{ZH_CHAR}|$))/o, "")
94
+ ZH1_DASH = /(#{ZH_CHAR}|^) # CJK character, or start of string
95
+ (\d*) # optional digits
96
+ $/xo.freeze
97
+
98
+ ZH2_DASH = /^\d* # followed by optional digits
99
+ (#{ZH_CHAR}|$) # CJK character, or end of string
100
+ /xo.freeze
101
+
102
+ def l10n_zh_dash(text, prev, foll)
103
+ l10n_gsub(text, prev, foll, %w(– ~), [ZH1_DASH, ZH2_DASH])
104
+ end
105
+
106
+ def l10n_gsub(text, prev, foll, delim, regex)
107
+ context = l10n_gsub_context(text, prev, foll, delim) or return text
108
+ (1...(context.size - 1)).each do |i|
109
+ l10_context_valid?(context, i, delim, regex) and
110
+ context[i] = delim[1].gsub("\\0", context[i]) # Full-width equivalent
111
+ end
112
+ context[1...(context.size - 1)].join
113
+ end
114
+
115
+ def l10n_gsub_context(text, prev, foll, delim)
116
+ d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
117
+ context = text.split(/(#{d})/) # delim to replace
118
+ context.size == 1 and return
119
+ [prev, context, foll].flatten
120
+ end
121
+
122
+ def l10_context_valid?(context, idx, delim, regex)
123
+ found_delim = if delim[0].is_a?(Regexp) # punct to convert
124
+ delim[0].match?(context[idx])
125
+ else
126
+ context[idx] == delim[0]
127
+ end
128
+ found_delim &&
129
+ regex[0].match?(context[0...idx].join) && # preceding context
130
+ regex[1].match?(context[(idx + 1)..-1].join) # foll context
131
+ end
132
+
133
+ def l10n_zh_remove_space(text, prev, foll)
134
+ text = l10n_gsub(text, prev, foll, [" ", ""],
135
+ [/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o])
136
+ l10n_gsub(text, prev, foll, [" ", ""],
137
+ [/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o])
77
138
  end
78
139
 
79
- def l10n_fr1(text, locale)
80
- text = text.gsub(/(?<=\p{Alnum})([»›;?!])(?=\s)/, "\u202f\\1")
81
- text = text.gsub(/(?<=\p{Alnum})([»›;?!])$/, "\u202f\\1")
82
- text = text.gsub(/^([»›;?!])/, "\u202f\\1")
83
- text = text.gsub(/([«‹])/, "\\1\u202f")
140
+ def l10n_fr1(text, prev, foll, locale)
141
+ text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
142
+ [/\p{Alnum}$/, /^(\s|$)/])
143
+ text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"], [/$/, /^./])
84
144
  colonsp = locale == "CH" ? "\u202f" : "\u00a0"
85
- text = text.gsub(/(?<=\p{Alnum})(:)(?=\s)/, "#{colonsp}\\1")
86
- text = text.gsub(/(?<=\p{Alnum})(:)$/, "#{colonsp}\\1")
87
- text.gsub(/^(:\s)/, "#{colonsp}\\1")
145
+ l10n_gsub(text, prev, foll, [":", "#{colonsp}\\0"],
146
+ [/\p{Alnum}$/, /^(\s|$)/])
88
147
  end
89
148
 
90
149
  def self.cjk_extend(text)
@@ -102,7 +161,8 @@ module IsoDoc
102
161
 
103
162
  def interleave_space_cjk?(text)
104
163
  text.size == 2 or return
105
- ["\u2014\u2014", "\u2025\u2025", "\u2026\u2026", "\u22ef\u22ef"].include?(text) ||
164
+ ["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
165
+ "\u22ef\u22ef"].include?(text) ||
106
166
  /\d\d|\p{Latin}\p{Latin}|[[:space:]]/.match?(text) ||
107
167
  /^[\u2018\u201c(\u3014\[{\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/.match?(text) ||
108
168
  /[\u2019\u201d)\u3015\]}\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/.match?(text) ||
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isodoc-i18n
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 1.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-02-19 00:00:00.000000000 Z
11
+ date: 2024-11-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: base64
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: debug
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -206,6 +220,20 @@ dependencies:
206
220
  - - ">="
207
221
  - !ruby/object:Gem::Version
208
222
  version: '0'
223
+ - !ruby/object:Gem::Dependency
224
+ name: xml-c14n
225
+ requirement: !ruby/object:Gem::Requirement
226
+ requirements:
227
+ - - ">="
228
+ - !ruby/object:Gem::Version
229
+ version: '0'
230
+ type: :development
231
+ prerelease: false
232
+ version_requirements: !ruby/object:Gem::Requirement
233
+ requirements:
234
+ - - ">="
235
+ - !ruby/object:Gem::Version
236
+ version: '0'
209
237
  description: 'Internationalisation for Metanorma rendering
210
238
 
211
239
  '
@@ -249,7 +277,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
249
277
  - !ruby/object:Gem::Version
250
278
  version: '0'
251
279
  requirements: []
252
- rubygems_version: 3.3.26
280
+ rubygems_version: 3.3.27
253
281
  signing_key:
254
282
  specification_version: 4
255
283
  summary: isodoc-i18n