isodoc-i18n 1.2.1 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7552e0a62364e47c4f8ea7b2810e1ba5d11e39d2a3711792c826412b1aef661f
4
- data.tar.gz: 29d4eef4f2a07bcf6ad1de7ddc229cf7ff9b0ffa8e55c81652d2174165cf2214
3
+ metadata.gz: d1cb9e9bc5f9e053a31ca971936606b5248577c9b001d3c858d8ecaed201ced8
4
+ data.tar.gz: 0ea1ee1c8b6913c3a708d63eee6f6f0af5f6ba171a65b04a7792063bd117a339
5
5
  SHA512:
6
- metadata.gz: 220fddc821f5f00d4f1eecca1f7583b0236254bb0a5141c50d7583f4c725be2b4aca949ffc4f89ed390c01850f2912d86b9562959773efabf1a3df4838e8ec60
7
- data.tar.gz: 59373ef3a3688bf4f382da1e2a5df55b3b7ae1615385bc04504faf9421e9c5e66333ed1859243f0e8a86d4f42724692ab1cd9b8f9f0bd62d4c4e70cceb7e93cd
6
+ metadata.gz: a40fba88473c09f93b3eaf3da323c22c6974e9a50a2a287d4235ba5a757852a38a553765e9fdf6289d0dd97721e4c03660e390d06ece34a6f082b154377c041b
7
+ data.tar.gz: c8e0127577dda72be74e84f9e211169b682817f12ba38507d98c81ac5da649b5ae92d3a4d29f1d63b38ec244c4dbc44a23cf95eb0043773d4e6ae3b59309bd1c
data/isodoc-i18n.gemspec CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
26
26
  spec.add_dependency "liquid", "~> 5"
27
27
  spec.add_dependency "metanorma-utils", ">= 1.7.0"
28
28
  spec.add_dependency "twitter_cldr"
29
+ spec.add_dependency "base64"
29
30
 
30
31
  spec.add_development_dependency "debug"
31
32
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
@@ -37,5 +38,6 @@ Gem::Specification.new do |spec|
37
38
  spec.add_development_dependency "simplecov", "~> 0.15"
38
39
  spec.add_development_dependency "timecop", "~> 0.9"
39
40
  spec.add_development_dependency "webmock"
41
+ spec.add_development_dependency "xml-c14n"
40
42
  # spec.metadata["rubygems_mfa_required"] = "true"
41
43
  end
data/lib/isodoc/i18n/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  module IsoDoc
2
2
  class I18n
3
- VERSION = "1.2.1".freeze
3
+ VERSION = "1.2.3".freeze
4
4
  end
5
5
  end
data/lib/isodoc/i18n.rb CHANGED
@@ -6,6 +6,7 @@ require_relative "l10n"
6
6
  require_relative "liquid/liquid"
7
7
  require "liquid"
8
8
  require_relative "i18n/version"
9
+ require "base64"
9
10
 
10
11
  module IsoDoc
11
12
  class I18n
data/lib/isodoc/l10n.rb CHANGED
@@ -5,9 +5,8 @@ module IsoDoc
5
5
  end
6
6
 
7
7
  # function localising spaces and punctuation.
8
- # Not clear if period needs to be localised for zh
9
8
  def l10n(text, lang = @lang, script = @script, locale = @locale)
10
- lang == "zh" and text = l10n_zh(text, script)
9
+ %w(zh ja ko).include?(lang) and text = l10n_zh(text, script)
11
10
  lang == "fr" && text = l10n_fr(text, locale || "FR")
12
11
  bidiwrap(text, lang, script)
13
12
  end
@@ -30,23 +29,35 @@ module IsoDoc
30
29
  .default_script(@lang))]
31
30
  end
32
31
 
32
+ # CJK
33
33
  def l10n_zh(text, script = "Hans")
34
34
  xml = Nokogiri::XML::DocumentFragment.parse(text)
35
- xml.traverse do |n|
36
- next unless n.text?
37
-
38
- n.replace(l10_zh1(cleanup_entities(n.text, is_xml: false), script))
35
+ t = xml.xpath(".//text()")
36
+ t.each_with_index do |n, i|
37
+ prev, foll = l10n_context(t, i)
38
+ text = cleanup_entities(n.text, is_xml: false)
39
+ n.replace(l10_zh1(text, prev, foll, script))
39
40
  end
40
41
  xml.to_xml(encoding: "UTF-8").gsub(/<b>/, "").gsub("</b>", "")
41
42
  .gsub(/<\?[^>]+>/, "")
42
43
  end
43
44
 
45
+ # previous, following context of current text node:
46
+ # do not use just the immediately adjoining text tokens for context
47
+ # deal with spaces and empty text by just concatenating entire context
48
+ def l10n_context(nodes, idx)
49
+ prev = nodes[0...idx].map(&:text).join
50
+ foll = nodes[(idx + 1)...(nodes.size)].map(&:text).join
51
+ [prev, foll]
52
+ end
53
+
44
54
  def l10n_fr(text, locale)
45
55
  xml = Nokogiri::XML::DocumentFragment.parse(text)
46
- xml.traverse do |n|
47
- next unless n.text?
48
-
49
- n.replace(l10n_fr1(cleanup_entities(n.text, is_xml: false), locale))
56
+ t = xml.xpath(".//text()")
57
+ t.each_with_index do |n, i|
58
+ prev, foll = l10n_context(t, i)
59
+ text = cleanup_entities(n.text, is_xml: false)
60
+ n.replace(l10n_fr1(text, prev, foll, locale))
50
61
  end
51
62
  xml.to_xml(encoding: "UTF-8")
52
63
  end
@@ -55,36 +66,84 @@ module IsoDoc
55
66
  "\\p{In Halfwidth And Fullwidth Forms}".freeze
56
67
 
57
68
  # note: we can't differentiate comma from enumeration comma 、
58
- def l10_zh1(text, _script)
59
- l10n_zh_remove_space(l10n_zh_punct(text))
69
+ # def l10_zh1(text, _script)
70
+ def l10_zh1(text, prev, foll, _script)
71
+ # l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
72
+ r = l10n_zh_punct(text, prev, foll)
73
+ r = l10n_zh_remove_space(r, prev, foll)
74
+ l10n_zh_dash(r, prev, foll)
60
75
  end
61
76
 
62
- def l10n_zh_punct(text)
63
- ["::", ",,", "..", "))", "]]", "::", ";;", "??", "!!", "–~"].each do |m|
64
- text = text.gsub(/#{Regexp.quote m[0]}/, m[1])
65
- end
66
- ["((", "[["].each do |m|
67
- text = text.gsub(/#{Regexp.quote m[0]}/, m[1])
77
+ ZH1_PUNCT = /(#{ZH_CHAR}|^) # CJK character, or start of string
78
+ (\s*)$ # Latin spaces optional
79
+ /xo.freeze
80
+ ZH2_PUNCT = /^\s* # followed by ignorable Latin spaces
81
+ [:,.()\[\];?!-]* # Latin punct which will also convert to CJK
82
+ (#{ZH_CHAR}|$) # CJK character, or end of string
83
+ /xo.freeze
84
+
85
+ # CJK punct if (^|CJK).($|CJK)
86
+ def l10n_zh_punct(text, prev, foll)
87
+ ["::", ",,", "..", "))", "]]", ";;", "??", "!!", "((", "[["].each do |m|
88
+ text = l10n_gsub(text, prev, foll, [m[0], m[1]],
89
+ [ZH1_PUNCT, ZH2_PUNCT])
68
90
  end
69
91
  text
70
92
  end
71
93
 
72
- def l10n_zh_remove_space(text)
73
- text.gsub(/(?<=#{ZH_CHAR}) (?=#{ZH_CHAR})/o, "")
74
- .gsub(/(?<=\d) (?=#{ZH_CHAR})/o, "")
75
- .gsub(/(?<=#{ZH_CHAR}) (?=\d)/o, "")
76
- .gsub(/(?<=#{ZH_CHAR}) (?=[A-Za-z](#{ZH_CHAR}|$))/o, "")
94
+ ZH1_DASH = /(#{ZH_CHAR}|^) # CJK character, or start of string
95
+ (\d*) # optional digits
96
+ $/xo.freeze
97
+
98
+ ZH2_DASH = /^\d* # followed by optional digits
99
+ (#{ZH_CHAR}|$) # CJK character, or end of string
100
+ /xo.freeze
101
+
102
+ def l10n_zh_dash(text, prev, foll)
103
+ l10n_gsub(text, prev, foll, %w(– ~), [ZH1_DASH, ZH2_DASH])
104
+ end
105
+
106
+ def l10n_gsub(text, prev, foll, delim, regex)
107
+ context = l10n_gsub_context(text, prev, foll, delim) or return text
108
+ (1...(context.size - 1)).each do |i|
109
+ l10_context_valid?(context, i, delim, regex) and
110
+ context[i] = delim[1].gsub("\\0", context[i]) # Full-width equivalent
111
+ end
112
+ context[1...(context.size - 1)].join
113
+ end
114
+
115
+ def l10n_gsub_context(text, prev, foll, delim)
116
+ d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
117
+ context = text.split(/(#{d})/) # delim to replace
118
+ context.size == 1 and return
119
+ [prev, context, foll].flatten
120
+ end
121
+
122
+ def l10_context_valid?(context, idx, delim, regex)
123
+ found_delim = if delim[0].is_a?(Regexp) # punct to convert
124
+ delim[0].match?(context[idx])
125
+ else
126
+ context[idx] == delim[0]
127
+ end
128
+ found_delim &&
129
+ regex[0].match?(context[0...idx].join) && # preceding context
130
+ regex[1].match?(context[(idx + 1)..-1].join) # foll context
131
+ end
132
+
133
+ def l10n_zh_remove_space(text, prev, foll)
134
+ text = l10n_gsub(text, prev, foll, [" ", ""],
135
+ [/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o])
136
+ l10n_gsub(text, prev, foll, [" ", ""],
137
+ [/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o])
77
138
  end
78
139
 
79
- def l10n_fr1(text, locale)
80
- text = text.gsub(/(?<=\p{Alnum})([»›;?!])(?=\s)/, "\u202f\\1")
81
- text = text.gsub(/(?<=\p{Alnum})([»›;?!])$/, "\u202f\\1")
82
- text = text.gsub(/^([»›;?!])/, "\u202f\\1")
83
- text = text.gsub(/([«‹])/, "\\1\u202f")
140
+ def l10n_fr1(text, prev, foll, locale)
141
+ text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
142
+ [/\p{Alnum}$/, /^(\s|$)/])
143
+ text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"], [/$/, /^./])
84
144
  colonsp = locale == "CH" ? "\u202f" : "\u00a0"
85
- text = text.gsub(/(?<=\p{Alnum})(:)(?=\s)/, "#{colonsp}\\1")
86
- text = text.gsub(/(?<=\p{Alnum})(:)$/, "#{colonsp}\\1")
87
- text.gsub(/^(:\s)/, "#{colonsp}\\1")
145
+ l10n_gsub(text, prev, foll, [":", "#{colonsp}\\0"],
146
+ [/\p{Alnum}$/, /^(\s|$)/])
88
147
  end
89
148
 
90
149
  def self.cjk_extend(text)
@@ -102,7 +161,8 @@ module IsoDoc
102
161
 
103
162
  def interleave_space_cjk?(text)
104
163
  text.size == 2 or return
105
- ["\u2014\u2014", "\u2025\u2025", "\u2026\u2026", "\u22ef\u22ef"].include?(text) ||
164
+ ["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
165
+ "\u22ef\u22ef"].include?(text) ||
106
166
  /\d\d|\p{Latin}\p{Latin}|[[:space:]]/.match?(text) ||
107
167
  /^[\u2018\u201c(\u3014\[{\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/.match?(text) ||
108
168
  /[\u2019\u201d)\u3015\]}\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/.match?(text) ||
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isodoc-i18n
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 1.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-02-19 00:00:00.000000000 Z
11
+ date: 2024-11-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: base64
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: debug
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -206,6 +220,20 @@ dependencies:
206
220
  - - ">="
207
221
  - !ruby/object:Gem::Version
208
222
  version: '0'
223
+ - !ruby/object:Gem::Dependency
224
+ name: xml-c14n
225
+ requirement: !ruby/object:Gem::Requirement
226
+ requirements:
227
+ - - ">="
228
+ - !ruby/object:Gem::Version
229
+ version: '0'
230
+ type: :development
231
+ prerelease: false
232
+ version_requirements: !ruby/object:Gem::Requirement
233
+ requirements:
234
+ - - ">="
235
+ - !ruby/object:Gem::Version
236
+ version: '0'
209
237
  description: 'Internationalisation for Metanorma rendering
210
238
 
211
239
  '
@@ -249,7 +277,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
249
277
  - !ruby/object:Gem::Version
250
278
  version: '0'
251
279
  requirements: []
252
- rubygems_version: 3.3.26
280
+ rubygems_version: 3.3.27
253
281
  signing_key:
254
282
  specification_version: 4
255
283
  summary: isodoc-i18n