isodoc-i18n 1.2.2 → 1.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 490a22f13264a470afa34644c450651239bb43409aa8e579bae72229d165fe38
4
- data.tar.gz: 9be5c331d23f732e37e2588ff3269e867231f56b6b074417258f5e01fa55aeba
3
+ metadata.gz: d5373a1bd009370c01efe97c54aa373ad71eb520628aabc98207257b84b7a52a
4
+ data.tar.gz: 9c97a3e50fa228d20f890aaa16f94ae90efb722ccf9a6e96a62e36f819759745
5
5
  SHA512:
6
- metadata.gz: 024f04dedc8bdef757f52d1ae35b69af155d786bb5e1075b6be8c11e2a2039b0bf9ee4e39a0bbca260e3edee85aafb70360240286a38ce12cb41fef88db70e02
7
- data.tar.gz: 6952b1cf007e02b5e7fcb9ca3507cda68e22e27facee436fafc41daaa65b6b5eed3b5e0b9fbbde57d8c10361ab55dcc870a25b92a0f624597b02bb3a912c7586
6
+ metadata.gz: e7abebde7c6d4630a4f6f1916d746b5833c3f02c894d5401d9ccba720aae1c699639563066ed493107b6554ae78433c046a1e5ecce496b021cb6613242ef81c4
7
+ data.tar.gz: 40d2f238053553382f1ff63eac7d579402e5e296fa18f4f38fe20cae94854ac1d58e658b5d41fe05f039375d6dd4f25cf3bfbedb1c548c699d90dc4a880d7d15
data/isodoc-i18n.gemspec CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
26
26
  spec.add_dependency "liquid", "~> 5"
27
27
  spec.add_dependency "metanorma-utils", ">= 1.7.0"
28
28
  spec.add_dependency "twitter_cldr"
29
+ spec.add_dependency "base64"
29
30
 
30
31
  spec.add_development_dependency "debug"
31
32
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
@@ -1,5 +1,5 @@
1
1
  module IsoDoc
2
2
  class I18n
3
- VERSION = "1.2.2".freeze
3
+ VERSION = "1.2.4".freeze
4
4
  end
5
5
  end
data/lib/isodoc/i18n.rb CHANGED
@@ -6,6 +6,7 @@ require_relative "l10n"
6
6
  require_relative "liquid/liquid"
7
7
  require "liquid"
8
8
  require_relative "i18n/version"
9
+ require "base64"
9
10
 
10
11
  module IsoDoc
11
12
  class I18n
data/lib/isodoc/l10n.rb CHANGED
@@ -32,71 +32,118 @@ module IsoDoc
32
32
  # CJK
33
33
  def l10n_zh(text, script = "Hans")
34
34
  xml = Nokogiri::XML::DocumentFragment.parse(text)
35
- xml.traverse do |n|
36
- n.text? or next
37
- n.replace(l10_zh1(cleanup_entities(n.text, is_xml: false), script))
35
+ t = xml.xpath(".//text()")
36
+ t.each_with_index do |n, i|
37
+ prev, foll = l10n_context(t, i)
38
+ text = cleanup_entities(n.text, is_xml: false)
39
+ n.replace(l10_zh1(text, prev, foll, script))
38
40
  end
39
41
  xml.to_xml(encoding: "UTF-8").gsub(/<b>/, "").gsub("</b>", "")
40
42
  .gsub(/<\?[^>]+>/, "")
41
43
  end
42
44
 
45
+ # previous, following context of current text node:
46
+ # do not use just the immediately adjoining text tokens for context
47
+ # deal with spaces and empty text by just concatenating entire context
48
+ def l10n_context(nodes, idx)
49
+ prev = nodes[0...idx].map(&:text).join
50
+ foll = nodes[(idx + 1)...(nodes.size)].map(&:text).join
51
+ [prev, foll]
52
+ end
53
+
43
54
  def l10n_fr(text, locale)
44
55
  xml = Nokogiri::XML::DocumentFragment.parse(text)
45
- xml.traverse do |n|
46
- next unless n.text?
47
-
48
- n.replace(l10n_fr1(cleanup_entities(n.text, is_xml: false), locale))
56
+ t = xml.xpath(".//text()")
57
+ t.each_with_index do |n, i|
58
+ prev, foll = l10n_context(t, i)
59
+ text = cleanup_entities(n.text, is_xml: false)
60
+ n.replace(l10n_fr1(text, prev, foll, locale))
49
61
  end
50
62
  xml.to_xml(encoding: "UTF-8")
51
63
  end
52
64
 
53
- ZH_CHAR = "\\p{Han}|\\p{In CJK Symbols And Punctuation}|" \
54
- "\\p{In Halfwidth And Fullwidth Forms}".freeze
65
+ ZH_CHAR = "(\\p{Han}|\\p{In CJK Symbols And Punctuation}|" \
66
+ "\\p{In Halfwidth And Fullwidth Forms})".freeze
55
67
 
56
68
  # note: we can't differentiate comma from enumeration comma 、
57
- def l10_zh1(text, _script)
58
- l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
69
+ # def l10_zh1(text, _script)
70
+ def l10_zh1(text, prev, foll, _script)
71
+ # l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
72
+ r = l10n_zh_punct(text, prev, foll)
73
+ r = l10n_zh_remove_space(r, prev, foll)
74
+ l10n_zh_dash(r, prev, foll)
59
75
  end
60
76
 
77
+ ZH1_PUNCT = /(#{ZH_CHAR}|^) # CJK character, or start of string
78
+ (\s*)$ # Latin spaces optional
79
+ /xo.freeze
80
+ ZH2_PUNCT = /^\s* # followed by ignorable Latin spaces
81
+ [:,.()\[\];?!-]* # Latin punct which will also convert to CJK
82
+ (#{ZH_CHAR}|$) # CJK character, or end of string
83
+ /xo.freeze
84
+
61
85
  # CJK punct if (^|CJK).($|CJK)
62
- def l10n_zh_punct(text)
86
+ def l10n_zh_punct(text, prev, foll)
63
87
  ["::", ",,", "..", "))", "]]", ";;", "??", "!!", "((", "[["].each do |m|
64
- text = text.gsub(/(?<=#{ZH_CHAR}|^) # CJK character, or start of string
65
- (\s*) # Latin spaces optional
66
- #{Regexp.quote(m[0])} # Latin punctuation we want to convert to CJK
67
- (?= \s* # followed (lookahead) by ignorable Latin spaces
68
- [:,.()\[\];?!-]* # Latin punctuation which we will also convert to CJK
69
- (#{ZH_CHAR}|$) # CJK character, or end of string
70
- ) /x, "\\1#{m[1]}")
88
+ text = l10n_gsub(text, prev, foll, [m[0], m[1]],
89
+ [ZH1_PUNCT, ZH2_PUNCT])
71
90
  end
72
91
  text
73
92
  end
74
93
 
75
- def l10n_zh_dash(text)
76
- text.gsub(/(?<=#{ZH_CHAR}|^) # CJK character, or start of string
77
- (\d*) # optional digits
78
- – # en-dash
79
- (\d*) # optional digits
80
- (#{ZH_CHAR}|$) # CJK character, or end of string
81
- /xo, "\\1~\\2\\3")
94
+ ZH1_DASH = /(#{ZH_CHAR}|^) # CJK character, or start of string
95
+ (\d*) # optional digits
96
+ $/xo.freeze
97
+
98
+ ZH2_DASH = /^\d* # followed by optional digits
99
+ (#{ZH_CHAR}|$) # CJK character, or end of string
100
+ /xo.freeze
101
+
102
+ def l10n_zh_dash(text, prev, foll)
103
+ l10n_gsub(text, prev, foll, %w(– ~), [ZH1_DASH, ZH2_DASH])
104
+ end
105
+
106
+ def l10n_gsub(text, prev, foll, delim, regex)
107
+ context = l10n_gsub_context(text, prev, foll, delim) or return text
108
+ (1...(context.size - 1)).each do |i|
109
+ l10_context_valid?(context, i, delim, regex) and
110
+ context[i] = delim[1].gsub("\\0", context[i]) # Full-width equivalent
111
+ end
112
+ context[1...(context.size - 1)].join
113
+ end
114
+
115
+ def l10n_gsub_context(text, prev, foll, delim)
116
+ d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
117
+ context = text.split(/(#{d})/) # delim to replace
118
+ context.size == 1 and return
119
+ [prev, context, foll].flatten
120
+ end
121
+
122
+ def l10_context_valid?(context, idx, delim, regex)
123
+ found_delim = if delim[0].is_a?(Regexp) # punct to convert
124
+ delim[0].match?(context[idx])
125
+ else
126
+ context[idx] == delim[0]
127
+ end
128
+ found_delim &&
129
+ regex[0].match?(context[0...idx].join) && # preceding context
130
+ regex[1].match?(context[(idx + 1)..-1].join) # foll context
82
131
  end
83
132
 
84
- def l10n_zh_remove_space(text)
85
- text.gsub(/(?<=#{ZH_CHAR}) (?=#{ZH_CHAR})/o, "")
86
- .gsub(/(?<=\d) (?=#{ZH_CHAR})/o, "")
87
- .gsub(/(?<=#{ZH_CHAR}) (?=\d)/o, "")
88
- .gsub(/(?<=#{ZH_CHAR}) (?=[A-Za-z](#{ZH_CHAR}|$))/o, "")
133
+ def l10n_zh_remove_space(text, prev, foll)
134
+ text = l10n_gsub(text, prev, foll, [" ", ""],
135
+ [/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o])
136
+ l10n_gsub(text, prev, foll, [" ", ""],
137
+ [/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o])
89
138
  end
90
139
 
91
- def l10n_fr1(text, locale)
92
- text = text.gsub(/(?<=\p{Alnum})([»›;?!])(?=\s)/, "\u202f\\1")
93
- text = text.gsub(/(?<=\p{Alnum})([»›;?!])$/, "\u202f\\1")
94
- text = text.gsub(/^([»›;?!])/, "\u202f\\1")
95
- text = text.gsub(/([«‹])/, "\\1\u202f")
140
+ def l10n_fr1(text, prev, foll, locale)
141
+ text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
142
+ [/\p{Alnum}$/, /^(\s|$)/])
143
+ text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"], [/$/, /^./])
96
144
  colonsp = locale == "CH" ? "\u202f" : "\u00a0"
97
- text = text.gsub(/(?<=\p{Alnum})(:)(?=\s)/, "#{colonsp}\\1")
98
- text = text.gsub(/(?<=\p{Alnum})(:)$/, "#{colonsp}\\1")
99
- text.gsub(/^(:\s)/, "#{colonsp}\\1")
145
+ l10n_gsub(text, prev, foll, [":", "#{colonsp}\\0"],
146
+ [/\p{Alnum}$/, /^(\s|$)/])
100
147
  end
101
148
 
102
149
  def self.cjk_extend(text)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isodoc-i18n
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.2
4
+ version: 1.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-25 00:00:00.000000000 Z
11
+ date: 2024-11-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: base64
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: debug
71
85
  requirement: !ruby/object:Gem::Requirement