isodoc-i18n 1.2.2 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 490a22f13264a470afa34644c450651239bb43409aa8e579bae72229d165fe38
4
- data.tar.gz: 9be5c331d23f732e37e2588ff3269e867231f56b6b074417258f5e01fa55aeba
3
+ metadata.gz: d1cb9e9bc5f9e053a31ca971936606b5248577c9b001d3c858d8ecaed201ced8
4
+ data.tar.gz: 0ea1ee1c8b6913c3a708d63eee6f6f0af5f6ba171a65b04a7792063bd117a339
5
5
  SHA512:
6
- metadata.gz: 024f04dedc8bdef757f52d1ae35b69af155d786bb5e1075b6be8c11e2a2039b0bf9ee4e39a0bbca260e3edee85aafb70360240286a38ce12cb41fef88db70e02
7
- data.tar.gz: 6952b1cf007e02b5e7fcb9ca3507cda68e22e27facee436fafc41daaa65b6b5eed3b5e0b9fbbde57d8c10361ab55dcc870a25b92a0f624597b02bb3a912c7586
6
+ metadata.gz: a40fba88473c09f93b3eaf3da323c22c6974e9a50a2a287d4235ba5a757852a38a553765e9fdf6289d0dd97721e4c03660e390d06ece34a6f082b154377c041b
7
+ data.tar.gz: c8e0127577dda72be74e84f9e211169b682817f12ba38507d98c81ac5da649b5ae92d3a4d29f1d63b38ec244c4dbc44a23cf95eb0043773d4e6ae3b59309bd1c
data/isodoc-i18n.gemspec CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
26
26
  spec.add_dependency "liquid", "~> 5"
27
27
  spec.add_dependency "metanorma-utils", ">= 1.7.0"
28
28
  spec.add_dependency "twitter_cldr"
29
+ spec.add_dependency "base64"
29
30
 
30
31
  spec.add_development_dependency "debug"
31
32
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
data/lib/isodoc/i18n/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  module IsoDoc
2
2
  class I18n
3
- VERSION = "1.2.2".freeze
3
+ VERSION = "1.2.3".freeze
4
4
  end
5
5
  end
data/lib/isodoc/i18n.rb CHANGED
@@ -6,6 +6,7 @@ require_relative "l10n"
6
6
  require_relative "liquid/liquid"
7
7
  require "liquid"
8
8
  require_relative "i18n/version"
9
+ require "base64"
9
10
 
10
11
  module IsoDoc
11
12
  class I18n
data/lib/isodoc/l10n.rb CHANGED
@@ -32,20 +32,32 @@ module IsoDoc
32
32
  # CJK
33
33
  def l10n_zh(text, script = "Hans")
34
34
  xml = Nokogiri::XML::DocumentFragment.parse(text)
35
- xml.traverse do |n|
36
- n.text? or next
37
- n.replace(l10_zh1(cleanup_entities(n.text, is_xml: false), script))
35
+ t = xml.xpath(".//text()")
36
+ t.each_with_index do |n, i|
37
+ prev, foll = l10n_context(t, i)
38
+ text = cleanup_entities(n.text, is_xml: false)
39
+ n.replace(l10_zh1(text, prev, foll, script))
38
40
  end
39
41
  xml.to_xml(encoding: "UTF-8").gsub(/<b>/, "").gsub("</b>", "")
40
42
  .gsub(/<\?[^>]+>/, "")
41
43
  end
42
44
 
45
+ # previous, following context of current text node:
46
+ # do not use just the immediately adjoining text tokens for context
47
+ # deal with spaces and empty text by just concatenating entire context
48
+ def l10n_context(nodes, idx)
49
+ prev = nodes[0...idx].map(&:text).join
50
+ foll = nodes[(idx + 1)...(nodes.size)].map(&:text).join
51
+ [prev, foll]
52
+ end
53
+
43
54
  def l10n_fr(text, locale)
44
55
  xml = Nokogiri::XML::DocumentFragment.parse(text)
45
- xml.traverse do |n|
46
- next unless n.text?
47
-
48
- n.replace(l10n_fr1(cleanup_entities(n.text, is_xml: false), locale))
56
+ t = xml.xpath(".//text()")
57
+ t.each_with_index do |n, i|
58
+ prev, foll = l10n_context(t, i)
59
+ text = cleanup_entities(n.text, is_xml: false)
60
+ n.replace(l10n_fr1(text, prev, foll, locale))
49
61
  end
50
62
  xml.to_xml(encoding: "UTF-8")
51
63
  end
@@ -54,49 +66,84 @@ module IsoDoc
54
66
  "\\p{In Halfwidth And Fullwidth Forms}".freeze
55
67
 
56
68
  # note: we can't differentiate comma from enumeration comma 、
57
- def l10_zh1(text, _script)
58
- l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
69
+ # def l10_zh1(text, _script)
70
+ def l10_zh1(text, prev, foll, _script)
71
+ # l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
72
+ r = l10n_zh_punct(text, prev, foll)
73
+ r = l10n_zh_remove_space(r, prev, foll)
74
+ l10n_zh_dash(r, prev, foll)
59
75
  end
60
76
 
77
+ ZH1_PUNCT = /(#{ZH_CHAR}|^) # CJK character, or start of string
78
+ (\s*)$ # Latin spaces optional
79
+ /xo.freeze
80
+ ZH2_PUNCT = /^\s* # followed by ignorable Latin spaces
81
+ [:,.()\[\];?!-]* # Latin punct which will also convert to CJK
82
+ (#{ZH_CHAR}|$) # CJK character, or end of string
83
+ /xo.freeze
84
+
61
85
  # CJK punct if (^|CJK).($|CJK)
62
- def l10n_zh_punct(text)
86
+ def l10n_zh_punct(text, prev, foll)
63
87
  [":：", ",，", ".．", ")）", "]］", ";；", "?？", "!！", "(（", "[［"].each do |m|
64
- text = text.gsub(/(?<=#{ZH_CHAR}|^) # CJK character, or start of string
65
- (\s*) # Latin spaces optional
66
- #{Regexp.quote(m[0])} # Latin punctuation we want to convert to CJK
67
- (?= \s* # followed (lookahead) by ignorable Latin spaces
68
- [:,.()\[\];?!-]* # Latin punctuation which we will also convert to CJK
69
- (#{ZH_CHAR}|$) # CJK character, or end of string
70
- ) /x, "\\1#{m[1]}")
88
+ text = l10n_gsub(text, prev, foll, [m[0], m[1]],
89
+ [ZH1_PUNCT, ZH2_PUNCT])
71
90
  end
72
91
  text
73
92
  end
74
93
 
75
- def l10n_zh_dash(text)
76
- text.gsub(/(?<=#{ZH_CHAR}|^) # CJK character, or start of string
77
- (\d*) # optional digits
78
- – # en-dash
79
- (\d*) # optional digits
80
- (#{ZH_CHAR}|$) # CJK character, or end of string
81
- /xo, "\\1~\\2\\3")
94
+ ZH1_DASH = /(#{ZH_CHAR}|^) # CJK character, or start of string
95
+ (\d*) # optional digits
96
+ $/xo.freeze
97
+
98
+ ZH2_DASH = /^\d* # followed by optional digits
99
+ (#{ZH_CHAR}|$) # CJK character, or end of string
100
+ /xo.freeze
101
+
102
+ def l10n_zh_dash(text, prev, foll)
103
+ l10n_gsub(text, prev, foll, %w(– ~), [ZH1_DASH, ZH2_DASH])
104
+ end
105
+
106
+ def l10n_gsub(text, prev, foll, delim, regex)
107
+ context = l10n_gsub_context(text, prev, foll, delim) or return text
108
+ (1...(context.size - 1)).each do |i|
109
+ l10_context_valid?(context, i, delim, regex) and
110
+ context[i] = delim[1].gsub("\\0", context[i]) # Full-width equivalent
111
+ end
112
+ context[1...(context.size - 1)].join
113
+ end
114
+
115
+ def l10n_gsub_context(text, prev, foll, delim)
116
+ d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
117
+ context = text.split(/(#{d})/) # delim to replace
118
+ context.size == 1 and return
119
+ [prev, context, foll].flatten
120
+ end
121
+
122
+ def l10_context_valid?(context, idx, delim, regex)
123
+ found_delim = if delim[0].is_a?(Regexp) # punct to convert
124
+ delim[0].match?(context[idx])
125
+ else
126
+ context[idx] == delim[0]
127
+ end
128
+ found_delim &&
129
+ regex[0].match?(context[0...idx].join) && # preceding context
130
+ regex[1].match?(context[(idx + 1)..-1].join) # foll context
82
131
  end
83
132
 
84
- def l10n_zh_remove_space(text)
85
- text.gsub(/(?<=#{ZH_CHAR}) (?=#{ZH_CHAR})/o, "")
86
- .gsub(/(?<=\d) (?=#{ZH_CHAR})/o, "")
87
- .gsub(/(?<=#{ZH_CHAR}) (?=\d)/o, "")
88
- .gsub(/(?<=#{ZH_CHAR}) (?=[A-Za-z](#{ZH_CHAR}|$))/o, "")
133
+ def l10n_zh_remove_space(text, prev, foll)
134
+ text = l10n_gsub(text, prev, foll, [" ", ""],
135
+ [/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o])
136
+ l10n_gsub(text, prev, foll, [" ", ""],
137
+ [/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o])
89
138
  end
90
139
 
91
- def l10n_fr1(text, locale)
92
- text = text.gsub(/(?<=\p{Alnum})([»›;?!])(?=\s)/, "\u202f\\1")
93
- text = text.gsub(/(?<=\p{Alnum})([»›;?!])$/, "\u202f\\1")
94
- text = text.gsub(/^([»›;?!])/, "\u202f\\1")
95
- text = text.gsub(/([«‹])/, "\\1\u202f")
140
+ def l10n_fr1(text, prev, foll, locale)
141
+ text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
142
+ [/\p{Alnum}$/, /^(\s|$)/])
143
+ text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"], [/$/, /^./])
96
144
  colonsp = locale == "CH" ? "\u202f" : "\u00a0"
97
- text = text.gsub(/(?<=\p{Alnum})(:)(?=\s)/, "#{colonsp}\\1")
98
- text = text.gsub(/(?<=\p{Alnum})(:)$/, "#{colonsp}\\1")
99
- text.gsub(/^(:\s)/, "#{colonsp}\\1")
145
+ l10n_gsub(text, prev, foll, [":", "#{colonsp}\\0"],
146
+ [/\p{Alnum}$/, /^(\s|$)/])
100
147
  end
101
148
 
102
149
  def self.cjk_extend(text)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isodoc-i18n
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.2
4
+ version: 1.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-25 00:00:00.000000000 Z
11
+ date: 2024-11-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: base64
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: debug
71
85
  requirement: !ruby/object:Gem::Requirement