RubyGems - isodoc-i18n - Versions diffs - 1.2.2 → 1.2.4 - Mend

isodoc-i18n 1.2.2 → 1.2.4

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 490a22f13264a470afa34644c450651239bb43409aa8e579bae72229d165fe38
-  data.tar.gz: 9be5c331d23f732e37e2588ff3269e867231f56b6b074417258f5e01fa55aeba
+  metadata.gz: d5373a1bd009370c01efe97c54aa373ad71eb520628aabc98207257b84b7a52a
+  data.tar.gz: 9c97a3e50fa228d20f890aaa16f94ae90efb722ccf9a6e96a62e36f819759745
 SHA512:
-  metadata.gz: 024f04dedc8bdef757f52d1ae35b69af155d786bb5e1075b6be8c11e2a2039b0bf9ee4e39a0bbca260e3edee85aafb70360240286a38ce12cb41fef88db70e02
-  data.tar.gz: 6952b1cf007e02b5e7fcb9ca3507cda68e22e27facee436fafc41daaa65b6b5eed3b5e0b9fbbde57d8c10361ab55dcc870a25b92a0f624597b02bb3a912c7586
+  metadata.gz: e7abebde7c6d4630a4f6f1916d746b5833c3f02c894d5401d9ccba720aae1c699639563066ed493107b6554ae78433c046a1e5ecce496b021cb6613242ef81c4
+  data.tar.gz: 40d2f238053553382f1ff63eac7d579402e5e296fa18f4f38fe20cae94854ac1d58e658b5d41fe05f039375d6dd4f25cf3bfbedb1c548c699d90dc4a880d7d15

data/isodoc-i18n.gemspec CHANGED Viewed

@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency "liquid", "~> 5"
   spec.add_dependency "metanorma-utils", ">= 1.7.0"
   spec.add_dependency "twitter_cldr"
+  spec.add_dependency "base64"
   spec.add_development_dependency "debug"
   spec.add_development_dependency "equivalent-xml", "~> 0.6"

data/lib/isodoc/i18n/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module IsoDoc
   class I18n
-    VERSION = "1.2.2".freeze
+    VERSION = "1.2.4".freeze
   end
 end

data/lib/isodoc/i18n.rb CHANGED Viewed

@@ -6,6 +6,7 @@ require_relative "l10n"
 require_relative "liquid/liquid"
 require "liquid"
 require_relative "i18n/version"
+require "base64"
 module IsoDoc
   class I18n

data/lib/isodoc/l10n.rb CHANGED Viewed

@@ -32,71 +32,118 @@ module IsoDoc
     # CJK
     def l10n_zh(text, script = "Hans")
       xml = Nokogiri::XML::DocumentFragment.parse(text)
-      xml.traverse do |n|
-        n.text? or next
-        n.replace(l10_zh1(cleanup_entities(n.text, is_xml: false), script))
+      t = xml.xpath(".//text()")
+      t.each_with_index do |n, i|
+        prev, foll = l10n_context(t, i)
+        text = cleanup_entities(n.text, is_xml: false)
+        n.replace(l10_zh1(text, prev, foll, script))
       end
       xml.to_xml(encoding: "UTF-8").gsub(/<b>/, "").gsub("</b>", "")
         .gsub(/<\?[^>]+>/, "")
     end
+    # Returns the text preceding and following the current text node.
+    # Context is not limited to the immediately adjoining text nodes, which may
+    # be empty or only spaces: all preceding/following text is concatenated.
+    def l10n_context(nodes, idx)
+      prev = nodes[0...idx].map(&:text).join
+      foll = nodes[(idx + 1)...(nodes.size)].map(&:text).join
+      [prev, foll]
+    end
     def l10n_fr(text, locale)
       xml = Nokogiri::XML::DocumentFragment.parse(text)
-      xml.traverse do |n|
-        next unless n.text?
-        n.replace(l10n_fr1(cleanup_entities(n.text, is_xml: false), locale))
+      t = xml.xpath(".//text()")
+      t.each_with_index do |n, i|
+        prev, foll = l10n_context(t, i)
+        text = cleanup_entities(n.text, is_xml: false)
+        n.replace(l10n_fr1(text, prev, foll, locale))
       end
       xml.to_xml(encoding: "UTF-8")
     end
-    ZH_CHAR = "\\p{Han}|\\p{In CJK Symbols And Punctuation}|" \
-              "\\p{In Halfwidth And Fullwidth Forms}".freeze
+    ZH_CHAR = "(\\p{Han}|\\p{In CJK Symbols And Punctuation}|" \
+              "\\p{In Halfwidth And Fullwidth Forms})".freeze
     # note: we can't differentiate comma from enumeration comma 、
-    def l10_zh1(text, _script)
-      l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
+    # def l10_zh1(text, _script)
+    def l10_zh1(text, prev, foll, _script)
+      # l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
+      r = l10n_zh_punct(text, prev, foll)
+      r = l10n_zh_remove_space(r, prev, foll)
+      l10n_zh_dash(r, prev, foll)
     end
+    ZH1_PUNCT = /(#{ZH_CHAR}|^)   # CJK character, or start of string
+         (\s*)$                   # Latin spaces optional
+    /xo.freeze
+    ZH2_PUNCT = /^\s*             # followed by ignorable Latin spaces
+                [:,.()\[\];?!-]*  # Latin punct which will also convert to CJK
+                (#{ZH_CHAR}|$)    # CJK character, or end of string
+      /xo.freeze
     # Convert Latin punct to CJK punct when its context is (^|CJK).($|CJK)
-    def l10n_zh_punct(text)
+    def l10n_zh_punct(text, prev, foll)
       [":：", ",，", ".．", ")）", "]］", ";；", "?？", "!！", "(（", "[［"].each do |m|
-        text = text.gsub(/(?<=#{ZH_CHAR}|^) # CJK character, or start of string
-    (\s*)                  # Latin spaces optional
-    #{Regexp.quote(m[0])}  # Latin punctuation we want to convert to CJK
-    (?=   \s*              # followed (lookahead) by ignorable Latin spaces
-      [:,.()\[\];?!-]*     # Latin punctuation which we will also convert to CJK
-      (#{ZH_CHAR}|$)       # CJK character, or end of string
-    ) /x, "\\1#{m[1]}")
+        text = l10n_gsub(text, prev, foll, [m[0], m[1]],
+                         [ZH1_PUNCT, ZH2_PUNCT])
       end
       text
     end
-    def l10n_zh_dash(text)
-      text.gsub(/(?<=#{ZH_CHAR}|^) # CJK character, or start of string
-                (\d*)              # optional digits
-                –                  # en-dash
-                (\d*)              # optional digits
-                (#{ZH_CHAR}|$)     # CJK character, or end of string
-                /xo, "\\1～\\2\\3")
+    ZH1_DASH = /(#{ZH_CHAR}|^)    # CJK character, or start of string
+                (\d*)             # optional digits
+    $/xo.freeze
+    ZH2_DASH = /^\d*              # followed by optional digits
+                (#{ZH_CHAR}|$)    # CJK character, or end of string
+      /xo.freeze
+    def l10n_zh_dash(text, prev, foll)
+      l10n_gsub(text, prev, foll, %w(– ～), [ZH1_DASH, ZH2_DASH])
+    end
+    def l10n_gsub(text, prev, foll, delim, regex)
+      context = l10n_gsub_context(text, prev, foll, delim) or return text
+      (1...(context.size - 1)).each do |i|
+        l10_context_valid?(context, i, delim, regex) and
+          context[i] = delim[1].gsub("\\0", context[i]) # Full-width equivalent
+      end
+      context[1...(context.size - 1)].join
+    end
+    def l10n_gsub_context(text, prev, foll, delim)
+      d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
+      context = text.split(/(#{d})/) # delim to replace
+      context.size == 1 and return
+      [prev, context, foll].flatten
+    end
+    def l10_context_valid?(context, idx, delim, regex)
+      found_delim = if delim[0].is_a?(Regexp) # punct to convert
+                      delim[0].match?(context[idx])
+                    else
+                      context[idx] == delim[0]
+                    end
+      found_delim &&
+        regex[0].match?(context[0...idx].join) && # preceding context
+        regex[1].match?(context[(idx + 1)..-1].join) # foll context
     end
-    def l10n_zh_remove_space(text)
-      text.gsub(/(?<=#{ZH_CHAR}) (?=#{ZH_CHAR})/o, "")
-        .gsub(/(?<=\d) (?=#{ZH_CHAR})/o, "")
-        .gsub(/(?<=#{ZH_CHAR}) (?=\d)/o, "")
-        .gsub(/(?<=#{ZH_CHAR}) (?=[A-Za-z](#{ZH_CHAR}|$))/o, "")
+    def l10n_zh_remove_space(text, prev, foll)
+      text = l10n_gsub(text, prev, foll, [" ", ""],
+                       [/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o])
+      l10n_gsub(text, prev, foll, [" ", ""],
+                [/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o])
     end
-    def l10n_fr1(text, locale)
-      text = text.gsub(/(?<=\p{Alnum})([»›;?!])(?=\s)/, "\u202f\\1")
-      text = text.gsub(/(?<=\p{Alnum})([»›;?!])$/, "\u202f\\1")
-      text = text.gsub(/^([»›;?!])/, "\u202f\\1")
-      text = text.gsub(/([«‹])/, "\\1\u202f")
+    def l10n_fr1(text, prev, foll, locale)
+      text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
+                       [/\p{Alnum}$/, /^(\s|$)/])
+      text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"], [/$/, /^./])
       colonsp = locale == "CH" ? "\u202f" : "\u00a0"
-      text = text.gsub(/(?<=\p{Alnum})(:)(?=\s)/, "#{colonsp}\\1")
-      text = text.gsub(/(?<=\p{Alnum})(:)$/, "#{colonsp}\\1")
-      text.gsub(/^(:\s)/, "#{colonsp}\\1")
+      l10n_gsub(text, prev, foll, [":", "#{colonsp}\\0"],
+                [/\p{Alnum}$/, /^(\s|$)/])
     end
     def self.cjk_extend(text)

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: isodoc-i18n
 version: !ruby/object:Gem::Version
-  version: 1.2.2
+  version: 1.2.4
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-10-25 00:00:00.000000000 Z
+date: 2024-11-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: htmlentities
@@ -66,6 +66,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: base64
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: debug
   requirement: !ruby/object:Gem::Requirement