RubyGems - isodoc-i18n - Versions diffs - 1.2.2 → 1.2.3 - Mend

isodoc-i18n 1.2.2 → 1.2.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 490a22f13264a470afa34644c450651239bb43409aa8e579bae72229d165fe38
-  data.tar.gz: 9be5c331d23f732e37e2588ff3269e867231f56b6b074417258f5e01fa55aeba
+  metadata.gz: d1cb9e9bc5f9e053a31ca971936606b5248577c9b001d3c858d8ecaed201ced8
+  data.tar.gz: 0ea1ee1c8b6913c3a708d63eee6f6f0af5f6ba171a65b04a7792063bd117a339
 SHA512:
-  metadata.gz: 024f04dedc8bdef757f52d1ae35b69af155d786bb5e1075b6be8c11e2a2039b0bf9ee4e39a0bbca260e3edee85aafb70360240286a38ce12cb41fef88db70e02
-  data.tar.gz: 6952b1cf007e02b5e7fcb9ca3507cda68e22e27facee436fafc41daaa65b6b5eed3b5e0b9fbbde57d8c10361ab55dcc870a25b92a0f624597b02bb3a912c7586
+  metadata.gz: a40fba88473c09f93b3eaf3da323c22c6974e9a50a2a287d4235ba5a757852a38a553765e9fdf6289d0dd97721e4c03660e390d06ece34a6f082b154377c041b
+  data.tar.gz: c8e0127577dda72be74e84f9e211169b682817f12ba38507d98c81ac5da649b5ae92d3a4d29f1d63b38ec244c4dbc44a23cf95eb0043773d4e6ae3b59309bd1c

data/isodoc-i18n.gemspec CHANGED Viewed

@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency "liquid", "~> 5"
   spec.add_dependency "metanorma-utils", ">= 1.7.0"
   spec.add_dependency "twitter_cldr"
+  spec.add_dependency "base64"
   spec.add_development_dependency "debug"
   spec.add_development_dependency "equivalent-xml", "~> 0.6"

data/lib/isodoc/i18n/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module IsoDoc
   class I18n
-    VERSION = "1.2.2".freeze
+    VERSION = "1.2.3".freeze
   end
 end

data/lib/isodoc/i18n.rb CHANGED Viewed

@@ -6,6 +6,7 @@ require_relative "l10n"
 require_relative "liquid/liquid"
 require "liquid"
 require_relative "i18n/version"
+require "base64"
 module IsoDoc
   class I18n

data/lib/isodoc/l10n.rb CHANGED Viewed

@@ -32,20 +32,32 @@ module IsoDoc
     # CJK
     def l10n_zh(text, script = "Hans")
       xml = Nokogiri::XML::DocumentFragment.parse(text)
-      xml.traverse do |n|
-        n.text? or next
-        n.replace(l10_zh1(cleanup_entities(n.text, is_xml: false), script))
+      t = xml.xpath(".//text()")
+      t.each_with_index do |n, i|
+        prev, foll = l10n_context(t, i)
+        text = cleanup_entities(n.text, is_xml: false)
+        n.replace(l10_zh1(text, prev, foll, script))
       end
       xml.to_xml(encoding: "UTF-8").gsub(/<b>/, "").gsub("</b>", "")
         .gsub(/<\?[^>]+>/, "")
     end
+    # previous, following context of current text node:
+    # do not use just the immediately adjoining text tokens for context
+    # deal with spaces and empty text by just concatenating entire context
+    def l10n_context(nodes, idx)
+      prev = nodes[0...idx].map(&:text).join
+      foll = nodes[(idx + 1)...(nodes.size)].map(&:text).join
+      [prev, foll]
+    end
     def l10n_fr(text, locale)
       xml = Nokogiri::XML::DocumentFragment.parse(text)
-      xml.traverse do |n|
-        next unless n.text?
-        n.replace(l10n_fr1(cleanup_entities(n.text, is_xml: false), locale))
+      t = xml.xpath(".//text()")
+      t.each_with_index do |n, i|
+        prev, foll = l10n_context(t, i)
+        text = cleanup_entities(n.text, is_xml: false)
+        n.replace(l10n_fr1(text, prev, foll, locale))
       end
       xml.to_xml(encoding: "UTF-8")
     end
@@ -54,49 +66,84 @@ module IsoDoc
               "\\p{In Halfwidth And Fullwidth Forms}".freeze
     # note: we can't differentiate comma from enumeration comma 、
-    def l10_zh1(text, _script)
-      l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
+    # def l10_zh1(text, _script)
+    def l10_zh1(text, prev, foll, _script)
+      # l10n_zh_dash(l10n_zh_remove_space(l10n_zh_punct(text)))
+      r = l10n_zh_punct(text, prev, foll)
+      r = l10n_zh_remove_space(r, prev, foll)
+      l10n_zh_dash(r, prev, foll)
     end
+    ZH1_PUNCT = /(#{ZH_CHAR}|^)   # CJK character, or start of string
+         (\s*)$                   # Latin spaces optional
+    /xo.freeze
+    ZH2_PUNCT = /^\s*             # followed by ignorable Latin spaces
+                [:,.()\[\];?!-]*  # Latin punct which will also convert to CJK
+                (#{ZH_CHAR}|$)    # CJK character, or end of string
+      /xo.freeze
     # CJK punct if (^|CJK).($|CJK)
-    def l10n_zh_punct(text)
+    def l10n_zh_punct(text, prev, foll)
       [":：", ",，", ".．", ")）", "]］", ";；", "?？", "!！", "(（", "[［"].each do |m|
-        text = text.gsub(/(?<=#{ZH_CHAR}|^) # CJK character, or start of string
-    (\s*)                  # Latin spaces optional
-    #{Regexp.quote(m[0])}  # Latin punctuation we want to convert to CJK
-    (?=   \s*              # followed (lookahead) by ignorable Latin spaces
-      [:,.()\[\];?!-]*     # Latin punctuation which we will also convert to CJK
-      (#{ZH_CHAR}|$)       # CJK character, or end of string
-    ) /x, "\\1#{m[1]}")
+        text = l10n_gsub(text, prev, foll, [m[0], m[1]],
+                         [ZH1_PUNCT, ZH2_PUNCT])
       end
       text
     end
-    def l10n_zh_dash(text)
-      text.gsub(/(?<=#{ZH_CHAR}|^) # CJK character, or start of string
-                (\d*)              # optional digits
-                –                  # en-dash
-                (\d*)              # optional digits
-                (#{ZH_CHAR}|$)     # CJK character, or end of string
-                /xo, "\\1～\\2\\3")
+    ZH1_DASH = /(#{ZH_CHAR}|^)    # CJK character, or start of string
+                (\d*)             # optional digits
+    $/xo.freeze
+    ZH2_DASH = /^\d*              # followed by optional digits
+                (#{ZH_CHAR}|$)    # CJK character, or end of string
+      /xo.freeze
+    def l10n_zh_dash(text, prev, foll)
+      l10n_gsub(text, prev, foll, %w(– ～), [ZH1_DASH, ZH2_DASH])
+    end
+    def l10n_gsub(text, prev, foll, delim, regex)
+      context = l10n_gsub_context(text, prev, foll, delim) or return text
+      (1...(context.size - 1)).each do |i|
+        l10_context_valid?(context, i, delim, regex) and
+          context[i] = delim[1].gsub("\\0", context[i]) # Full-width equivalent
+      end
+      context[1...(context.size - 1)].join
+    end
+    def l10n_gsub_context(text, prev, foll, delim)
+      d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
+      context = text.split(/(#{d})/) # delim to replace
+      context.size == 1 and return
+      [prev, context, foll].flatten
+    end
+    def l10_context_valid?(context, idx, delim, regex)
+      found_delim = if delim[0].is_a?(Regexp) # punct to convert
+                      delim[0].match?(context[idx])
+                    else
+                      context[idx] == delim[0]
+                    end
+      found_delim &&
+        regex[0].match?(context[0...idx].join) && # preceding context
+        regex[1].match?(context[(idx + 1)..-1].join) # foll context
     end
-    def l10n_zh_remove_space(text)
-      text.gsub(/(?<=#{ZH_CHAR}) (?=#{ZH_CHAR})/o, "")
-        .gsub(/(?<=\d) (?=#{ZH_CHAR})/o, "")
-        .gsub(/(?<=#{ZH_CHAR}) (?=\d)/o, "")
-        .gsub(/(?<=#{ZH_CHAR}) (?=[A-Za-z](#{ZH_CHAR}|$))/o, "")
+    def l10n_zh_remove_space(text, prev, foll)
+      text = l10n_gsub(text, prev, foll, [" ", ""],
+                       [/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o])
+      l10n_gsub(text, prev, foll, [" ", ""],
+                [/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o])
     end
-    def l10n_fr1(text, locale)
-      text = text.gsub(/(?<=\p{Alnum})([»›;?!])(?=\s)/, "\u202f\\1")
-      text = text.gsub(/(?<=\p{Alnum})([»›;?!])$/, "\u202f\\1")
-      text = text.gsub(/^([»›;?!])/, "\u202f\\1")
-      text = text.gsub(/([«‹])/, "\\1\u202f")
+    def l10n_fr1(text, prev, foll, locale)
+      text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
+                       [/\p{Alnum}$/, /^(\s|$)/])
+      text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"], [/$/, /^./])
       colonsp = locale == "CH" ? "\u202f" : "\u00a0"
-      text = text.gsub(/(?<=\p{Alnum})(:)(?=\s)/, "#{colonsp}\\1")
-      text = text.gsub(/(?<=\p{Alnum})(:)$/, "#{colonsp}\\1")
-      text.gsub(/^(:\s)/, "#{colonsp}\\1")
+      l10n_gsub(text, prev, foll, [":", "#{colonsp}\\0"],
+                [/\p{Alnum}$/, /^(\s|$)/])
     end
     def self.cjk_extend(text)

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: isodoc-i18n
 version: !ruby/object:Gem::Version
-  version: 1.2.2
+  version: 1.2.3
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-10-25 00:00:00.000000000 Z
+date: 2024-11-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: htmlentities
@@ -66,6 +66,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: base64
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: debug
   requirement: !ruby/object:Gem::Requirement