RubyGems - sanscript - Versions diffs - 0.1.0 - Mend

sanscript 0.1.0

Files changed (22) hide show

checksums.yaml +7 -0
data/.codeclimate.yml +9 -0
data/.gitignore +9 -0
data/.rspec +2 -0
data/.rubocop.yml +97 -0
data/.travis.yml +9 -0
data/CODE_OF_CONDUCT.md +49 -0
data/Gemfile +5 -0
data/LICENSE.txt +22 -0
data/README.md +43 -0
data/Rakefile +7 -0
data/bin/console +12 -0
data/bin/setup +8 -0
data/lib/sanscript.rb +29 -0
data/lib/sanscript/benchmark.rb +53 -0
data/lib/sanscript/detect.rb +77 -0
data/lib/sanscript/refinements.rb +94 -0
data/lib/sanscript/transliterate.rb +343 -0
data/lib/sanscript/transliterate/schemes.rb +312 -0
data/lib/sanscript/version.rb +4 -0
data/sanscript.gemspec +29 -0
metadata +148 -0

@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+require "ice_nine"
+module Sanscript
+  module Refinements
+    refine Object do
+      def deep_dup
+        dup
+      rescue TypeError
+        self
+      end
+      def deep_freeze
+        IceNine.deep_freeze(self)
+      end
+    end
+    refine NilClass do
+      def deep_dup
+        self
+      end
+    end
+    refine FalseClass do
+      def deep_dup
+        self
+      end
+    end
+    refine TrueClass do
+      def deep_dup
+        self
+      end
+    end
+    refine Symbol do
+      def deep_dup
+        self
+      end
+    end
+    refine Numeric do
+      def deep_dup
+        self
+      end
+    end
+    # Necessary to re-override Numeric
+    require "bigdecimal"
+    refine BigDecimal do
+      def deep_dup
+        dup
+      end
+    end
+    refine String do
+      def w_split
+        split(/\s/)
+      end
+    end
+    refine Array do
+      def deep_dup
+        map { |value| value.deep_dup } # rubocop:disable Style/SymbolProc
+      end
+    end
+    refine Hash do
+      def deep_dup
+        hash = dup
+        each_pair do |key, value|
+          if ::String === key # rubocop:disable Style/CaseEquality
+            hash[key] = value.deep_dup
+          else
+            hash.delete(key)
+            hash[key.deep_dup] = value.deep_dup
+          end
+        end
+        hash
+      end
+    end
+    refine Set do
+      def deep_dup
+        set_a = to_a
+        set_a.map! do |val|
+          next val if ::String === val # rubocop:disable Style/CaseEquality
+          val.deep_dup
+        end
+        self.class[set_a]
+      end
+    end
+  end
+end

data/lib/sanscript/transliterate.rb ADDED

@@ -0,0 +1,343 @@
+# frozen_string_literal: true
+require "sanscript/refinements"
+require "sanscript/transliterate/schemes"
+#
+# Sanscript
+#
+# Sanscript is a Sanskrit transliteration library. Currently, it supports
+# other Indian languages only incidentally.
+#
+# Released under the MIT and GPL Licenses.
+#
+module Sanscript
+  using Refinements
+  module Transliterate
+    class << self
+      attr_reader :defaults, :schemes, :roman_schemes, :all_alternates
+    end
+    @defaults = {
+      skip_sgml: false,
+      syncope: false,
+    }
+    @cache = {}
+    module_function
+    #
+    #  Return a list of available schemes.
+    #
+    #  @return      array of scheme identifiers
+    #
+    def scheme_names
+      @schemes.keys.sort!
+    end
+    #
+    #  Check whether the given scheme encodes romanized Sanskrit.
+    #
+    #  @param name  the scheme name
+    #  @return      boolean
+    #
+    def roman_scheme?(name)
+      @roman_schemes.include?(name.to_sym)
+    end
+    #
+    # Add a Brahmic scheme to Sanscript.
+    #
+    # Schemes are of two types: "Brahmic" and "roman". Brahmic consonants
+    # have an inherent vowel sound, but roman consonants do not. This is the
+    # main difference between these two types of scheme.
+    #
+    # A scheme definition is a Hash ("{}") that maps a group name to a
+    # list of characters. For illustration, see the "devanagari" scheme in
+    # sanscript/transliterate/schemes.rb.
+    #
+    # You can use whatever group names you like, but for the best results,
+    # you should use the same group names that Sanscript does.
+    #
+    # @param name    the scheme name
+    # @param scheme  the scheme data itself. This should be constructed as
+    #                described above.
+    #
+    def add_brahmic_scheme(name, scheme)
+      @schemes[name.to_sym] = scheme.deep_dup.deep_freeze
+    end
+    #
+    # Add a roman scheme to Sanscript.
+    #
+    # See the comments on Sanscript.add_brahmic_scheme. The "vowel_marks" field
+    # can be omitted.
+    #
+    # @param name    the scheme name
+    # @param scheme  the scheme data itself
+    #
+    def add_roman_scheme(name, scheme)
+      name = name.to_sym
+      scheme = scheme.deep_dup
+      scheme[:vowel_marks] = scheme[:vowels][1..-1] unless scheme.key?(:vowel_marks)
+      @schemes[name] = scheme.deep_freeze
+      @roman_schemes.add(name)
+    end
+    #
+    # Create a deep copy of an object, for certain kinds of objects.
+    #
+    # @param scheme  the scheme to copy
+    # @return        the copy
+    #
+    # Set up various schemes
+    begin
+      # Set up roman schemes
+      kolkata = @schemes[:kolkata] = @schemes[:iast].deep_dup
+      scheme_names = %i[iast itrans hk kolkata slp1 velthuis wx]
+      kolkata[:vowels] = %w[a ā i ī u ū ṛ ṝ ḷ ḹ e ē ai o ō au]
+      # These schemes already belong to Sanscript.schemes. But by adding
+      # them again with `addRomanScheme`, we automatically build up
+      # `roman_schemes` and define a `vowel_marks` field for each one.
+      scheme_names.each do |name|
+        add_roman_scheme(name, @schemes[name])
+      end
+      # ITRANS variant, which supports Dravidian short 'e' and 'o'.
+      itrans_dravidian = @schemes[:itrans].deep_dup
+      itrans_dravidian[:vowels] = %w[a A i I u U Ri RRI LLi LLi e E ai o O au]
+      itrans_dravidian[:vowel_marks] = itrans_dravidian[:vowels][1..-1]
+      @all_alternates[:itrans_dravidian] = @all_alternates[:itrans]
+      add_roman_scheme(:itrans_dravidian, itrans_dravidian)
+      # ensure deep freeze on all existing schemes and alternates
+      @schemes.each { |_, scheme| scheme.deep_freeze }
+      @all_alternates.each { |_, scheme| scheme.deep_freeze }
+    end
+    #
+    # Transliterate from one script to another.
+    #
+    # @param data     the string to transliterate
+    # @param from     the source script
+    # @param to       the destination script
+    # @param options  transliteration options
+    # @return         the finished string
+    #
+    def transliterate(data, from, to, options = {})
+      from = from.to_sym
+      to = to.to_sym
+      raise "Scheme not known ':#{from}'" unless @schemes.key?(from)
+      raise "Scheme not known ':#{to}'" unless @schemes.key?(to)
+      data = data.to_str.dup
+      options = @defaults.merge(options)
+      map = make_map(from, to)
+      data.gsub!(/(<.*?>)/, "##\\1##") if options[:skip_sgml]
+      # Easy way out for "{\m+}", "\", and ".h".
+      if from == :itrans
+        data.gsub!(/\{\\m\+\}/, ".h.N")
+        data.gsub!(/\.h/, "")
+        data.gsub!(/\\([^'`_]|$)/, "##\\1##")
+      end
+      if map[:from_roman?]
+        transliterate_roman(data, map, options)
+      else
+        transliterate_brahmic(data, map)
+      end
+    end
+    class << self
+      private
+      #
+      # Create a map from every character in `from` to its partner in `to`.
+      # Also, store any "marks" that `from` might have.
+      #
+      # @param from     input scheme
+      # @param to       output scheme
+      #
+      def make_map(from, to)
+        @cache[:"#{from}_#{to}"] ||= begin
+          alternates = @all_alternates[from] || {}
+          consonants = {}
+          from_scheme = @schemes[from]
+          letters = {}
+          token_lengths = []
+          marks = {}
+          to_scheme = @schemes[to]
+          from_scheme.each do |group, from_group|
+            to_group = to_scheme[group]
+            next if to_group.nil?
+            from_group.each_with_index do |f, i|
+              t = to_group[i]
+              alts = alternates[f] || []
+              token_lengths.push(f.length)
+              token_lengths.concat(alts.map(&:length))
+              if group == :vowel_marks || group == :virama
+                marks[f] = t
+                alts.each { |alt| marks[alt] = t }
+              else
+                letters[f] = t
+                alts.each { |alt| letters[alt] = t }
+                if group == :consonants || group == :other
+                  consonants[f] = t
+                  alts.each { |alt| consonants[alt] = t }
+                end
+              end
+            end
+          end
+          {
+            consonants: consonants,
+            from_roman?: roman_scheme?(from),
+            letters: letters,
+            marks: marks,
+            max_token_length: token_lengths.max,
+            to_roman?: roman_scheme?(to),
+            virama: to_scheme[:virama].first,
+          }.deep_freeze
+        end
+      end
+      #
+      # Transliterate from a romanized script.
+      #
+      # @param data     the string to transliterate
+      # @param map      map data generated from make_map
+      # @param options  transliteration options
+      # @return         the finished string
+      #
+      def transliterate_roman(data, map, options = {})
+        # Fill in defaults and work on a copy; `data` is consumed below.
+        options = @defaults.merge(options)
+        data = data.to_str.dup
+        # Output pieces, joined at the end.
+        buf = []
+        # Look-ahead window of at most map[:max_token_length] characters.
+        token_buffer = String.new
+        # True when the last emitted token was a consonant.
+        had_consonant = false
+        # Flipped by every "##" marker in the input.
+        transliteration_enabled = true
+        until data.empty? && token_buffer.empty?
+          # Top the window up from the remaining input.
+          token_buffer << data.slice!(0, map[:max_token_length] - token_buffer.length)
+          # Match all token substrings to our map.
+          # Longest prefix first (j = 0), one character shorter each round.
+          (0...map[:max_token_length]).each do |j|
+            token = token_buffer[0, map[:max_token_length] - j]
+            if token == "##"
+              # Consume the marker and toggle transliteration.
+              transliteration_enabled = !transliteration_enabled
+              token_buffer.slice!(0, 2)
+              break
+            end
+            temp_letter = map[:letters][token]
+            if !temp_letter.nil? && transliteration_enabled
+              if map[:to_roman?]
+                buf << temp_letter
+              else
+                # Handle the implicit vowel. Ignore 'a' and force
+                # vowels to appear as marks if we've just seen a
+                # consonant.
+                if had_consonant
+                  temp_mark = map[:marks][token]
+                  if !temp_mark.nil?
+                    buf << temp_mark
+                  elsif token != "a"
+                    buf << map[:virama] << temp_letter
+                  end
+                else
+                  buf << temp_letter
+                end
+                had_consonant = map[:consonants].key?(token)
+              end
+              # Drop the matched token from the window.
+              token_buffer.slice!(0, map[:max_token_length] - j)
+              break
+            elsif j == map[:max_token_length] - 1
+              # Nothing matched: pass one character through unchanged,
+              # closing a pending consonant first.
+              if had_consonant
+                had_consonant = false
+                buf << map[:virama] unless options[:syncope]
+              end
+              buf << token
+              token_buffer.slice!(0, 1)
+              # 'break' is redundant here, "j == ..." is true only on
+              # the last iteration.
+            end
+          end
+        end
+        # A consonant at the very end still needs its virama.
+        buf << map[:virama] if had_consonant && !options[:syncope]
+        buf.join("")
+      end
+      #
+      # Transliterate from a Brahmic script.
+      #
+      # @param data     the string to transliterate
+      # @param map      map data generated from make_map
+      # @return         the finished string
+      #
+      def transliterate_brahmic(data, map)
+        data = data.to_str.dup
+        buf = []
+        dangling_hash = false
+        had_roman_consonant = false
+        transliteration_enabled = true
+        until data.empty?
+          l = data.slice!(0, 1)
+          # Toggle transliteration state
+          if l == "#"
+            if dangling_hash
+              transliteration_enabled = !transliteration_enabled
+              dangling_hash = false
+            else
+              dangling_hash = true
+            end
+            if had_roman_consonant
+              buf << "a"
+              had_roman_consonant = false
+            end
+            next
+          elsif !transliteration_enabled
+            buf << l
+            next
+          end
+          temp = map[:marks][l]
+          if !temp.nil?
+            buf << temp
+            had_roman_consonant = false
+          else
+            if dangling_hash
+              buf << "#"
+              dangling_hash = false
+            end
+            if had_roman_consonant
+              buf << "a"
+              had_roman_consonant = false
+            end
+            # Push transliterated letter if possible. Otherwise, push
+            # the letter itself.
+            temp = map[:letters][l]
+            if !temp.nil?
+              buf << temp
+              had_roman_consonant = map[:to_roman?] && map[:consonants].key?(l)
+            else
+              buf << l
+            end
+          end
+        end
+        buf << "a" if had_roman_consonant
+        buf.join("")
+      end
+    end
+  end
+end

data/lib/sanscript/transliterate/schemes.rb ADDED

@@ -0,0 +1,312 @@
+# frozen_string_literal: true
+require "sanscript/refinements"
+module Sanscript
+  using Refinements
+  module Transliterate
+    #  Schemes
+    #  =======
+    #  Schemes are of two kinds: "Brahmic" and "roman." "Brahmic" schemes
+    #  describe abugida scripts found in India. "Roman" schemes describe
+    #  manufactured alphabets that are meant to describe or encode Brahmi
+    #  scripts. Abugidas and alphabets are processed by separate algorithms
+    #  because of the unique difficulties involved with each.
+    #
+    #  Brahmic consonants are stated without a virama. Roman consonants are
+    #  stated without the vowel 'a'.
+    #
+    #  (Since "abugida" is not a well-known term, Sanscript uses "Brahmic"
+    #  and "roman" for clarity.)
+    #
+    @schemes = {
+      # Bengali
+      # -------
+      # 'va' and 'ba' are both rendered as ব.
+      #
+      bengali: {
+        vowels: "অ আ ই ঈ উ ঊ ঋ ৠ ঌ ৡ এ ঐ ও ঔ".w_split,
+        vowel_marks: "া ি ী ু ূ ৃ ৄ ৢ ৣ  ে ৈ  ো ৌ".w_split,
+        other_marks: "ং ঃ ঁ".w_split,
+        virama: ["্"],
+        consonants: "ক খ গ ঘ ঙ চ ছ জ ঝ ঞ ট ঠ ড ঢ ণ ত থ দ ধ ন প ফ ব ভ ম য র ল ব শ ষ স হ ळ ক্ষ জ্ঞ".w_split,
+        symbols: "০ ১ ২ ৩ ৪ ৫ ৬ ৭ ৮ ৯ ॐ ঽ । ॥".w_split,
+        other: "    ড ঢ  য ".w_split,
+      },
+      # Devanagari
+      # ----------
+      # The most comprehensive and unambiguous Brahmic script listed.
+      #
+      devanagari: {
+        # "Independent" forms of the vowels. These are used whenever the
+        # vowel does not immediately follow a consonant.
+        vowels: "अ आ इ ई उ ऊ ऋ ॠ ऌ ॡ ऎ ए ऐ ऒ ओ औ".w_split,
+        # "Dependent" forms of the vowels. These are used whenever the
+        # vowel immediately follows a consonant. If a letter is not
+        # listed in `vowels`, it should not be listed here.
+        vowel_marks: "ा ि ी ु ू ृ ॄ ॢ ॣ ॆ े ै ॊ ो ौ".w_split,
+        # Miscellaneous marks, all of which are used in Sanskrit.
+        other_marks: "ं ः ँ".w_split,
+        # In syllabic scripts like Devanagari, consonants have an inherent
+        # vowel that must be suppressed explicitly. We do so by putting a
+        # virama after the consonant.
+        virama: ["्"],
+        # Various Sanskrit consonants and consonant clusters. Every token
+        # here has an explicit vowel. Thus "क" is "ka" instead of "k".
+        consonants: "क ख ग घ ङ च छ ज झ ञ ट ठ ड ढ ण त थ द ध न प फ ब भ म य र ल व श ष स ह ळ क्ष ज्ञ".w_split,
+        # Numbers and punctuation
+        symbols: "० १ २ ३ ४ ५ ६ ७ ८ ९ ॐ ऽ । ॥".w_split,
+        # Zero-width joiner. This is used to separate a consonant cluster
+        # and avoid a complex ligature.
+        zwj: ["\u200D"],
+        # Dummy consonant. This is used in ITRANS to prevent certain types
+        # of parser ambiguity. Thus "barau" -> बरौ but "bara_u" -> बरउ.
+        skip: [""],
+        # Vedic accent. Udatta and anudatta.
+        accent: %W[\u0951 \u0952],
+        # Accent combined with anusvara and visarga. For compatibility
+        # with ITRANS, which allows the reverse of these four.
+        combo_accent: "ः॑ ः॒ ं॑ ं॒".w_split,
+        candra: ["ॅ"],
+        # Non-Sanskrit consonants
+        other: "क़ ख़ ग़ ज़ ड़ ढ़ फ़ य़ ऱ".w_split,
+      },
+      # Gujarati
+      # --------
+      # Sanskrit-complete.
+      #
+      gujarati: {
+        vowels: "અ આ ઇ ઈ ઉ ઊ ઋ ૠ ઌ ૡ  એ ઐ  ઓ ઔ".w_split,
+        vowel_marks: "ા િ ી ુ ૂ ૃ ૄ ૢ ૣ  ે ૈ  ો ૌ".w_split,
+        other_marks: "ં ઃ ઁ".w_split,
+        virama: ["્"],
+        consonants: "ક ખ ગ ઘ ઙ ચ છ જ ઝ ઞ ટ ઠ ડ ઢ ણ ત થ દ ધ ન પ ફ બ ભ મ ય ર લ વ શ ષ સ હ ળ ક્ષ જ્ઞ".w_split,
+        symbols: "૦ ૧ ૨ ૩ ૪ ૫ ૬ ૭ ૮ ૯ ૐ ઽ ૤ ૥".w_split,
+        candra: ["ૅ"],
+      },
+      # Gurmukhi
+      # --------
+      # Missing R/RR/lR/lRR
+      #
+      gurmukhi: {
+        vowels: "ਅ ਆ ਇ ਈ ਉ ਊ      ਏ ਐ  ਓ ਔ".w_split,
+        vowel_marks: "ਾ ਿ ੀ ੁ ੂ      ੇ ੈ  ੋ ੌ".w_split,
+        other_marks: "ਂ ਃ ਁ".w_split,
+        virama: ["੍"],
+        consonants: "ਕ ਖ ਗ ਘ ਙ ਚ ਛ ਜ ਝ ਞ ਟ ਠ ਡ ਢ ਣ ਤ ਥ ਦ ਧ ਨ ਪ ਫ ਬ ਭ ਮ ਯ ਰ ਲ ਵ ਸ਼ ਸ਼ ਸ ਹ ਲ਼ ਕ੍ਸ਼ ਜ੍ਞ".w_split,
+        symbols: "੦ ੧ ੨ ੩ ੪ ੫ ੬ ੭ ੮ ੯ ॐ ऽ । ॥".w_split,
+        other: " ਖ ਗ ਜ ਡ  ਫ  ".w_split,
+      },
+      # Kannada
+      # -------
+      # Sanskrit-complete.
+      #
+      kannada: {
+        vowels: "ಅ ಆ ಇ ಈ ಉ ಊ ಋ ೠ ಌ ೡ ಎ ಏ ಐ ಒ ಓ ಔ".w_split,
+        vowel_marks: "ಾ ಿ ೀ ು ೂ ೃ ೄ ೢ ೣ ೆ ೇ ೈ ೊ ೋ ೌ".w_split,
+        other_marks: "ಂ ಃ ँ".w_split,
+        virama: ["್"],
+        consonants: "ಕ ಖ ಗ ಘ ಙ ಚ ಛ ಜ ಝ ಞ ಟ ಠ ಡ ಢ ಣ ತ ಥ ದ ಧ ನ ಪ ಫ ಬ ಭ ಮ ಯ ರ ಲ ವ ಶ ಷ ಸ ಹ ಳ ಕ್ಷ ಜ್ಞ".w_split,
+        symbols: "೦ ೧ ೨ ೩ ೪ ೫ ೬ ೭ ೮ ೯ ಓಂ ಽ । ॥".w_split,
+        other: "      ಫ  ಱ".w_split,
+      },
+      # Malayalam
+      # ---------
+      # Sanskrit-complete.
+      #
+      malayalam: {
+        vowels: "അ ആ ഇ ഈ ഉ ഊ ഋ ൠ ഌ ൡ എ ഏ ഐ ഒ ഓ ഔ".w_split,
+        vowel_marks: "ാ ി ീ ു ൂ ൃ ൄ ൢ ൣ െ േ ൈ ൊ ോ ൌ".w_split,
+        other_marks: "ം ഃ ँ".w_split,
+        virama: ["്"],
+        consonants: "ക ഖ ഗ ഘ ങ ച ഛ ജ ഝ ഞ ട ഠ ഡ ഢ ണ ത ഥ ദ ധ ന പ ഫ ബ ഭ മ യ ര ല വ ശ ഷ സ ഹ ള ക്ഷ ജ്ഞ".w_split,
+        symbols: "൦ ൧ ൨ ൩ ൪ ൫ ൬ ൭ ൮ ൯ ഓം ഽ । ॥".w_split,
+        other: "        റ".w_split,
+      },
+      # Oriya
+      # -----
+      # Sanskrit-complete.
+      #
+      oriya: {
+        vowels: "ଅ ଆ ଇ ଈ ଉ ଊ ଋ ୠ ଌ ୡ  ଏ ଐ  ଓ ଔ".w_split,
+        vowel_marks: "ା ି ୀ ୁ ୂ ୃ ୄ ୢ ୣ  େ ୈ  ୋ ୌ".w_split,
+        other_marks: "ଂ ଃ ଁ".w_split,
+        virama: ["୍"],
+        consonants: "କ ଖ ଗ ଘ ଙ ଚ ଛ ଜ ଝ ଞ ଟ ଠ ଡ ଢ ଣ ତ ଥ ଦ ଧ ନ ପ ଫ ବ ଭ ମ ଯ ର ଲ ଵ ଶ ଷ ସ ହ ଳ କ୍ଷ ଜ୍ଞ".w_split,
+        symbols: "୦ ୧ ୨ ୩ ୪ ୫ ୬ ୭ ୮ ୯ ଓଂ ଽ । ॥".w_split,
+        other: "    ଡ ଢ  ଯ ".w_split,
+      },
+      # Tamil
+      # -----
+      # Missing R/RR/lR/lRR vowel marks and voice/aspiration distinctions.
+      # The most incomplete of the Sanskrit schemes here.
+      #
+      tamil: {
+        vowels: "அ ஆ இ ஈ உ ஊ     எ ஏ ஐ ஒ ஓ ஔ".w_split,
+        vowel_marks: "ா ி ீ ு ூ     ெ ே ை ொ ோ ௌ".w_split,
+        other_marks: "ஂ ஃ ".w_split,
+        virama: ["்"],
+        consonants: "க க க க ங ச ச ஜ ச ஞ ட ட ட ட ண த த த த ந ப ப ப ப ம ய ர ல வ ஶ ஷ ஸ ஹ ள க்ஷ ஜ்ஞ".w_split,
+        symbols: "௦ ௧ ௨ ௩ ௪ ௫ ௬ ௭ ௮ ௯ ௐ ऽ । ॥".w_split,
+        other: "        ற".w_split,
+      },
+      # Telugu
+      # ------
+      # Sanskrit-complete.
+      #
+      telugu: {
+        vowels: "అ ఆ ఇ ఈ ఉ ఊ ఋ ౠ ఌ ౡ ఎ ఏ ఐ ఒ ఓ ఔ".w_split,
+        vowel_marks: "ా ి ీ ు ూ ృ ౄ ౢ ౣ ె ే ై ొ ో ౌ".w_split,
+        other_marks: "ం ః ఁ".w_split,
+        virama: ["్"],
+        consonants: "క ఖ గ ఘ ఙ చ ఛ జ ఝ ఞ ట ఠ డ ఢ ణ త థ ద ధ న ప ఫ బ భ మ య ర ల వ శ ష స హ ళ క్ష జ్ఞ".w_split,
+        symbols: "౦ ౧ ౨ ౩ ౪ ౫ ౬ ౭ ౮ ౯ ఓం ఽ । ॥".w_split,
+        other: "        ఱ".w_split,
+      },
+      # International Alphabet of Sanskrit Transliteration
+      # --------------------------------------------------
+      # The most "professional" Sanskrit romanization scheme.
+      #
+      iast: {
+        vowels: "a ā i ī u ū ṛ ṝ ḷ ḹ  e ai  o au".w_split,
+        other_marks: ["ṃ", "ḥ", "~"],
+        virama: [""],
+        consonants: "k kh g gh ṅ c ch j jh ñ ṭ ṭh ḍ ḍh ṇ t th d dh n p ph b bh m y r l v ś ṣ s h ḻ kṣ jñ".w_split,
+        symbols: "0 1 2 3 4 5 6 7 8 9 oṃ ' । ॥".w_split,
+      },
+      # ITRANS
+      # ------
+      # One of the first romanization schemes -- and one of the most
+      # complicated. For alternate forms, see the `@all_alternates` hash
+      # below.
+      #
+      # '_' is a "null" letter, which allows adjacent vowels.
+      #
+      itrans: {
+        vowels: "a A i I u U RRi RRI LLi LLI  e ai  o au".w_split,
+        other_marks: ["M", "H", ".N"],
+        virama: [""],
+        consonants: "k kh g gh ~N ch Ch j jh ~n T Th D Dh N t th d dh n p ph b bh m y r l v sh Sh s h L kSh j~n".w_split,
+        symbols: "0 1 2 3 4 5 6 7 8 9 OM .a | ||".w_split,
+        candra: [".c"],
+        zwj: ["{}"],
+        skip: ["_"],
+        accent: ["\\'", "\\_"],
+        combo_accent: "\\'H \\_H \\'M \\_M".w_split,
+        other: "q K G z .D .Dh f Y R".w_split,
+      },
+      # Harvard-Kyoto
+      # -------------
+      # A simple 1:1 mapping.
+      #
+      hk: {
+        vowels: "a A i I u U R RR lR lRR  e ai  o au".w_split,
+        other_marks: "M H ~".w_split,
+        virama: [""],
+        consonants: "k kh g gh G c ch j jh J T Th D Dh N t th d dh n p ph b bh m y r l v z S s h L kS jJ".w_split,
+        symbols: "0 1 2 3 4 5 6 7 8 9 OM ' | ||".w_split,
+      },
+      # National Library at Kolkata
+      # ---------------------------
+      # Apart from using "ē" and "ō" instead of "e" and "o", this scheme is
+      # identical to IAST. ṝ, ḷ, and ḹ are not part of the scheme proper.
+      #
+      # It is derived from IAST at load time in sanscript/transliterate.rb.
+      #
+      # Sanskrit Library Phonetic Basic
+      # -------------------------------
+      # With one ASCII letter per phoneme, this is the tersest transliteration
+      # scheme in use today and is especially suited to computer processing.
+      #
+      slp1: {
+        vowels: "a A i I u U f F x X  e E  o O".w_split,
+        other_marks: "M H ~".w_split,
+        virama: [""],
+        consonants: "k K g G N c C j J Y w W q Q R t T d D n p P b B m y r l v S z s h L kz jY".w_split,
+        symbols: "0 1 2 3 4 5 6 7 8 9 oM ' . ..".w_split,
+      },
+      # Velthuis
+      # --------
+      # A case-insensitive Sanskrit encoding.
+      #
+      velthuis: {
+        vowels: "a aa i ii u uu .r .rr .li .ll  e ai  o au".w_split,
+        other_marks: ".m .h ".w_split,
+        virama: [""],
+        consonants: 'k kh g gh "n c ch j jh ~n .t .th .d .d .n t th d dh n p ph b bh m y r l v ~s .s s h L k.s j~n'.w_split,
+        symbols: "0 1 2 3 4 5 6 7 8 9 o.m ' | ||".w_split,
+      },
+      # WX
+      # --
+      # As terse as SLP1.
+      #
+      wx: {
+        vowels: "a A i I u U q Q L   e E  o O".w_split,
+        other_marks: "M H z".w_split,
+        virama: [""],
+        consonants: "k K g G f c C j J F t T d D N w W x X n p P b B m y r l v S R s h  kR jF".w_split,
+        symbols: "0 1 2 3 4 5 6 7 8 9 oM ' | ||".w_split,
+      },
+    }
+    # Set of names of schemes
+    @roman_schemes = Set.new
+    # Map of alternate encodings.
+    @all_alternates = {
+      itrans: {
+        "A" => ["aa"],
+        "I" => %w[ii ee],
+        "U" => %w[uu oo],
+        "RRi" => ["R^i"],
+        "RRI" => ["R^I"],
+        "LLi" => ["L^i"],
+        "LLI" => ["L^I"],
+        "M" => [".m", ".n"],
+        "~N" => ["N^"],
+        "ch" => ["c"],
+        "Ch" => %w[C chh],
+        "~n" => ["JN"],
+        "v" => ["w"],
+        "Sh" => %w[S shh],
+        "kSh" => %w[kS x],
+        "j~n" => %w[GY dny],
+        "OM" => ["AUM"],
+        "\\_" => ["\\`"],
+        "\\_H" => ["\\`H"],
+        "\\'M" => ["\\'.m", "\\'.n"],
+        "\\_M" => "\\_.m \\_.n \\`M \\`.m \\`.n".w_split,
+        ".a" => ["~"],
+        "|" => ["."],
+        "||" => [".."],
+        "z" => ["J"],
+      },
+    }
+  end
+end