RubyGems - lang - Versions diffs - 0.1.0.pre - Mend

lang 0.1.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

data/README.rdoc +46 -0
data/bin/lang +150 -0
data/lib/lang/subtags.rb +147 -0
data/lib/lang/subtags/entry.rb +40 -0
data/lib/lang/subtags/extlang.rb +19 -0
data/lib/lang/subtags/grandfathered.rb +9 -0
data/lib/lang/subtags/language.rb +18 -0
data/lib/lang/subtags/redundant.rb +9 -0
data/lib/lang/subtags/region.rb +9 -0
data/lib/lang/subtags/script.rb +9 -0
data/lib/lang/subtags/variant.rb +17 -0
data/lib/lang/tag.rb +141 -0
data/lib/lang/tag/canonicalization.rb +376 -0
data/lib/lang/tag/composition.rb +141 -0
data/lib/lang/tag/filtering.rb +143 -0
data/lib/lang/tag/grandfathered.rb +36 -0
data/lib/lang/tag/langtag.rb +437 -0
data/lib/lang/tag/lookup.rb +77 -0
data/lib/lang/tag/pattern.rb +31 -0
data/lib/lang/tag/privateuse.rb +34 -0
data/lib/lang/version.rb +5 -0
metadata +108 -0

data/lib/lang/subtags/region.rb ADDED Viewed

@@ -0,0 +1,9 @@
+module Lang #:nodoc:
+  module Subtags
+    # Holds data about region subtags.
+    class Region < Entry
+    end
+  end
+end
+# EOF

data/lib/lang/subtags/script.rb ADDED Viewed

@@ -0,0 +1,9 @@
+module Lang #:nodoc:
+  module Subtags
+    # Holds data about script subtags.
+    class Script < Entry
+    end
+  end
+end
+# EOF

data/lib/lang/subtags/variant.rb ADDED Viewed

@@ -0,0 +1,17 @@
+module Lang #:nodoc:
+  module Subtags
+    # Holds data about variant subtags.
+    class Variant < Entry
+      attr_reader :prefixes
+      def add_prefix(prefix)
+        @prefixes ||= []
+        @prefixes << prefix
+      end
+    end
+  end
+end
+# EOF

data/lib/lang/tag.rb ADDED Viewed

@@ -0,0 +1,141 @@
+require 'lang/tag/pattern'
+require 'lang/tag/composition'
+require 'lang/tag/langtag'
+require 'lang/tag/grandfathered'
+require 'lang/tag/privateuse'
+module Lang
+  def self.Tag(thing)
+    #return thing if Tag::Composition === thing
+    Tag::Grandfathered(thing) rescue
+    Tag::Langtag(thing) rescue
+    Tag::Privateuse(thing)
+  rescue
+    raise ArgumentError, "#{thing.inspect} is not a language tag."
+  end
+  # Namespace holding the tag error classes, the tables of grandfathered
+  # tags, the regular expressions assembled from PATTERN and a few
+  # predicates that classify a string as a language tag.
+  module Tag
+    # Base class for tag-related errors.
+    class Error < StandardError
+    end
+    # Signals an unacceptable tag component.
+    class InvalidComponentError < Error
+    end
+    #--
+    # Grandfathered tags that do not match the 'langtag' production in the
+    # ABNF and would otherwise be invalid are considered 'irregular'
+    # grandfathered tags.  With the exception of "en-GB-oed", which is a
+    # variant of "en-GB", each of them, in its entirety, represents a
+    # language.
+    #++
+    # Irregular grandfathered tags, keyed by the lowercase tag.
+    # NOTE(review): the values look like the replacement language subtag
+    # (nil when there is none) -- confirm against the code that reads them.
+    IRREGULAR = {
+      'en-gb-oed'   => nil   ,
+      'i-ami'       => 'ami' ,
+      'i-bnn'       => 'bnn' ,
+      'i-default'   => nil   ,
+      'i-enochian'  => nil   ,
+      'i-hak'       => 'hak' ,
+      'i-klingon'   => 'tlh' ,
+      'i-lux'       => 'lb'  ,
+      'i-mingo'     => nil   ,
+      'i-navajo'    => 'nv'  ,
+      'i-pwn'       => 'pwn' ,
+      'i-tao'       => 'tao' ,
+      'i-tay'       => 'tay' ,
+      'i-tsu'       => 'tsu' ,
+      'sgn-be-fr'   => 'sfb' ,
+      'sgn-be-nl'   => 'vgt' ,
+      'sgn-ch-de'   => 'sgg' ,
+    }.freeze
+    #--
+    # Grandfathered tags that (appear to) match the 'langtag' production in
+    # Figure 1 are considered 'regular' grandfathered tags.  These tags
+    # contain one or more subtags that either do not individually appear in
+    # the registry or appear but with a different semantic meaning: each
+    # tag, in its entirety, represents a language or collection of
+    # languages.
+    #++
+    # All grandfathered tags: the irregular ones plus the regular ones
+    # listed here, keyed by the lowercase tag.
+    GRANDFATHERED = IRREGULAR.merge(
+      'art-lojban'  => 'jbo' ,
+      'cel-gaulish' => nil   ,
+      'no-bok'      => 'nb'  ,
+      'no-nyn'      => 'nn'  ,
+      'zh-guoyu'    => 'cmn' ,
+      'zh-hakka'    => 'hak' ,
+      'zh-min'      => nil   ,
+      'zh-min-nan'  => 'nan' ,
+      'zh-xiang'    => 'hsn'
+    ).freeze
+    HYPHEN                                = '-'.freeze
+    # A Regexp is used as the splitter on rubies older than 1.9.1 and the
+    # plain hyphen String everywhere else.
+    HYPHEN_SPLITTER                       = RUBY_VERSION < '1.9.1' ? /-/.freeze : HYPHEN
+    PRIVATEUSE                            = 'x'.freeze
+    # The validating expressions below are anchored with \A and \z rather
+    # than ^ and $. The latter match at every line boundary, so a string
+    # like "en\n<anything>" used to be accepted; now only a whole string
+    # can match (a trailing newline is rejected as well).
+    LANGUAGE_REGEX                        = /\A(?:#{PATTERN::LANGUAGE})\z/io.freeze
+    SCRIPT_REGEX                          = /\A(?:#{PATTERN::SCRIPT})\z/io.freeze
+    REGION_REGEX                          = /\A(?:#{PATTERN::REGION})\z/io.freeze
+    VARIANTS_SEQUENCE_REGEX               = /\A(?:#{PATTERN::VARIANT_SEQUENCE}+)\z/io.freeze
+    EXTENSIONS_SEQUENCE_REGEX             = /\A#{PATTERN::EXTENSION_SEQUENCE}+\z/io.freeze
+    # Not a validator: splits an extension sequence in front of every
+    # singleton. Left exactly as it was.
+    EXTENSIONS_SEQUENCE_SPLITTER          = /(?:^|-)(?=#{PATTERN::SINGLETON}-)/io.freeze
+    PRIVATEUSE_REGEX                      = /\A#{PATTERN::PRIVATEUSE}\z/io.freeze
+    # Captures, in this order: language, script, region, variant sequence,
+    # extension sequence. The privateuse part is only looked ahead at.
+    LANGTAG_REGEX = /\A
+      (#{PATTERN::LANGUAGE})              (?# shortest ISO 639 code plus extlang or reserved for future use or registered language subtag)
+      (?:-(#{PATTERN::SCRIPT}))?          (?# ISO 15924 code)
+      (?:-(#{PATTERN::REGION}))?          (?# ISO 3166-1 code or UN M.49 code)
+      (#{PATTERN::VARIANT_SEQUENCE}*)?    (?# registered variants)
+      (#{PATTERN::EXTENSION_SEQUENCE}*)?  (?# extensions)
+      (?=(?:-#{PATTERN::PRIVATEUSE})?\z)  (?# privateuse)
+      /iox.freeze
+    # Same layout as LANGTAG_REGEX but built on PATTERN::LOOSE_LANGUAGE and
+    # without capture groups; used by ::wellformed?.
+    LANGTAG_WELLFORMEDNESS_REGEX = /\A
+      (?:#{PATTERN::LOOSE_LANGUAGE})      (?# shortest ISO 639 code plus at most 3 extlangs or reserved for future use or registered language subtag)
+      (?:-(?:#{PATTERN::SCRIPT}))?        (?# ISO 15924 code)
+      (?:-(?:#{PATTERN::REGION}))?        (?# ISO 3166-1 code or UN M.49 code)
+      (?=#{PATTERN::VARIANT_SEQUENCE}*    (?# registered variants)
+      #{PATTERN::EXTENSION_SEQUENCE}*     (?# extensions)
+      (?:-#{PATTERN::PRIVATEUSE})?\z)     (?# privateuse)
+      /iox.freeze
+    class << self
+      # Checks if the +String+ passed represents a 'privateuse' language tag.
+      # Works case-insensitively. Returns +true+ or +false+.
+      #
+      def privateuse?(snippet)
+        PRIVATEUSE_REGEX === snippet
+      end
+      # Checks if the +String+ passed represents a 'grandfathered' language tag.
+      # Works case-insensitively: the snippet is looked up as given first
+      # and in its downcased form second. Returns +true+ or +false+.
+      #
+      def grandfathered?(snippet)
+        GRANDFATHERED.key?(snippet) || GRANDFATHERED.key?(snippet.downcase)
+      end
+      #--
+      # RFC 5646, Section 2.2.9:
+      # A tag is considered "well-formed" if it conforms to the ABNF
+      # (Section 2.1). Language tags may be well-formed in terms of syntax
+      # but not valid in terms of content. However, many operations
+      # involving language tags work well without knowing anything about the
+      # meaning or validity of the subtags.
+      #++
+      # Checks if the +String+ passed represents a well-formed language tag,
+      # i.e. a privateuse tag, a grandfathered tag or anything accepted by
+      # LANGTAG_WELLFORMEDNESS_REGEX. Works case-insensitively. Returns
+      # +true+ or +false+.
+      #
+      def wellformed?(snippet)
+        privateuse?(snippet) || grandfathered?(snippet) || LANGTAG_WELLFORMEDNESS_REGEX === snippet
+      end
+    end
+  end
+end
+# EOF

data/lib/lang/tag/canonicalization.rb ADDED Viewed

@@ -0,0 +1,376 @@
+require 'lang/tag'
+require 'lang/subtags'
+module Lang #:nodoc:
+  module Tag
+    module Canonicalization
+      # Handles exceptions that might
+      # appear in canonicalization or validation processes.
+      #
+      class Error < Error
+      end
+      #--
+      # RFC 5646, Section 2.2.1
+      # The subtags in the range 'qaa' through 'qtz' are reserved for
+      # private use in language tags. These subtags correspond to codes
+      # reserved by ISO 639-2 for private use. These codes MAY be used
+      # for non-registered primary language subtags (instead of using
+      # private use subtags following 'x-').
+      #++
+      # Matches the private-use primary language subtags 'qaa'..'qtz',
+      # case-insensitively. Anchored with \A and \z so that only a whole
+      # string can match; ^ and $ would also accept a single matching line
+      # inside a multi-line string.
+      PRIVATE_LANGUAGE_REGEX = /\Aq[a-t][a-z]\z/i.freeze
+      #--
+      # RFC 5646, Section 3.1.7
+      # Extended language subtags always have a mapping to their
+      # identical primary language subtag.  For example, the extended
+      # language subtag 'yue' (Cantonese) can be used to form the tag
+      # "zh-yue". It has a 'Preferred-Value' mapping to the primary
+      # language subtag 'yue', meaning that a tag such as
+      # "zh-yue-Hant-HK" can be canonicalized to "yue-Hant-HK".
+      #++
+      # Canonicalizes the language component, applying the rules that are
+      # described in RFC 5646, sections 2.2.1, 2.2.2 and 4.5. Also validates
+      # the language sequence using the 'Prefix' field-value of the extlang.
+      #
+      # Raises InvalidComponentError when there is no language at all and
+      # Error when the extlang or the primary language is not registered or
+      # when the extlang is used with a wrong prefix. Returns +nil+.
+      #
+      def canonicalize_language
+        raise InvalidComponentError, "Language can not be omitted." unless @language
+        decompose_language unless @primary
+        if @extlang
+          subtag = Subtags::Extlang(@extlang)
+          raise Error, "Extlang #{@extlang.inspect} is not registered." unless subtag
+          # RFC 5646, Section 2.2.2
+          # Extended language subtag records MUST include a 'Preferred-
+          # Value'. The 'Preferred-Value' and 'Subtag' fields MUST be
+          # identical.
+          # RFC 5646, Section 4.5
+          # For extlangs, the original primary language subtag is also
+          # replaced if there is a primary language subtag in the 'Preferred-Value'.
+          # The 'Preferred-Value' field in subtag records of type "extlang" also
+          # contains an "extended language range".  This allows the subtag to be
+          # deprecated in favor of either a single primary language subtag or a
+          # new language-extlang sequence.
+          unless subtag.prefix == @primary ||
+                 subtag.prefix == @primary.downcase # as of now, we have exactly one extlang
+            # RFC 5646, Section 3.4
+            # Extended language subtag records MUST include exactly one
+            # 'Prefix' field indicating an appropriate subtag or sequence of
+            # subtags for that extended language subtag.
+            raise Error, "Extlang #{@extlang.inspect} requires prefix #{subtag.prefix.inspect}."
+          end
+          # The preferred value replaces the whole language sequence; the
+          # decomposed parts are dropped.
+          @language = subtag.preferred_value
+          @primary = nil
+          @extlang = nil
+          dirty
+        elsif PRIVATE_LANGUAGE_REGEX !~ @primary
+          # Private-use languages are exempt from the registry lookup.
+          subtag = Subtags::Language(@primary)
+          raise Error, "Language #{@primary.inspect} is not registered." unless subtag
+          if subtag.preferred_value
+            @language = subtag.preferred_value
+            @primary = nil
+            dirty
+          end
+        end
+        nil
+      end
+      protected :canonicalize_language
+      #--
+      # RFC 5646, Section 2.2.3
+      # The script subtags 'Qaaa' through 'Qabx' are reserved for private
+      # use in language tags. These subtags correspond to codes reserved
+      # by ISO 15924 for private use. These codes MAY be used for non-
+      # registered script values. Please refer to Section 4.6 for more
+      # information on private use subtags.
+      #++
+      # Matches the private-use script subtags 'Qaaa'..'Qabx',
+      # case-insensitively. The range covers 'Qaaa'..'Qaaz' in full and
+      # 'Qaba'..'Qabx'; a single character class for the last letter
+      # ([a-x]) would wrongly leave out 'Qaay' and 'Qaaz'. Anchored with
+      # \A and \z so that only a whole string can match.
+      PRIVATE_SCRIPT_REGEX = /\AQa(?:a[a-z]|b[a-x])\z/i.freeze
+      # Replaces the script by its 'Preferred-Value' when the registry
+      # record has one. Does nothing when there is no script or when the
+      # script is a private-use one.
+      #
+      # Raises Error when the script is not registered. Returns +nil+.
+      #
+      def canonicalize_script
+        return if !@script || PRIVATE_SCRIPT_REGEX === @script
+        subtag = Subtags::Script(@script)
+        raise Error, "Script #{@script.inspect} is not registered." unless subtag
+        if subtag.preferred_value
+          @script = subtag.preferred_value
+          dirty
+        end
+        nil
+      end
+      protected :canonicalize_script
+      #--
+      # RFC 5646, Section 2.2.4
+      # The region subtags 'AA', 'QM'-'QZ', 'XA'-'XZ', and 'ZZ' are
+      # reserved for private use in language tags. These subtags
+      # correspond to codes reserved by ISO 3166 for private use.  These
+      # codes MAY be used for private use region subtags (instead of
+      # using a private use subtag sequence). Please refer to
+      # Section 4.6 for more information on private use subtags.
+      #++
+      # Matches the private-use region subtags, case-insensitively.
+      # Anchored with \A and \z so that only a whole string can match;
+      # ^ and $ would also accept a single matching line inside a
+      # multi-line string.
+      PRIVATE_REGION_REGEX = /\A(?:AA|Q[M-Z]|X[A-Z]|ZZ)\z/i.freeze
+      #--
+      # RFC 5646, Section 4.5
+      # Example: Although the tag "en-BU" (English as used in Burma)
+      # maintains its validity, the language tag "en-BU" is not in canonical
+      # form because the 'BU' subtag has a canonical mapping to 'MM'
+      # (Myanmar).
+      #++
+      # Replaces the region by its 'Preferred-Value' when the registry
+      # record has one. Does nothing when there is no region or when the
+      # region is a private-use one.
+      #
+      # Raises Error when the region is not registered. Returns +nil+.
+      #
+      def canonicalize_region
+        return if !@region || PRIVATE_REGION_REGEX === @region
+        subtag = Subtags::Region(@region)
+        raise Error, "Region #{@region.inspect} is not registered." unless subtag
+        if subtag.preferred_value
+          @region = subtag.preferred_value
+          dirty
+        end
+        nil
+      end
+      protected :canonicalize_region
+      #--
+      # RFC 5646, Section 3.1.8
+      # The 'Prefix' also indicates when variant subtags make sense when used
+      # together (many that otherwise share a 'Prefix' are mutually
+      # exclusive) and what the relative ordering of variants is supposed to
+      # be. For example, the variant '1994' (Standardized Resian
+      # orthography) has several 'Prefix' fields in the registry ("sl-rozaj",
+      # "sl-rozaj-biske", "sl-rozaj-njiva", "sl-rozaj-osojs", and "sl-rozaj-
+      # solba").  This indicates not only that '1994' is appropriate to use
+      # with each of these five Resian variant subtags ('rozaj', 'biske',
+      # 'njiva', 'osojs', and 'solba'), but also that it SHOULD appear
+      # following any of these variants in a tag. Thus, the language tag
+      # ought to take the form "sl-rozaj-biske-1994", rather than "sl-1994-
+      # rozaj-biske" or "sl-rozaj-1994-biske".
+      #++
+      # Takes a 'Prefix' field-value apart. Captures, in this order:
+      # language, script, region and whatever follows the region.
+      PREFIX_REGEX = /^(#{PATTERN::LANGUAGE})(?:-(#{PATTERN::SCRIPT}))?(?:-(#{PATTERN::REGION}))?(?:-(.+))?$/io.freeze
+      # Canonicalizes variants, applying the rules that are described in
+      # RFC 5646, sections 2.2.5 and 4.5. Also validates the sequence of
+      # variants using 'Prefix' field-values (see RFC 5646, Section 3.1.8).
+      #
+      # Raises Error when a variant is not registered or when none of its
+      # prefixes fits the tag. Returns +nil+.
+      #
+      def canonicalize_variants
+        return unless @variants_sequence
+        # Names of the variants accepted so far, joined by hyphens; stays
+        # nil until the first variant has been accepted.
+        sequence = nil
+        # Becomes true as soon as one variant has a preferred value.
+        sequence_dirty = false
+        # NOTE(review): +variants+ is defined outside this view; presumably
+        # it yields the individual variant subtags -- confirm.
+        @variants = variants.map do |variant|
+          v = Subtags::Variant(variant)
+          raise Error, "Variant #{variant.inspect} is not registered." unless v
+          # A variant without prefixes is accepted as is. Otherwise at least
+          # one prefix must fit. The bare match below is there for its side
+          # effect only: it fills $1..$4, which the conditions then compare
+          # with the variants accepted so far ($4), the region ($3), the
+          # script ($2) and the language ($1).
+          if !v.prefixes || v.prefixes.any? { |prefix|
+            PREFIX_REGEX === prefix
+            ($4 == nil || $4 == sequence) &&
+            ($3 == nil || @region && ($3 == @region || $3 == @region.upcase)) &&
+            ($2 == nil || @script && ($2 == @script || $2 == @script.capitalize)) &&
+            ($1 == @language || $1 == @language.downcase)
+            }
+            # Extend the running sequence with the name taken from the
+            # variant's record.
+            sequence ? sequence << HYPHEN : sequence = ""
+            sequence << v.name
+            # The block value becomes the new element of @variants.
+            if v.preferred_value
+              sequence_dirty ||= true
+              v.preferred_value
+            else
+              variant
+            end
+          else raise Error,
+            "Variant #{variant.inspect} requires " \
+            "one of following prefixes: " \
+            "#{v.prefixes.map{ |p| p.inspect }.join(", ")}."
+          end
+        end
+        # The stored sequence is rebuilt only when a preferred value was
+        # substituted.
+        if sequence_dirty
+          @variants_sequence = @variants.join(HYPHEN)
+          dirty
+        end
+        nil
+      end
+      protected :canonicalize_variants
+      #--
+      # RFC 5646, Section 4.5
+      # Example: The language tag "en-a-aaa-b-ccc-bbb-x-xyz" is in canonical
+      # form, while "en-b-ccc-bbb-a-aaa-X-xyz" is well-formed and potentially
+      # valid (extensions 'a' and 'b' are not defined as of the publication
+      # of this document) but not in canonical form (the extensions are not
+      # in alphabetical order).
+      #++
+      def canonicalize_extensions
+        return unless @extensions_sequence
+        ordered = @extensions_sequence.
+          split(EXTENSIONS_SEQUENCE_SPLITTER).
+          sort!{ |k,v| k.downcase <=> v.downcase }.join(HYPHEN)
+        unless @extensions_sequence == ordered
+          @extensions_sequence = ordered
+          dirty
+        end
+        nil
+      end
+      protected :canonicalize_extensions
+      #--
+      # RFC 5646, Section 3.1.7
+      # For example, the tags "zh-yue-Hant-HK" and "yue-Hant-HK"
+      # are semantically equivalent and ought to be treated as
+      # if they were the same tag.
+      #++
+      def same?(other)
+        self.canonicalize == other.canonicalize
+      end
+      def canonicalize
+        duplicated = self.dup
+        duplicated.canonicalize!
+        duplicated
+      end
+      def canonicalize!
+        # 1. Extension sequences are ordered into case-insensitive ASCII order
+        # by singleton subtag.
+        canonicalize_extensions
+        # A redundant tag is a grandfathered
+        # registration whose individual subtags appear with the same semantic
+        # meaning in the registry. For example, the tag "zh-Hant" (Traditional
+        # Chinese) can now be composed from the subtags 'zh' (Chinese) and
+        # 'Hant' (Han script traditional variant). These redundant tags are
+        # maintained in the registry as records of type 'redundant', mostly as
+        # a matter of historical curiosity.
+        # 2. Redundant or grandfathered tags are replaced by their 'Preferred-
+        # Value', if there is one.
+        if re = Subtags::Redundant(composition)
+          return recompose(re.preferred_value) if re.preferred_value
+        end
+        # 3. Subtags are replaced by their 'Preferred-Value', if there is one.
+        # For extlangs, the original primary language subtag is also
+        # replaced if there is a primary language subtag in the 'Preferred-
+        # Value'.
+        canonicalize_language
+        canonicalize_script
+        canonicalize_region
+        canonicalize_variants
+        nil
+      end
+      alias :to_canonical_form! :canonicalize!
+      alias :to_canonical_form  :canonicalize
+      #--
+      # RFC 5646, Section 4.5
+      # For example, "hak-CN" (Hakka, China) has the primary language
+      # subtag 'hak', which in turn has an 'extlang' record with a
+      # 'Prefix' 'zh' (Chinese).  The extlang form is "zh-hak-CN"
+      # (Chinese, Hakka, China).
+      #++
+      def to_extlang_form!
+        canonicalize!
+        subtag = Subtags::Extlang(@language)
+        @primary = subtag.prefix
+        @extlang = @language
+        @language = "#{@primary}#{HYPHEN}#{@extlang}"
+        dirty
+        nil
+      end
+      def to_extlang_form
+        duplicated = self.dup
+        duplicated.to_extlang_form!
+        duplicated
+      end
+      #--
+      # RFC 5646, Section 4.1
+      # The script subtag SHOULD NOT be used to form language tags unless
+      # the script adds some distinguishing information to the tag.
+      # ...
+      # The field 'Suppress-Script' in the primary or extended language
+      # record in the registry indicates script subtags that do not add
+      # distinguishing information for most applications; this field
+      # defines when users SHOULD NOT include a script subtag with a
+      # particular primary language subtag.
+      #
+      # For example, if an implementation selects content using Basic
+      # Filtering [RFC4647] (originally described in Section 14.4 of
+      # [RFC2616]) and the user requested the language range "en-US",
+      # content labeled "en-Latn-US" will not match the request and thus
+      # not be selected. Therefore, it is important to know when script
+      # subtags will customarily be used and when they ought not be used.
+      #++
+      # Drops the script, in place, when it equals the 'Suppress-Script' of
+      # the primary language. Does nothing when there is no script, no
+      # language, or when the primary language is a private-use one.
+      #
+      # Raises Error when the primary language is not registered.
+      # Returns +nil+.
+      #
+      def suppress_script!
+        return unless @script && @language
+        decompose_language unless @primary
+        return if PRIVATE_LANGUAGE_REGEX === @primary
+        subtag = Subtags::Language(@primary)
+        raise Error, "Language #{@primary.inspect} is not registered." unless subtag
+        suppressed = subtag.suppress_script
+        # The script is compared as given and in its capitalized form, the
+        # way canonicalize_variants compares scripts, so that a tag such as
+        # "en-latn" is handled like "en-Latn".
+        if suppressed && (@script == suppressed || @script.capitalize == suppressed)
+          @script = nil
+          dirty
+        #elsif @extlang
+        #  subtag = Subtags::Extlang(@extlang)
+        #  raise Error, "Extlang #{@extlang.inspect} is not registered." unless subtag
+        #  if subtag.suppress_script && @script == subtag.suppress_script
+        #    dirty
+        #  end
+        end
+        nil
+      end
+      def suppress_script
+        duplicated = self.dup
+        duplicated.suppress_script!
+        duplicated
+      end
+    end
+    # Reopens Langtag to mix the Canonicalization methods into it.
+    class Langtag; include Canonicalization; end
+  end
+end
+# EOF