RubyGems - lang - Versions diffs - 0.1.0.pre - Mend

lang 0.1.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

data/README.rdoc +46 -0
data/bin/lang +150 -0
data/lib/lang/subtags.rb +147 -0
data/lib/lang/subtags/entry.rb +40 -0
data/lib/lang/subtags/extlang.rb +19 -0
data/lib/lang/subtags/grandfathered.rb +9 -0
data/lib/lang/subtags/language.rb +18 -0
data/lib/lang/subtags/redundant.rb +9 -0
data/lib/lang/subtags/region.rb +9 -0
data/lib/lang/subtags/script.rb +9 -0
data/lib/lang/subtags/variant.rb +17 -0
data/lib/lang/tag.rb +141 -0
data/lib/lang/tag/canonicalization.rb +376 -0
data/lib/lang/tag/composition.rb +141 -0
data/lib/lang/tag/filtering.rb +143 -0
data/lib/lang/tag/grandfathered.rb +36 -0
data/lib/lang/tag/langtag.rb +437 -0
data/lib/lang/tag/lookup.rb +77 -0
data/lib/lang/tag/pattern.rb +31 -0
data/lib/lang/tag/privateuse.rb +34 -0
data/lib/lang/version.rb +5 -0
metadata +108 -0

data/lib/lang/tag/composition.rb ADDED Viewed

@@ -0,0 +1,141 @@
+module Lang #:nodoc:
+  module Tag
+    # Handles abstract compositions of subtags
+    # incl. basic and extended language-ranges.
+    #
+    # ==== Example
+    #
+    #   class LanguageRange < Lang::Tag::Composition
+    #
+    #     def initialize(thing)
+    #       raise TypeError, "Can't convert #{thing.class} into String" unless thing.respond_to?(:to_str)
+    #       sequence = thing.to_str
+    #       unless /^(?:\*|[a-z]{1,8})(?:-[a-z\d]{1,8}|-\*)*$/i === sequence
+    #         raise Error, "#{sequence.inspect} is not a language-range."
+    #       end
+    #       @sequence = sequence
+    #     end
+    #
+    #     def simplify! # to basic language-range
+    #       /^\*-/ === @sequence ? @sequence = '*' : @sequence.gsub!('-*','')
+    #       dirty
+    #     end
+    #
+    #   end
+    #
+    class Composition
+      def initialize(thing)
+        raise TypeError, "Can't convert #{thing.class} into String" unless thing.respond_to?(:to_str)
+        @sequence = thing.to_str
+      end
+      # Returns +true+ if compositions are equal.
+      # Allows comparison against +Strings+.
+      #
+      def ===(other)
+        return false unless other.respond_to?(:to_str)
+        s = other.to_str
+        composition == s || composition == s.downcase
+      end
+      # Returns +true+ if Compositions are equal.
+      #
+      def ==(other)
+        return false unless other.kind_of?(self.class)
+        self.composition == other.composition
+      end
+      def eql?(other)
+        return false unless other.kind_of?(self.class)
+        self.to_s == other.to_s
+      end
+      def hash
+        to_s.hash
+      end
+      def composition
+        @composition ||= to_s.downcase
+      end
+      def to_s
+        @sequence
+      end
+      alias :to_str :to_s
+      def to_a
+        to_s.split(HYPHEN_SPLITTER)
+      end
+      def decomposition
+        @decomposition ||= composition.split(HYPHEN_SPLITTER)
+      end
+      private :decomposition
+      def dirty
+        @composition = nil
+        @decomposition = nil
+        nil
+      end
+      private :dirty
+      # Duplicates self.
+      #
+      def dup
+        self.class.new(to_s.dup)
+      end
+      def length
+        to_s.length
+      end
+      # Returns the number of subtags in self.
+      #
+      def subtags_count
+        to_s.count(HYPHEN) + 1
+      end
+      #--
+      # RFC 5646, Section 2.1.1
+      # An implementation can reproduce this format without accessing the
+      # registry as follows.  All subtags, including extension and private
+      # use subtags, use lowercase letters with two exceptions: two-letter
+      # and four-letter subtags that neither appear at the start of the tag
+      # nor occur after singletons.  Such two-letter subtags are all
+      # uppercase (as in the tags "en-CA-x-ca" or "sgn-BE-FR") and four-
+      # letter subtags are titlecase (as in the tag "az-Latn-x-latn").
+      #++
+      def nicecase!
+        @sequence.downcase!
+        @sequence.gsub!(/-(?:([a-z\d]{4})|[a-z\d]{2}|[a-z\d]-.*)(?=-|$)/) do |sequence|
+          if $1
+            sequence = HYPHEN + $1.capitalize
+          elsif sequence.size == 3
+            sequence.upcase!
+          end
+          sequence
+        end
+        nil
+      end
+      def nicecase
+        duplicated = self.dup
+        duplicated.nicecase!
+        duplicated
+      end
+      def inspect
+        sprintf("#<%s:%#0x %s>", self.class.to_s, self.object_id, self.to_s)
+      end
+    end
+  end
+end
+# EOF

data/lib/lang/tag/filtering.rb ADDED Viewed

@@ -0,0 +1,143 @@
+require 'lang/tag'
+module Lang #:nodoc:
+  module Tag
+    # Basic and extended filtering.
+    # RFC 4647, Sections 3.3.1, 3.3.2.
+    #
+    module Filtering
+      WILDCARD = '*'.freeze
+      #--
+      # RFC 4647, Section 3.3.2 ('Extended Filtering')
+      #
+      # Much like basic filtering, extended filtering selects content with
+      # arbitrarily long tags that share the same initial subtags as the
+      # language range.  In addition, extended filtering selects language
+      # tags that contain any intermediate subtags not specified in the
+      # language range.  For example, the extended language range "de-*-DE"
+      # (or its synonym "de-DE") matches all of the following tags:
+      #
+      #   de-DE (German, as used in Germany)
+      #   de-de (German, as used in Germany)
+      #   de-Latn-DE (Latin script)
+      #   de-Latf-DE (Fraktur variant of Latin script)
+      #   de-DE-x-goethe (private-use subtag)
+      #   de-Latn-DE-1996 (orthography of 1996)
+      #   de-Deva-DE (Devanagari script)
+      #
+      # The same range does not match any of the following tags for the
+      # reasons shown:
+      #
+      #   de (missing 'DE')
+      #   de-x-DE (singleton 'x' occurs before 'DE')
+      #   de-Deva ('Deva' not equal to 'DE')
+      #++
+      # Checks if the *extended* language-range (in the shortest notation)
+      # passed matches self.
+      #
+      # ==== Example
+      #   Lang::Tag('de-DE').matched_by_extended_range?('de-*-DE) #=> true
+      #   Lang::Tag('de-DE-x-goethe').matched_by_extended_range?('de-*-DE) #=> true
+      #   Lang::Tag('de-Latn-DE').matched_by_extended_range?('de-*-DE) #=> true
+      #   Lang::Tag('de-Latf-DE').matched_by_extended_range?('de-*-DE) #=> true
+      #   Lang::Tag('de-x-DE').matched_by_extended_range?('de-*-DE) #=> false
+      #   Lang::Tag('de-Deva').matched_by_extended_range?('de-*-DE) #=> false
+      #
+      def matched_by_extended_range?(range)
+        subtags = decomposition.dup
+        subranges = range.to_str.downcase.split(HYPHEN_SPLITTER)
+        subrange = subranges.shift
+        subtag = subtags.shift
+        while subrange
+          if subrange == WILDCARD
+            subrange = subranges.shift
+          elsif subtag == nil
+            return false
+          elsif subtag == subrange
+            subtag = subtags.shift
+            subrange = subranges.shift
+          elsif subtag.size == 1
+            return false
+          else
+            subtag = subtags.shift
+          end
+        end
+        true
+      rescue
+        false
+      end
+      #--
+      # RFC 4647, Section 3.3.1 ('Basic Filtering')
+      #
+      # A language range matches a
+      # particular language tag if, in a case-insensitive comparison, it
+      # exactly equals the tag, or if it exactly equals a prefix of the tag
+      # such that the first character following the prefix is "-".  For
+      # example, the language-range "de-de" (German as used in Germany)
+      # matches the language tag "de-DE-1996" (German as used in Germany,
+      # orthography of 1996), but not the language tags "de-Deva" (German as
+      # written in the Devanagari script) or "de-Latn-DE" (German, Latin
+      # script, as used in Germany).
+      #++
+      # Checks if the *basic* language-range passed matches self.
+      #
+      # ==== Example
+      #   tag = Lang::Tag('de-Latn-DE')
+      #   tag.matched_by_basic_range?('de-Latn-DE') #=> true
+      #   tag.matched_by_basic_range?('de-Latn') #=> true
+      #   tag.matched_by_basic_range?('*') #=> true
+      #   tag.matched_by_basic_range?('de-La') #=> false
+      #   tag.matched_by_basic_range?('de-de') #=> false
+      #   tag.matched_by_basic_range?('malformedlangtag') #=> false
+      #
+      def matched_by_basic_range?(range)
+        if range.kind_of?(Composition)
+          s = range.composition
+        elsif range.respond_to?(:to_str)
+          s = range.to_str.downcase
+          return true if s == WILDCARD
+        else
+          return false
+        end
+        composition == s ||
+        composition.index(s + HYPHEN) == 0
+      end
+      alias :has_prefix? :matched_by_basic_range?
+    end
+    #--
+    # Filtering is defined for the language tags only.
+    #
+    # RFC 4647, Section 3.3
+    # Filtering is used to select the set of language tags
+    # that matches a given language priority list.
+    #++
+    class Langtag # mixes the filtering predicates into normal language tags
+      include Filtering
+    end
+    class Grandfathered # mixes the filtering predicates into grandfathered tags
+      include Filtering
+    end
+    class Privateuse # mixes the filtering predicates into Privateuse (class body not visible in this file)
+      include Filtering
+    end
+  end
+end
+# EOF

data/lib/lang/tag/grandfathered.rb ADDED Viewed

@@ -0,0 +1,36 @@
+require 'lang/tag'
+module Lang #:nodoc:
+  module Tag
+    def self.Grandfathered(thing)
+      return thing if Grandfathered === thing
+      Grandfathered.new(thing)
+    end
+    # Handles grandfathered registrations.
+    #
+    class Grandfathered < Composition
+      # Builds a grandfathered tag out of a string-like object.
+      #
+      # Raises +TypeError+ unless +thing+ responds to +to_str+ and
+      # +ArgumentError+ unless Lang::Tag.grandfathered? accepts the string.
+      #
+      def initialize(thing)
+        unless thing.respond_to?(:to_str)
+          raise TypeError, "Can't convert #{thing.class} into String"
+        end
+        candidate = thing.to_str
+        if Lang::Tag.grandfathered?(candidate)
+          @sequence = candidate
+        else
+          raise ArgumentError, "#{candidate.inspect} is not a grandfathered language tag"
+        end
+      end
+      # Converts self into a Langtag built from the value that the
+      # GRANDFATHERED table holds for the downcased sequence.
+      #
+      # Raises +Error+ when the table has no (truthy) value for it.
+      #
+      def to_langtag
+        preferred_value = GRANDFATHERED[@sequence.downcase]
+        if preferred_value
+          Tag::Langtag(preferred_value)
+        else
+          raise Error, "There is no preferred value for the grandfathered language tag #{@sequence.inspect}."
+        end
+      end
+    end
+  end
+end
+# EOF

data/lib/lang/tag/langtag.rb ADDED Viewed

@@ -0,0 +1,437 @@
+require 'lang/tag'
+module Lang
+  module Tag
+    def self.Langtag(thing = nil)
+      return thing if Langtag === thing
+      Langtag.new(thing)
+    end
+    # Handles the 'langtag' production
+    # i.e normal language tags.
+    #
+    class Langtag < Composition
+      attr_reader :language, :script, :region, :variants_sequence, :extensions_sequence, :privateuse_sequence
+      def initialize(thing = nil)
+        recompose(thing) if thing
+      end
+      #--
+      # RFC 5646, sec. 2.2.1:
+      # The primary language subtag is the first subtag in a language tag and
+      # cannot be omitted, with two exceptions:
+      #
+      # The single-character subtag 'x' as the primary subtag indicates
+      # that the language tag consists solely of subtags whose meaning is
+      # defined by private agreement. For example, in the tag "x-fr-CH",
+      # the subtags 'fr' and 'CH' do not represent the French language or
+      # the country of Switzerland (or any other value in the IANA
+      # registry) unless there is a private agreement in place to do so.
+      # See Section 4.6.
+      #
+      # The single-character subtag 'i' is used by some grandfathered tags
+      # (see Section 2.2.8) such as "i-klingon" and "i-bnn". (Other
+      # grandfathered tags have a primary language subtag in their first
+      # position.)
+      #++
+      #--
+      # RFC 5646, sec. 2.2.2:
+      # Extended language subtags are used to identify certain specially
+      # selected languages that, for various historical and compatibility
+      # reasons, are closely identified with or tagged using an existing
+      # primary language subtag. Extended language subtags are always used
+      # with their enclosing primary language subtag (indicated with a
+      # 'Prefix' field in the registry) when used to form the language tag.
+      #++
+      # Sets the language component for this langtag.
+      #
+      def language=(value)
+        raise InvalidComponentError, "Primary subtag cannot be omitted." unless value
+        sequence = value.to_str
+        if LANGUAGE_REGEX !~ sequence
+          raise InvalidComponentError,
+          "#{value.inspect} does not conform to the 'language' ABNF " \
+          "or to the associated rules."
+        end
+        @language = sequence
+        @primary  = nil
+        @extlang  = nil
+        dirty
+        validate
+      end
+      # Returns a primary language subtag.
+      #
+      def primary
+        return nil unless @language
+        decompose_language unless @primary
+        @primary
+      end
+      # Returns a second component of the extended language, if any.
+      #
+      def extlang
+        return nil unless @language
+        decompose_language unless @primary
+        @extlang
+      end
+      # Decomposes a language component.
+      #
+      def decompose_language
+        @primary, @extlang = @language.split(HYPHEN_SPLITTER, 2)
+        nil
+      end
+      protected :decompose_language
+      #--
+      # RFC 5646, sec. 2.2.3:
+      # Script subtags are used to indicate the script or writing system
+      # variations that distinguish the written forms of a language or its
+      # dialects.
+      #++
+      # Sets the script component for this langtag.
+      #
+      def script=(value)
+        subtag = value ? value.to_str : nil
+        if subtag && SCRIPT_REGEX !~ subtag
+          raise InvalidComponentError, "#{value.inspect} does not conform to the 'script' ABNF."
+        end
+        @script = subtag
+        dirty
+        validate
+      end
+      #--
+      # RFC 5646, sec. 2.2.4:
+      # Region subtags are used to indicate linguistic variations associated
+      # with or appropriate to a specific country, territory, or region.
+      # Typically, a region subtag is used to indicate variations such as
+      # regional dialects or usage, or region-specific spelling conventions.
+      # It can also be used to indicate that content is expressed in a way
+      # that is appropriate for use throughout a region, for instance,
+      # Spanish content tailored to be useful throughout Latin America.
+      #++
+      # Sets the region component for this langtag.
+      #
+      def region=(value)
+        subtag = value ? value.to_str : nil
+        if subtag && REGION_REGEX !~ subtag
+          raise InvalidComponentError, "#{value.inspect} does not conform to the 'region' ABNF."
+        end
+        @region = subtag
+        dirty
+        validate
+      end
+      #--
+      # RFC 5646, sec. 2.2.5:
+      # Variant subtags are used to indicate additional, well-recognized
+      # variations that define a language or its dialects that are not
+      # covered by other available subtags.
+      #++
+      # Sets the sequence of variants for this langtag.
+      #
+      # ==== Example
+      #
+      #   tag = Lang::Tag('ja')
+      #   tag.variants_sequence = 'hepburn-heploc'
+      #   tag.variants #=> ['hepburn', 'heploc']
+      #   tag.has_variant?('heploc') #=> true
+      #   tag.has_variant?('nedis') #=> false
+      #
+      def variants_sequence=(value)
+        sequence = value ? value.to_str : nil
+        if sequence && VARIANTS_SEQUENCE_REGEX !~ "#{HYPHEN}#{sequence}"
+          raise InvalidComponentError, "#{value.inspect} does not conform to the 'variants' ABNF."
+        end
+        set_variants_sequence(sequence)
+        dirty
+        validate
+      end
+      # Friendly version of the #variants_sequence=.
+      # Sets the sequence of variants for this langtag.
+      #
+      # ==== Example
+      #
+      #   tag = Lang::Tag('sl')
+      #   tag.variants = ['rozaj', 'solba', '1994']
+      #   tag.variants_sequence #=> 'rozaj-solba-1994'
+      #   tag.variants #=> ['rozaj', 'solba', '1994']
+      #
+      def variants=(value)
+        subtags = Array(value).flatten
+        if subtags.empty?
+          self.variants_sequence = nil
+        else
+          self.variants_sequence = subtags.join(HYPHEN)
+          @variants = subtags
+        end
+      end
+      # Returns a list of variants of this lantag.
+      #
+      def variants
+        return nil unless @variants_sequence
+        @variants ||= @variants_sequence.split(HYPHEN_SPLITTER)
+      end
+      def set_variants_sequence(sequence)
+        if sequence && sequence.downcase.split(HYPHEN_SPLITTER).uniq!
+          raise InvalidComponentError, "#{sequence.inspect} sequence includes repeated variants."
+        end
+        @variants_sequence = sequence
+        @variants = nil
+        nil
+      end
+      protected :set_variants_sequence
+      # Checks if self has a variant or a sequence of
+      # variants passed. Works case-insensitively.
+      #
+      def has_variant?(sequence)
+        return false unless @variants_sequence
+        /(?:^|-)#{sequence}(?:-|$)/i === @variants_sequence
+      end
+      #--
+      # RFC 5646, sec. 2.2.6:
+      # Extensions provide a mechanism for extending language tags for use in
+      # various applications. They are intended to identify information that
+      # is commonly used in association with languages or language tags but
+      # that is not part of language identification.
+      #++
+      # Sets the sequence of extensions for this langtag.
+      #
+      def extensions_sequence=(value)
+        sequence = value ? value.to_str : nil
+        if sequence && EXTENSIONS_SEQUENCE_REGEX !~ "#{HYPHEN}#{sequence}"
+          raise InvalidComponentError, "#{value.inspect} does not conform to the 'extensions' ABNF."
+        end
+        set_extensions_sequence(sequence)
+        dirty
+        validate
+      end
+      # Friendly version of the #extensions_sequence=.
+      # Sets the sequence of extensions for this langtag.
+      #
+      def extensions=(value)
+        subtags = Array(value).flatten
+        self.extensions_sequence = subtags.empty? ? nil : subtags.join(HYPHEN)
+      end
+      def set_extensions_sequence(sequence)
+        if sequence
+          exthash = {}
+          sequence.split(EXTENSIONS_SEQUENCE_SPLITTER).each do |seq|
+            k,v = seq[0...1], seq[2..-1] # sequence.split(HYPHEN_SPLITTER,2)
+            k.downcase!
+            if exthash.key?(k)
+              raise InvalidComponentError, "#{sequence.inspect} sequence includes repeated singletons."
+            end
+            exthash[k] = v
+          end
+          @extensions_sequence = sequence
+          @extensions = exthash
+        else
+          @extensions_sequence = nil
+          @extensions = nil
+        end
+        nil
+      end
+      protected :set_extensions_sequence
+      # Builds an *ordered* list of *downcased* singletons.
+      #
+      def singletons
+        return nil unless @extensions
+        keys = @extensions.keys
+        keys.sort!
+        keys
+      end
+      # Returns a sequense of subtags for a singleton passed.
+      # Works case-insensitively.
+      #
+      def extension(key)
+        return nil unless @extensions
+        sequence = @extensions[key] || @extensions[key = key.downcase]
+        return sequence unless String === sequence
+        @extensions[key] = sequence.split(HYPHEN) #lazy
+        @extensions[key]
+      end
+      # Checks if self has a singleton passed.
+      # Works case-insensitively.
+      #
+      def has_singleton?(key)
+        return false unless @extensions
+        @extensions.key?(key) || @extensions.key?(key.downcase)
+      end
+      alias :has_extension? :has_singleton?
+      #--
+      # RFC 5646, sec. 2.2.7:
+      # Private use subtags are used to indicate distinctions in language
+      # that are important in a given context by private agreement.
+      #
+      # RFC 5646, sec. 2.2.7:
+      # For example, suppose a group of scholars is studying some texts in
+      # medieval Greek.  They might agree to use some collection of private
+      # use subtags to identify different styles of writing in the texts.
+      # For example, they might use 'el-x-koine' for documents in the
+      # "common" style while using 'el-x-attic' for other documents that
+      # mimic the Attic style.  These subtags would not be recognized by
+      # outside processes or systems, but might be useful in categorizing
+      # various texts for study by those in the group.
+      #++
+      def privateuse
+        return nil unless @privateuse_sequence
+        @privateuse ||= @privateuse_sequence.split(HYPHEN)[1..-1]
+      end
+      # Friendly version of the #privateuse_sequence=.
+      # Sets the 'privateuse' sequence for this langtag.
+      #
+      # ==== Example
+      #
+      #   tag = Lang::Tag('de')
+      #   tag.privateuse = ['private', 'use', 'sequence']
+      #   tag.privateuse_sequence #=> 'x-private-use-sequence'
+      #
+      def privateuse=(value)
+        subtags = Array(value).flatten
+        if subtags.empty?
+          self.privateuse_sequence = nil
+        else
+          self.privateuse_sequence = subtags.unshift(PRIVATEUSE).join(HYPHEN)
+          @privateuse = subtags
+        end
+      end
+      # Sets the 'privateuse' sequence for this langtag.
+      #
+      def privateuse_sequence=(value)
+        sequence = value ? value.to_str : nil
+        if sequence && Tag::PRIVATEUSE_REGEX !~ sequence
+          raise InvalidComponentError, "#{value.inspect} does not conform to the 'privateuse' ABNF."
+        end
+        @privateuse_sequence = sequence
+        @privateuse = nil
+        dirty
+        validate
+      end
+      def dirty
+        @sequence = nil
+        super
+      end
+      private :dirty
+      def defer_validation(&block)
+        raise LocalJumpError, "No block given." unless block
+        @validation_deferred = true
+        yield
+        @validation_deferred = false
+        validate
+        nil
+      end
+      def validate
+        return if !!@validation_deferred
+        if @language.nil?
+          raise InvalidComponentError, "Primary subtag cannot be omitted."
+        end
+        nil
+      end
+      private :validate
+      def nicecase!
+        # ugly, but faster than recompose
+        if @language && @language.downcase!
+          @primary = nil
+          @extlang = nil
+        end
+        # [ISO639-1] recommends that language codes be written in lowercase ('mn' Mongolian).
+        # [ISO15924] recommends that script codes use lowercase with the initial letter capitalized ('Cyrl' Cyrillic).
+        # [ISO3166-1] recommends that country codes be capitalized ('MN' Mongolia).
+        @script.capitalize! if @script
+        @region.upcase! if @region
+        @variants = nil if @variants_sequence &&
+          @variants_sequence.downcase!
+        set_extensions_sequence(@extensions_sequence) if @extensions_sequence &&
+          @extensions_sequence.downcase!
+        @privateuse = nil if @privateuse_sequence &&
+          @privateuse_sequence.downcase!
+        @sequence = nil
+      end
+      def to_s
+        return @sequence if @sequence
+        @sequence = ""
+        @sequence << @language if @language
+        @sequence << HYPHEN << @script if @script
+        @sequence << HYPHEN << @region if @region
+        @sequence << HYPHEN << @variants_sequence if @variants_sequence
+        @sequence << HYPHEN << @extensions_sequence if @extensions_sequence
+        @sequence << HYPHEN << @privateuse_sequence if @privateuse_sequence
+        @sequence
+      end
+      def recompose(thing)
+        raise TypeError, "Can't convert #{thing.class} into String" unless thing.respond_to?(:to_str)
+        tag = thing.to_str
+        if LANGTAG_REGEX === tag
+          dirty
+          @sequence               = tag
+          @primary                = nil
+          @extlang                = nil
+          @language               = $1
+          @script                 = $2
+          @region                 = $3
+          set_variants_sequence     $4[1..-1]
+          set_extensions_sequence   $5[1..-1]
+          @privateuse_sequence    = $'[1..-1]
+          @privateuse             = nil
+        else
+          raise ArgumentError, "Ill-formed, grandfathered or 'privateuse' language tag: #{thing.inspect}."
+        end
+        self
+      end
+    end
+  end
+end
+# EOF