twitter_cldr 1.6.1 → 1.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
 - data/Rakefile +8 -3
 - data/lib/twitter_cldr/collation/collator.rb +9 -11
 - data/lib/twitter_cldr/collation/trie.rb +8 -0
 - data/lib/twitter_cldr/collation/trie_builder.rb +12 -15
 - data/lib/twitter_cldr/collation/trie_loader.rb +50 -0
 - data/lib/twitter_cldr/collation/trie_with_fallback.rb +6 -4
 - data/lib/twitter_cldr/collation.rb +1 -0
 - data/lib/twitter_cldr/resources/tailoring_importer.rb +203 -0
 - data/lib/twitter_cldr/resources/tries_dumper.rb +43 -0
 - data/lib/twitter_cldr/resources.rb +3 -2
 - data/lib/twitter_cldr/version.rb +1 -1
 - data/resources/collation/tries/af.dump +0 -0
 - data/resources/collation/tries/ar.dump +0 -0
 - data/resources/collation/tries/ca.dump +0 -0
 - data/resources/collation/tries/cs.dump +0 -0
 - data/resources/collation/tries/da.dump +0 -0
 - data/resources/collation/tries/de.dump +0 -0
 - data/resources/collation/tries/default.dump +0 -0
 - data/resources/collation/tries/el.dump +0 -0
 - data/resources/collation/tries/en.dump +0 -0
 - data/resources/collation/tries/es.dump +0 -0
 - data/resources/collation/tries/eu.dump +0 -0
 - data/resources/collation/tries/fa.dump +0 -0
 - data/resources/collation/tries/fi.dump +0 -0
 - data/resources/collation/tries/fil.dump +0 -0
 - data/resources/collation/tries/fr.dump +0 -0
 - data/resources/collation/tries/he.dump +0 -0
 - data/resources/collation/tries/hi.dump +0 -0
 - data/resources/collation/tries/hu.dump +0 -0
 - data/resources/collation/tries/id.dump +0 -0
 - data/resources/collation/tries/it.dump +0 -0
 - data/resources/collation/tries/ja.dump +0 -0
 - data/resources/collation/tries/ko.dump +0 -0
 - data/resources/collation/tries/ms.dump +0 -0
 - data/resources/collation/tries/nb.dump +0 -0
 - data/resources/collation/tries/nl.dump +0 -0
 - data/resources/collation/tries/pl.dump +0 -0
 - data/resources/collation/tries/pt.dump +0 -0
 - data/resources/collation/tries/ru.dump +0 -0
 - data/resources/collation/tries/sv.dump +0 -0
 - data/resources/collation/tries/th.dump +0 -0
 - data/resources/collation/tries/tr.dump +0 -0
 - data/resources/collation/tries/uk.dump +0 -0
 - data/resources/collation/tries/ur.dump +0 -0
 - data/resources/collation/tries/zh-Hant.dump +0 -0
 - data/resources/collation/tries/zh.dump +0 -0
 - data/spec/collation/collation_spec.rb +4 -2
 - data/spec/collation/collator_spec.rb +36 -30
 - data/spec/collation/tailoring_spec.rb +3 -1
 - data/spec/collation/tailoring_tests/ja.txt +6 -5
 - data/spec/collation/trie_builder_spec.rb +21 -26
 - data/spec/collation/trie_dumps_spec.rb +26 -0
 - data/spec/collation/trie_loader_spec.rb +72 -0
 - data/spec/collation/trie_spec.rb +14 -0
 - data/spec/collation/trie_with_fallback_spec.rb +6 -0
 - data/spec/normalization/normalization_spec.rb +2 -2
 - metadata +43 -21
 - data/lib/twitter_cldr/resources/import/tailoring.rb +0 -202
 - data/lib/twitter_cldr/resources/import.rb +0 -12
 
| 
         @@ -1,202 +0,0 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            # encoding: UTF-8
         
     | 
| 
       2 
     | 
    
         
            -
             
     | 
| 
       3 
     | 
    
         
            -
            # Copyright 2012 Twitter, Inc
         
     | 
| 
       4 
     | 
    
         
            -
            # http://www.apache.org/licenses/LICENSE-2.0
         
     | 
| 
       5 
     | 
    
         
            -
             
     | 
| 
       6 
     | 
    
         
            -
            require 'nokogiri'
         
     | 
| 
       7 
     | 
    
         
            -
            require 'yaml'
         
     | 
| 
       8 
     | 
    
         
            -
            require 'java'
         
     | 
| 
       9 
     | 
    
         
            -
             
     | 
| 
       10 
     | 
    
         
            -
            module TwitterCldr
         
     | 
| 
       11 
     | 
    
         
            -
              module Resources
         
     | 
| 
       12 
     | 
    
         
            -
                module Import
         
     | 
| 
       13 
     | 
    
         
            -
             
     | 
| 
       14 
     | 
    
         
            -
                  # This class should be used with JRuby 1.7 in 1.9 mode and ICU4J version 49.1 (available at
         
     | 
| 
       15 
     | 
    
         
            -
                  # http://download.icu-project.org/files/icu4j/49.1/icu4j-49_1.jar).
         
     | 
| 
       16 
     | 
    
         
            -
                  #
         
     | 
| 
       17 
     | 
    
         
            -
                  class Tailoring
         
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
                    SUPPORTED_RULES   = %w[p s t i pc sc tc ic x]
         
     | 
| 
       20 
     | 
    
         
            -
                    SIMPLE_RULES      = %w[p s t i]
         
     | 
| 
       21 
     | 
    
         
            -
                    LEVEL_RULE_REGEXP = /^(p|s|t|i)(c?)$/
         
     | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
       23 
     | 
    
         
            -
                    IGNORED_TAGS = %w[reset text #comment]
         
     | 
| 
       24 
     | 
    
         
            -
             
     | 
| 
       25 
     | 
    
         
            -
                    LAST_BYTE_MASK = 0xFF
         
     | 
| 
       26 
     | 
    
         
            -
             
     | 
| 
       27 
     | 
    
         
            -
                    LOCALES_MAP = {
         
     | 
| 
       28 
     | 
    
         
            -
                        :'zh-Hant' => :'zh_Hant',
         
     | 
| 
       29 
     | 
    
         
            -
                        :id => :root,
         
     | 
| 
       30 
     | 
    
         
            -
                        :it => :root,
         
     | 
| 
       31 
     | 
    
         
            -
                        :ms => :root,
         
     | 
| 
       32 
     | 
    
         
            -
                        :nl => :root,
         
     | 
| 
       33 
     | 
    
         
            -
                        :pt => :root
         
     | 
| 
       34 
     | 
    
         
            -
                    }
         
     | 
| 
       35 
     | 
    
         
            -
             
     | 
| 
       36 
     | 
    
         
            -
                    EMPTY_TAILORING_DATA = { 'collator_options' => {}, 'tailored_table' => '', 'suppressed_contractions' => '' }
         
     | 
| 
       37 
     | 
    
         
            -
             
     | 
| 
       38 
     | 
    
         
            -
                    class ImportError < RuntimeError; end
         
     | 
| 
       39 
     | 
    
         
            -
             
     | 
| 
       40 
     | 
    
         
            -
                    # Arguments:
         
     | 
| 
       41 
     | 
    
         
            -
                    #
         
     | 
| 
       42 
     | 
    
         
            -
                    #   input_path  - path to a directory containing CLDR tailoring data (available at
         
     | 
| 
       43 
     | 
    
         
            -
                    #                 http://unicode.org/cldr/trac/browser/tags/release-21/common/collation/
         
     | 
| 
       44 
     | 
    
         
            -
                    #                 or as a part of CLDR release at http://cldr.unicode.org/index/downloads)
         
     | 
| 
       45 
     | 
    
         
            -
                    #
         
     | 
| 
       46 
     | 
    
         
            -
                    #   output_path - output directory for imported YAML files
         
     | 
| 
       47 
     | 
    
         
            -
                    #
         
     | 
| 
       48 
     | 
    
         
            -
                    #   icu4j_path  - path to ICU4J jar file
         
     | 
| 
       49 
     | 
    
         
            -
                    #
         
     | 
| 
       50 
     | 
    
         
            -
                    def initialize(input_path, output_path, icu4j_path)
         
     | 
| 
       51 
     | 
    
         
            -
                      require icu4j_path
         
     | 
| 
       52 
     | 
    
         
            -
             
     | 
| 
       53 
     | 
    
         
            -
                      @input_path  = input_path
         
     | 
| 
       54 
     | 
    
         
            -
                      @output_path = output_path
         
     | 
| 
       55 
     | 
    
         
            -
                    end
         
     | 
| 
       56 
     | 
    
         
            -
             
     | 
| 
       57 
     | 
    
         
            -
                    def import(locale)
         
     | 
| 
       58 
     | 
    
         
            -
                      print "Importing %8s\t--\t" % locale
         
     | 
| 
       59 
     | 
    
         
            -
             
     | 
| 
       60 
     | 
    
         
            -
                      if tailoring_present?(locale)
         
     | 
| 
       61 
     | 
    
         
            -
                        YAML.dump(tailoring_data(locale), open(resource_file_path(locale), 'w'))
         
     | 
| 
       62 
     | 
    
         
            -
                        puts "Done."
         
     | 
| 
       63 
     | 
    
         
            -
                      else
         
     | 
| 
       64 
     | 
    
         
            -
                        YAML.dump(EMPTY_TAILORING_DATA, open(resource_file_path(locale), 'w'))
         
     | 
| 
       65 
     | 
    
         
            -
                        puts "Missing (generated empty tailoring resource)."
         
     | 
| 
       66 
     | 
    
         
            -
                      end
         
     | 
| 
       67 
     | 
    
         
            -
                    rescue ImportError => e
         
     | 
| 
       68 
     | 
    
         
            -
                      puts "Error: #{e.message}"
         
     | 
| 
       69 
     | 
    
         
            -
                    end
         
     | 
| 
       70 
     | 
    
         
            -
             
     | 
| 
       71 
     | 
    
         
            -
                    private
         
     | 
| 
       72 
     | 
    
         
            -
             
     | 
| 
       73 
     | 
    
         
            -
                    def tailoring_present?(locale)
         
     | 
| 
       74 
     | 
    
         
            -
                      File.file?(locale_file_path(locale))
         
     | 
| 
       75 
     | 
    
         
            -
                    end
         
     | 
| 
       76 
     | 
    
         
            -
             
     | 
| 
       77 
     | 
    
         
            -
                    def translated_locale(locale)
         
     | 
| 
       78 
     | 
    
         
            -
                      LOCALES_MAP.fetch(locale, locale)
         
     | 
| 
       79 
     | 
    
         
            -
                    end
         
     | 
| 
       80 
     | 
    
         
            -
             
     | 
| 
       81 
     | 
    
         
            -
                    def locale_file_path(locale)
         
     | 
| 
       82 
     | 
    
         
            -
                      File.join(@input_path, "#{translated_locale(locale)}.xml")
         
     | 
| 
       83 
     | 
    
         
            -
                    end
         
     | 
| 
       84 
     | 
    
         
            -
             
     | 
| 
       85 
     | 
    
         
            -
                    def resource_file_path(locale)
         
     | 
| 
       86 
     | 
    
         
            -
                      File.join(@output_path, "#{locale}.yml")
         
     | 
| 
       87 
     | 
    
         
            -
                    end
         
     | 
| 
       88 
     | 
    
         
            -
             
     | 
| 
       89 
     | 
    
         
            -
                    def tailoring_data(locale)
         
     | 
| 
       90 
     | 
    
         
            -
                      doc = Nokogiri::XML(open(locale_file_path(locale)))
         
     | 
| 
       91 
     | 
    
         
            -
                      collations = doc.at_xpath('//collations')
         
     | 
| 
       92 
     | 
    
         
            -
             
     | 
| 
       93 
     | 
    
         
            -
                      collation_alias = collations.at_xpath('alias[@path="//ldml/collations"]')
         
     | 
| 
       94 
     | 
    
         
            -
                      aliased_locale = collation_alias && collation_alias.attr('source')
         
     | 
| 
       95 
     | 
    
         
            -
             
     | 
| 
       96 
     | 
    
         
            -
                      return tailoring_data(aliased_locale) if aliased_locale
         
     | 
| 
       97 
     | 
    
         
            -
             
     | 
| 
       98 
     | 
    
         
            -
                      standard_tailoring = collations.at_xpath('collation[@type="standard"]')
         
     | 
| 
       99 
     | 
    
         
            -
             
     | 
| 
       100 
     | 
    
         
            -
                      {
         
     | 
| 
       101 
     | 
    
         
            -
                          'collator_options'        => parse_collator_options(standard_tailoring),
         
     | 
| 
       102 
     | 
    
         
            -
                          'tailored_table'          => parse_tailorings(standard_tailoring, locale),
         
     | 
| 
       103 
     | 
    
         
            -
                          'suppressed_contractions' => parse_suppressed_contractions(standard_tailoring)
         
     | 
| 
       104 
     | 
    
         
            -
                      }
         
     | 
| 
       105 
     | 
    
         
            -
                    end
         
     | 
| 
       106 
     | 
    
         
            -
             
     | 
| 
       107 
     | 
    
         
            -
                    def parse_tailorings(data, locale)
         
     | 
| 
       108 
     | 
    
         
            -
                      rules = data && data.at_xpath('rules')
         
     | 
| 
       109 
     | 
    
         
            -
             
     | 
| 
       110 
     | 
    
         
            -
                      return '' unless rules
         
     | 
| 
       111 
     | 
    
         
            -
             
     | 
| 
       112 
     | 
    
         
            -
                      collator = Java::ComIbmIcuText::Collator.get_instance(Java::JavaUtil::Locale.new(locale.to_s))
         
     | 
| 
       113 
     | 
    
         
            -
             
     | 
| 
       114 
     | 
    
         
            -
                      rules.children.map do |child|
         
     | 
| 
       115 
     | 
    
         
            -
                        validate_tailoring_rule(child)
         
     | 
| 
       116 
     | 
    
         
            -
             
     | 
| 
       117 
     | 
    
         
            -
                        if child.name =~ LEVEL_RULE_REGEXP
         
     | 
| 
       118 
     | 
    
         
            -
                          if $2.empty?
         
     | 
| 
       119 
     | 
    
         
            -
                            table_entry_for_rule(collator, child.text)
         
     | 
| 
       120 
     | 
    
         
            -
                          else
         
     | 
| 
       121 
     | 
    
         
            -
                            child.text.chars.map { |char| table_entry_for_rule(collator, char) }
         
     | 
| 
       122 
     | 
    
         
            -
                          end
         
     | 
| 
       123 
     | 
    
         
            -
                        elsif child.name == 'x'
         
     | 
| 
       124 
     | 
    
         
            -
                          context = ''
         
     | 
| 
       125 
     | 
    
         
            -
                          child.children.each_with_object([]) do |c, memo|
         
     | 
| 
       126 
     | 
    
         
            -
                            if SIMPLE_RULES.include?(c.name)
         
     | 
| 
       127 
     | 
    
         
            -
                              memo << table_entry_for_rule(collator, context + c.text)
         
     | 
| 
       128 
     | 
    
         
            -
                            elsif c.name == 'context'
         
     | 
| 
       129 
     | 
    
         
            -
                              context = c.text
         
     | 
| 
       130 
     | 
    
         
            -
                            elsif c.name != 'extend'
         
     | 
| 
       131 
     | 
    
         
            -
                              raise ImportError, "Rule '#{c.name}' inside <x></x> is not supported."
         
     | 
| 
       132 
     | 
    
         
            -
                            end
         
     | 
| 
       133 
     | 
    
         
            -
                          end
         
     | 
| 
       134 
     | 
    
         
            -
                        else
         
     | 
| 
       135 
     | 
    
         
            -
                          raise ImportError, "Tag '#{child.name}' is not supported." unless IGNORED_TAGS.include?(child.name)
         
     | 
| 
       136 
     | 
    
         
            -
                        end
         
     | 
| 
       137 
     | 
    
         
            -
                      end.flatten.compact.join("\n")
         
     | 
| 
       138 
     | 
    
         
            -
                    end
         
     | 
| 
       139 
     | 
    
         
            -
             
     | 
| 
       140 
     | 
    
         
            -
                    def table_entry_for_rule(collator, tailored_value)
         
     | 
| 
       141 
     | 
    
         
            -
                      code_points = get_code_points(tailored_value)
         
     | 
| 
       142 
     | 
    
         
            -
             
     | 
| 
       143 
     | 
    
         
            -
                      collation_elements = get_collation_elements(collator, tailored_value).map do |ce|
         
     | 
| 
       144 
     | 
    
         
            -
                        ce.map { |l| l.to_s(16).upcase }.join(', ')
         
     | 
| 
       145 
     | 
    
         
            -
                      end
         
     | 
| 
       146 
     | 
    
         
            -
             
     | 
| 
       147 
     | 
    
         
            -
                      "#{code_points.join(' ')}; [#{collation_elements.join('][')}]"
         
     | 
| 
       148 
     | 
    
         
            -
                    end
         
     | 
| 
       149 
     | 
    
         
            -
             
     | 
| 
       150 
     | 
    
         
            -
                    def parse_suppressed_contractions(data)
         
     | 
| 
       151 
     | 
    
         
            -
                      node = data && data.at_xpath('suppress_contractions')
         
     | 
| 
       152 
     | 
    
         
            -
                      node ? Java::ComIbmIcuText::UnicodeSet.to_array(Java::ComIbmIcuText::UnicodeSet.new(node.text)).to_a.join : ''
         
     | 
| 
       153 
     | 
    
         
            -
                    end
         
     | 
| 
       154 
     | 
    
         
            -
             
     | 
| 
       155 
     | 
    
         
            -
                    def parse_collator_options(data)
         
     | 
| 
       156 
     | 
    
         
            -
                      options = {}
         
     | 
| 
       157 
     | 
    
         
            -
             
     | 
| 
       158 
     | 
    
         
            -
                      if data
         
     | 
| 
       159 
     | 
    
         
            -
                        case_first_setting = data.at_xpath('settings[@caseFirst]')
         
     | 
| 
       160 
     | 
    
         
            -
                        options['case_first'] = case_first_setting.attr('caseFirst').to_sym if case_first_setting
         
     | 
| 
       161 
     | 
    
         
            -
                      end
         
     | 
| 
       162 
     | 
    
         
            -
             
     | 
| 
       163 
     | 
    
         
            -
                      options
         
     | 
| 
       164 
     | 
    
         
            -
                    end
         
     | 
| 
       165 
     | 
    
         
            -
             
     | 
| 
       166 
     | 
    
         
            -
                    def validate_tailoring_rule(rule)
         
     | 
| 
       167 
     | 
    
         
            -
                      return if IGNORED_TAGS.include?(rule.name)
         
     | 
| 
       168 
     | 
    
         
            -
             
     | 
| 
       169 
     | 
    
         
            -
                      raise ImportError, "Rule '#{rule.name}' is not supported." unless SUPPORTED_RULES.include?(rule.name)
         
     | 
| 
       170 
     | 
    
         
            -
                    end
         
     | 
| 
       171 
     | 
    
         
            -
             
     | 
| 
       172 
     | 
    
         
            -
                    def get_collation_elements(collator, string)
         
     | 
| 
       173 
     | 
    
         
            -
                      iter = collator.get_collation_element_iterator(string)
         
     | 
| 
       174 
     | 
    
         
            -
             
     | 
| 
       175 
     | 
    
         
            -
                      collation_elements = []
         
     | 
| 
       176 
     | 
    
         
            -
                      ce = iter.next
         
     | 
| 
       177 
     | 
    
         
            -
             
     | 
| 
       178 
     | 
    
         
            -
                      while ce != Java::ComIbmIcuText::CollationElementIterator::NULLORDER
         
     | 
| 
       179 
     | 
    
         
            -
                        p1 = (ce >> 24) & LAST_BYTE_MASK
         
     | 
| 
       180 
     | 
    
         
            -
                        p2 = (ce >> 16) & LAST_BYTE_MASK
         
     | 
| 
       181 
     | 
    
         
            -
             
     | 
| 
       182 
     | 
    
         
            -
                        primary   = p2.zero? ? p1 : (p1 << 8) + p2
         
     | 
| 
       183 
     | 
    
         
            -
                        secondary = (ce >> 8) & LAST_BYTE_MASK
         
     | 
| 
       184 
     | 
    
         
            -
                        tertiarly = ce & LAST_BYTE_MASK
         
     | 
| 
       185 
     | 
    
         
            -
             
     | 
| 
       186 
     | 
    
         
            -
                        collation_elements << [primary, secondary, tertiarly]
         
     | 
| 
       187 
     | 
    
         
            -
             
     | 
| 
       188 
     | 
    
         
            -
                        ce = iter.next
         
     | 
| 
       189 
     | 
    
         
            -
                      end
         
     | 
| 
       190 
     | 
    
         
            -
             
     | 
| 
       191 
     | 
    
         
            -
                      collation_elements
         
     | 
| 
       192 
     | 
    
         
            -
                    end
         
     | 
| 
       193 
     | 
    
         
            -
             
     | 
| 
       194 
     | 
    
         
            -
                    def get_code_points(string)
         
     | 
| 
       195 
     | 
    
         
            -
                      TwitterCldr::Normalization::NFD.normalize_code_points(TwitterCldr::Utils::CodePoints.from_string(string))
         
     | 
| 
       196 
     | 
    
         
            -
                    end
         
     | 
| 
       197 
     | 
    
         
            -
             
     | 
| 
       198 
     | 
    
         
            -
                  end
         
     | 
| 
       199 
     | 
    
         
            -
             
     | 
| 
       200 
     | 
    
         
            -
                end
         
     | 
| 
       201 
     | 
    
         
            -
              end
         
     | 
| 
       202 
     | 
    
         
            -
            end
         
     |