twitter_cldr 2.4.3 → 3.0.0.beta1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e7f44e818f4e5166f45c42579d91ecc27f28a3d5
4
- data.tar.gz: ae6815b54c99d68d8b1cf1fec574d5f0ae09097c
3
+ metadata.gz: 937a3b56e72068594f42c3c09d38da78c33e2a2c
4
+ data.tar.gz: d2f86005a0d9c50ba0ac90f4553e779544bd4ddf
5
5
  SHA512:
6
- metadata.gz: 8573dcbf880ff7da628f00a0cfbb03e60f358cb9ee1aec72edf2ec5a0e2bf30a259355a7827fb48ca6591faecb114e0e0b4363a2928e879b22cf26ac3a9028f7
7
- data.tar.gz: c2ccdcac7d53d42ff5901f832578e42ac7ba31bd6ff47ba26ac8939d68938532427cd442ca799fdce72aaaa5fff42602b4d78d1371a7570ca7990b19045c4c38
6
+ metadata.gz: eab5fae457ea2a42ac4a9943763eadda90c9505874f454a6fcf35b94af94f11aa09197d087965e3b06f1e1dffe7d389a59fb3c09c9792369d004ac35590d132d
7
+ data.tar.gz: 803107aec40257f79a901a02e1252e58b80f539590053641b579442d142d82a5ab78b4a0f6a0e298d4c2cf3512c29ac2dced848410b5c4ae0d6ef4a33d4ec0eb
data/History.txt CHANGED
@@ -1,6 +1,7 @@
1
- == 2.4.3
1
+ == 3.0.0
2
2
 
3
- * Fixing abbreviated timespan formats for en-GB.
3
+ * Adding maximum_level option to SortKeyBuilder to limit the size of collation sort keys (@jrochkind).
4
+ * Significant performance enhancements for normalization, estimated ~70% speed improvement.
4
5
 
5
6
  == 2.4.2
6
7
 
data/README.md CHANGED
@@ -86,7 +86,7 @@ In addition to formatting regular decimals, TwitterCLDR supports short and long
86
86
 
87
87
  ### Dates and Times
88
88
 
89
- `Date`, `Time`, and `DateTime` objects are supported:
89
+ `Time` and `DateTime` objects are supported. `Date` objects are supported transitively (via conversion with `to_date`):
90
90
 
91
91
  ```ruby
92
92
  DateTime.now.localize(:es).to_full_s # "lunes, 12 de diciembre de 2011 21:44:57 UTC -08:00"
@@ -94,15 +94,15 @@ DateTime.now.localize(:es).to_long_s # "12 de diciembre de 2011 21
94
94
  DateTime.now.localize(:es).to_medium_s # "12/12/2011 21:44:57"
95
95
  DateTime.now.localize(:es).to_short_s # "12/12/11 21:44"
96
96
 
97
- Date.today.localize(:es).to_full_s # "lunes 12 de diciembre de 2011"
98
- Date.today.localize(:es).to_long_s # "12 de diciembre de 2011"
99
- Date.today.localize(:es).to_medium_s # "12/12/2011"
100
- Date.today.localize(:es).to_short_s # "12/12/11"
101
-
102
97
  Time.now.localize(:es).to_full_s # "21:44:57 UTC -0800"
103
98
  Time.now.localize(:es).to_long_s # "21:44:57 UTC"
104
99
  Time.now.localize(:es).to_medium_s # "21:44:57"
105
100
  Time.now.localize(:es).to_short_s # "21:44"
101
+
102
+ DateTime.now.localize(:es).to_date.to_full_s # "lunes 12 de diciembre de 2011"
103
+ DateTime.now.localize(:es).to_date.to_long_s # "12 de diciembre de 2011"
104
+ DateTime.now.localize(:es).to_date.to_medium_s # "12/12/2011"
105
+ DateTime.now.localize(:es).to_date.to_short_s # "12/12/11"
106
106
  ```
107
107
 
108
108
  The default CLDR data set only includes 4 date formats, full, long, medium, and short. See below for a list of additional formats.
data/Rakefile CHANGED
@@ -133,4 +133,12 @@ namespace :update do
133
133
  task :canonical_compositions do
134
134
  TwitterCldr::Resources::CanonicalCompositionsUpdater.new('./resources/unicode_data').update
135
135
  end
136
+
137
+ desc 'Import normalization quick check data'
138
+ task :normalization_quick_check do
139
+ TwitterCldr::Resources::NormalizationQuickCheckImporter.new(
140
+ './vendor',
141
+ './resources/unicode_data'
142
+ ).import
143
+ end
136
144
  end
@@ -32,8 +32,10 @@ module TwitterCldr
32
32
  string_a == string_b ? 0 : get_sort_key(string_a) <=> get_sort_key(string_b)
33
33
  end
34
34
 
35
- def get_sort_key(string_or_code_points)
36
- TwitterCldr::Collation::SortKeyBuilder.build(get_collation_elements(string_or_code_points), @options[:case_first])
35
+ # Second arg options, supports an option :maximum_level, to
36
+ # pass on to SortKeyBuilder :maximum_level.
37
+ def get_sort_key(string_or_code_points, method_options = {})
38
+ TwitterCldr::Collation::SortKeyBuilder.build(get_collation_elements(string_or_code_points), :case_first => @options[:case_first], :maximum_level => method_options[:maximum_level])
37
39
  end
38
40
 
39
41
  def get_collation_elements(string_or_code_points)
@@ -17,7 +17,8 @@ module TwitterCldr
17
17
 
18
18
  LEVEL_SEPARATOR = 1 # separate levels in a sort key '01' bytes
19
19
 
20
- VALID_CASE_FIRST_OPTIONS = [nil, :lower, :upper]
20
+ VALID_CASE_FIRST_OPTIONS = [nil, :lower, :upper]
21
+ VALID_MAXIMUM_LEVEL_OPTIONS = [nil, 1, 2, 3]
21
22
 
22
23
  attr_reader :collation_elements, :case_first
23
24
 
@@ -26,25 +27,36 @@ module TwitterCldr
26
27
  # Arguments:
27
28
  #
28
29
  # collation_elements - an array of collation elements, represented as arrays of integer weights.
29
- # case_first - case-first sorting order setting.
30
+ # options - hash of options:
31
+ # case_first - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
32
+ #   maximum_level - only append weights up to the maximum level specified (1, 2, or 3), can be useful for searching/matching applications
30
33
  #
31
34
  # An instance of the class is created only to prevent passing of @collation_elements and @bytes_array from one
32
35
  # method into another while forming the sort key.
33
36
  #
34
- def self.build(collation_elements, case_first = nil)
35
- new(collation_elements, case_first).bytes_array
37
+ def self.build(collation_elements, options = nil)
38
+ new(collation_elements, options).bytes_array
36
39
  end
37
40
 
38
41
  # Arguments:
39
42
  #
40
43
  # collation_elements - an array of collation elements, represented as arrays of integer weights.
41
- # case_first - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
44
+ # options - hash of options:
45
+ # case_first - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
46
+ #   maximum_level - only append weights up to the maximum level specified (1, 2, or 3), can be useful for searching/matching applications
42
47
  #
43
- def initialize(collation_elements, case_first = nil)
48
+ def initialize(collation_elements, options = {})
49
+ raise ArgumentError, "second argument should be an options hash, not `#{options}`. Do you mean `:case_first => #{options}`?" unless options.kind_of? Hash
50
+
51
+ case_first = options[:case_first]
44
52
  raise ArgumentError, "invalid case-first options '#{case_first.inspect}'" unless VALID_CASE_FIRST_OPTIONS.include?(case_first)
45
53
 
54
+ maximum_level = options[:maximum_level]
55
+ raise ArgumentError, "invalid maximum_level option '#{maximum_level.inspect}'" unless VALID_MAXIMUM_LEVEL_OPTIONS.include?(maximum_level)
56
+
46
57
  @collation_elements = collation_elements
47
58
  @case_first = case_first
59
+ @maximum_level = maximum_level
48
60
 
49
61
  init_tertiary_constants
50
62
  end
@@ -59,8 +71,8 @@ module TwitterCldr
59
71
  @bytes_array = []
60
72
 
61
73
  append_primary_bytes
62
- append_secondary_bytes
63
- append_tertiary_bytes
74
+ append_secondary_bytes unless @maximum_level && (@maximum_level < 2)
75
+ append_tertiary_bytes unless @maximum_level && (@maximum_level < 3)
64
76
 
65
77
  @bytes_array
66
78
  end
@@ -5,12 +5,13 @@
5
5
 
6
6
  module TwitterCldr
7
7
  module Normalization
8
- autoload :Base, 'twitter_cldr/normalization/base'
9
- autoload :Hangul, 'twitter_cldr/normalization/hangul'
10
- autoload :NFC, 'twitter_cldr/normalization/nfc'
11
- autoload :NFD, 'twitter_cldr/normalization/nfd'
12
- autoload :NFKC, 'twitter_cldr/normalization/nfkc'
13
- autoload :NFKD, 'twitter_cldr/normalization/nfkd'
8
+ autoload :Base, 'twitter_cldr/normalization/base'
9
+ autoload :Hangul, 'twitter_cldr/normalization/hangul'
10
+ autoload :QuickCheck, 'twitter_cldr/normalization/quick_check'
11
+ autoload :NFC, 'twitter_cldr/normalization/nfc'
12
+ autoload :NFD, 'twitter_cldr/normalization/nfd'
13
+ autoload :NFKC, 'twitter_cldr/normalization/nfkc'
14
+ autoload :NFKD, 'twitter_cldr/normalization/nfkd'
14
15
 
15
16
  VALID_NORMALIZERS = [:NFD, :NFKD, :NFC, :NFKC]
16
17
  DEFAULT_NORMALIZER = :NFD
@@ -3,6 +3,8 @@
3
3
  # Copyright 2012 Twitter, Inc
4
4
  # http://www.apache.org/licenses/LICENSE-2.0
5
5
 
6
+ require 'hamster'
7
+
6
8
  module TwitterCldr
7
9
  module Normalization
8
10
  class Base
@@ -16,11 +18,18 @@ module TwitterCldr
16
18
  end
17
19
 
18
20
  def combining_class_for(code_point)
19
- TwitterCldr::Shared::CodePoint.find(code_point).combining_class.to_i
21
+ combining_class_cache[code_point] ||=
22
+ TwitterCldr::Shared::CodePoint.find(code_point).combining_class.to_i
20
23
  rescue NoMethodError
21
24
  0
22
25
  end
23
26
 
27
+ protected
28
+
29
+ def combining_class_cache
30
+ @combining_class_cache ||= {}
31
+ end
32
+
24
33
  end
25
34
 
26
35
  end
@@ -9,6 +9,23 @@ module TwitterCldr
9
9
 
10
10
  class << self
11
11
 
12
+ SBASE = 0xAC00
13
+ LBASE = 0x1100
14
+ VBASE = 0x1161
15
+ TBASE = 0x11A7
16
+
17
+ LCOUNT = 19
18
+ VCOUNT = 21
19
+ TCOUNT = 28
20
+
21
+ NCOUNT = VCOUNT * TCOUNT # 588
22
+ SCOUNT = LCOUNT * NCOUNT # 11172
23
+
24
+ LLIMIT = LBASE + LCOUNT # 0x1113 = 4371
25
+ VLIMIT = VBASE + VCOUNT # 0x1176 = 4470
26
+ TLIMIT = TBASE + TCOUNT # 0x11C3 = 4547
27
+ SLIMIT = SBASE + SCOUNT # 0xD7A4 = 55204
28
+
12
29
  # Special composition for Hangul syllables. Documented in Section 3.12 at
13
30
  # http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
14
31
  #
@@ -24,45 +41,39 @@ module TwitterCldr
24
41
  # Also see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm#Hangul_Implicit_CEs
25
42
  #
26
43
  def decompose(code_point)
27
- l = code_point - SBASE
44
+ decomposition_cache[code_point] ||= begin
45
+ l = code_point - SBASE
28
46
 
29
- t = l % TCOUNT
30
- l /= TCOUNT
31
- v = l % VCOUNT
32
- l /= VCOUNT
47
+ t = l % TCOUNT
48
+ l /= TCOUNT
49
+ v = l % VCOUNT
50
+ l /= VCOUNT
33
51
 
34
- result = []
52
+ result = []
35
53
 
36
- result << LBASE + l
37
- result << VBASE + v
38
- result << TBASE + t if t > 0
54
+ result << LBASE + l
55
+ result << VBASE + v
56
+ result << TBASE + t if t > 0
39
57
 
40
- result
58
+ result
59
+ end
41
60
  end
42
61
 
43
62
  def hangul_syllable?(code_point)
44
63
  (SBASE...SLIMIT).include?(code_point)
45
64
  end
46
65
 
47
- SBASE = 0xAC00
48
- LBASE = 0x1100
49
- VBASE = 0x1161
50
- TBASE = 0x11A7
66
+ private
51
67
 
52
- LCOUNT = 19
53
- VCOUNT = 21
54
- TCOUNT = 28
55
-
56
- NCOUNT = VCOUNT * TCOUNT # 588
57
- SCOUNT = LCOUNT * NCOUNT # 11172
68
+ def syllable_cache
69
+ @syllable_cache ||= {}
70
+ end
58
71
 
59
- LLIMIT = LBASE + LCOUNT # 0x1113 = 4371
60
- VLIMIT = VBASE + VCOUNT # 0x1176 = 4470
61
- TLIMIT = TBASE + TCOUNT # 0x11C3 = 4547
62
- SLIMIT = SBASE + SCOUNT # 0xD7A4 = 55204
72
+ def decomposition_cache
73
+ @decomposition_cache ||= {}
74
+ end
63
75
 
64
76
  end
65
-
66
77
  end
67
78
  end
68
79
  end
@@ -13,6 +13,12 @@ module TwitterCldr
13
13
 
14
14
  class << self
15
15
 
16
+ VALID_HANGUL_SEQUENCES = [
17
+ [0, :lparts],
18
+ [1, :vparts],
19
+ [2, :tparts]
20
+ ]
21
+
16
22
  def normalize_code_points(code_points)
17
23
  compose(TwitterCldr::Normalization::NFKD.normalize_code_points(code_points))
18
24
  end
@@ -49,7 +55,7 @@ module TwitterCldr
49
55
  end
50
56
 
51
57
  def valid_hangul_sequence?(buffer_size, hangul_type)
52
- [[0, :lparts], [1, :vparts], [2, :tparts]].include?([buffer_size, hangul_type])
58
+ VALID_HANGUL_SEQUENCES.include?([buffer_size, hangul_type])
53
59
  end
54
60
 
55
61
  def compose_hangul(code_points)
@@ -16,7 +16,6 @@ module TwitterCldr
16
16
  #
17
17
  class NFKD < Base
18
18
 
19
-
20
19
  class << self
21
20
 
22
21
  def normalize_code_points(code_points)
@@ -26,14 +25,19 @@ module TwitterCldr
26
25
  protected
27
26
 
28
27
  def decompose(code_points)
29
- code_points.map { |code_point| decompose_recursively(code_point) }.flatten
28
+ code_points.inject(Hamster.list) do |ret, code_point|
29
+ decompose_recursively(code_point).each do |decomp_cp|
30
+ ret = ret.cons(decomp_cp)
31
+ end
32
+ ret
33
+ end.reverse.to_a
30
34
  end
31
35
 
32
36
  # Recursively decomposes a given code point with the values in its Decomposition Mapping property.
33
37
  #
34
38
  def decompose_recursively(code_point)
35
39
  unicode_data = TwitterCldr::Shared::CodePoint.find(code_point)
36
- return code_point unless unicode_data
40
+ return [code_point] unless unicode_data
37
41
 
38
42
  if unicode_data.hangul_type == :compositions
39
43
  decompose_hangul(code_point)
@@ -48,7 +52,7 @@ module TwitterCldr
48
52
  if decompose?(unicode_data)
49
53
  unicode_data.decomposition.map { |code_point| decompose_recursively(code_point) }.flatten
50
54
  else
51
- unicode_data.code_point
55
+ [unicode_data.code_point]
52
56
  end
53
57
  end
54
58
 
@@ -82,7 +86,6 @@ module TwitterCldr
82
86
  end
83
87
 
84
88
  result.concat(stable_sort(accum)) unless accum.empty?
85
-
86
89
  result.map { |cp_with_cc| cp_with_cc[0] }
87
90
  end
88
91
 
@@ -0,0 +1,41 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ module TwitterCldr
7
+ module Normalization
8
+
9
+ # This class isn't used anywhere because it was found that it negatively
10
+ # affects normalization performance.
11
+ module QuickCheck
12
+
13
+ class << self
14
+
15
+ def requires_normalization?(code_point, algorithm)
16
+ key = TwitterCldr::Utils.compute_cache_key(code_point, algorithm)
17
+ requires_cache[key] = if requires_cache[key].nil?
18
+ resource_for(algorithm).any? do |range|
19
+ range.include?(code_point)
20
+ end
21
+ else
22
+ requires_cache[key]
23
+ end
24
+ end
25
+
26
+ protected
27
+
28
+ def requires_cache
29
+ @requires_cache ||= {}
30
+ end
31
+
32
+ def resource_for(algorithm)
33
+ @resources ||= {}
34
+ @resources[algorithm] ||= TwitterCldr.get_resource("unicode_data", "#{algorithm.to_s.downcase}_quick_check")
35
+ end
36
+
37
+ end
38
+
39
+ end
40
+ end
41
+ end
@@ -18,5 +18,6 @@ module TwitterCldr
18
18
  autoload :TailoringImporter, 'twitter_cldr/resources/tailoring_importer'
19
19
  autoload :UnicodeDataImporter, 'twitter_cldr/resources/unicode_data_importer'
20
20
  autoload :BidiTestImporter, 'twitter_cldr/resources/bidi_test_importer'
21
+ autoload :NormalizationQuickCheckImporter, 'twitter_cldr/resources/normalization_quick_check_importer'
21
22
  end
22
23
  end
@@ -0,0 +1,86 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'twitter_cldr/resources/download'
7
+
8
+ module TwitterCldr
9
+ module Resources
10
+
11
+ class NormalizationQuickCheckImporter
12
+
13
+ PROPS_FILE_URL = "ftp://ftp.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt"
14
+
15
+ # Arguments:
16
+ #
17
+ # input_path - path to a directory containing DerivedNormalizationProps.txt
18
+ # output_path - output directory for imported YAML files
19
+ #
20
+ def initialize(input_path, output_path)
21
+ @input_path = input_path
22
+ @output_path = output_path
23
+ end
24
+
25
+ def import
26
+ parse_props_file.each_pair do |algorithm, code_point_list|
27
+ File.open(File.join(@output_path, "#{algorithm.downcase}_quick_check.yml"), "w+") do |f|
28
+ f.write(YAML.dump(rangify(partition_prop_list(code_point_list))))
29
+ end
30
+ end
31
+ end
32
+
33
+ private
34
+
35
+ def rangify(lists)
36
+ lists.map { |list| (list.first..list.last) }
37
+ end
38
+
39
+ def partition_prop_list(list)
40
+ last_item = 0
41
+ list.inject([]) do |ret, item|
42
+ (item - last_item == 1) ? ret[-1] << item : ret << [item]
43
+ last_item = item
44
+ ret
45
+ end
46
+ end
47
+
48
+ def parse_props_file
49
+ check_table = {}
50
+ cur_type = nil
51
+
52
+ File.open(props_file) do |input|
53
+ input.each_line do |line|
54
+ cur_type = nil if line =~ /=Maybe/
55
+ type = line.scan(/#\s*Property:\s*(NF[KDC]+)_Quick_Check/).flatten
56
+
57
+ if type.size > 0
58
+ cur_type = type.first
59
+ check_table[cur_type] = []
60
+ end
61
+
62
+ if check_table.size > 0 && line[0...1] != "#" && !line.strip.empty? && cur_type
63
+ start, finish = line.scan(/(\h+(\.\.\h+)?)/).first.first.split("..").map { |num| num.to_i(16) }
64
+
65
+ if finish
66
+ check_table[cur_type] += (start..finish).to_a
67
+ else
68
+ check_table[cur_type] << start
69
+ end
70
+ end
71
+
72
+ break if line =~ /={5,}/ && check_table.size >= 4 && check_table.all? { |key, val| val.size > 0 }
73
+ end
74
+ end
75
+
76
+ check_table
77
+ end
78
+
79
+ def props_file
80
+ TwitterCldr::Resources.download_if_necessary(File.join(@input_path, 'DerivedNormalizationProps.txt'), PROPS_FILE_URL)
81
+ end
82
+
83
+ end
84
+
85
+ end
86
+ end