twitter_cldr 2.4.3 → 3.0.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e7f44e818f4e5166f45c42579d91ecc27f28a3d5
4
- data.tar.gz: ae6815b54c99d68d8b1cf1fec574d5f0ae09097c
3
+ metadata.gz: 937a3b56e72068594f42c3c09d38da78c33e2a2c
4
+ data.tar.gz: d2f86005a0d9c50ba0ac90f4553e779544bd4ddf
5
5
  SHA512:
6
- metadata.gz: 8573dcbf880ff7da628f00a0cfbb03e60f358cb9ee1aec72edf2ec5a0e2bf30a259355a7827fb48ca6591faecb114e0e0b4363a2928e879b22cf26ac3a9028f7
7
- data.tar.gz: c2ccdcac7d53d42ff5901f832578e42ac7ba31bd6ff47ba26ac8939d68938532427cd442ca799fdce72aaaa5fff42602b4d78d1371a7570ca7990b19045c4c38
6
+ metadata.gz: eab5fae457ea2a42ac4a9943763eadda90c9505874f454a6fcf35b94af94f11aa09197d087965e3b06f1e1dffe7d389a59fb3c09c9792369d004ac35590d132d
7
+ data.tar.gz: 803107aec40257f79a901a02e1252e58b80f539590053641b579442d142d82a5ab78b4a0f6a0e298d4c2cf3512c29ac2dced848410b5c4ae0d6ef4a33d4ec0eb
data/History.txt CHANGED
@@ -1,6 +1,7 @@
1
- == 2.4.3
1
+ == 3.0.0
2
2
 
3
- * Fixing abbreviated timespan formats for en-GB.
3
+ * Adding maximum_level option to SortKeyBuilder to limit the size of collation sort keys (@jrochkind).
4
+ * Significant performance enhancements for normalization, estimated ~70% speed improvement.
4
5
 
5
6
  == 2.4.2
6
7
 
data/README.md CHANGED
@@ -86,7 +86,7 @@ In addition to formatting regular decimals, TwitterCLDR supports short and long
86
86
 
87
87
  ### Dates and Times
88
88
 
89
- `Date`, `Time`, and `DateTime` objects are supported:
89
+ `Time` and `DateTime` objects are supported. `Date` objects are supported indirectly via `to_date` conversion (see examples below):
90
90
 
91
91
  ```ruby
92
92
  DateTime.now.localize(:es).to_full_s # "lunes, 12 de diciembre de 2011 21:44:57 UTC -08:00"
@@ -94,15 +94,15 @@ DateTime.now.localize(:es).to_long_s # "12 de diciembre de 2011 21
94
94
  DateTime.now.localize(:es).to_medium_s # "12/12/2011 21:44:57"
95
95
  DateTime.now.localize(:es).to_short_s # "12/12/11 21:44"
96
96
 
97
- Date.today.localize(:es).to_full_s # "lunes 12 de diciembre de 2011"
98
- Date.today.localize(:es).to_long_s # "12 de diciembre de 2011"
99
- Date.today.localize(:es).to_medium_s # "12/12/2011"
100
- Date.today.localize(:es).to_short_s # "12/12/11"
101
-
102
97
  Time.now.localize(:es).to_full_s # "21:44:57 UTC -0800"
103
98
  Time.now.localize(:es).to_long_s # "21:44:57 UTC"
104
99
  Time.now.localize(:es).to_medium_s # "21:44:57"
105
100
  Time.now.localize(:es).to_short_s # "21:44"
101
+
102
+ DateTime.now.localize(:es).to_date.to_full_s # "lunes 12 de diciembre de 2011"
103
+ DateTime.now.localize(:es).to_date.to_long_s # "12 de diciembre de 2011"
104
+ DateTime.now.localize(:es).to_date.to_medium_s # "12/12/2011"
105
+ DateTime.now.localize(:es).to_date.to_short_s # "12/12/11"
106
106
  ```
107
107
 
108
108
  The default CLDR data set only includes 4 date formats, full, long, medium, and short. See below for a list of additional formats.
data/Rakefile CHANGED
@@ -133,4 +133,12 @@ namespace :update do
133
133
  task :canonical_compositions do
134
134
  TwitterCldr::Resources::CanonicalCompositionsUpdater.new('./resources/unicode_data').update
135
135
  end
136
+
137
+ desc 'Import normalization quick check data'
138
+ task :normalization_quick_check do
139
+ TwitterCldr::Resources::NormalizationQuickCheckImporter.new(
140
+ './vendor',
141
+ './resources/unicode_data'
142
+ ).import
143
+ end
136
144
  end
@@ -32,8 +32,10 @@ module TwitterCldr
32
32
  string_a == string_b ? 0 : get_sort_key(string_a) <=> get_sort_key(string_b)
33
33
  end
34
34
 
35
- def get_sort_key(string_or_code_points)
36
- TwitterCldr::Collation::SortKeyBuilder.build(get_collation_elements(string_or_code_points), @options[:case_first])
35
+ # Second arg options, supports an option :maximum_level, to
36
+ # pass on to SortKeyBuilder :maximum_level.
37
+ def get_sort_key(string_or_code_points, method_options = {})
38
+ TwitterCldr::Collation::SortKeyBuilder.build(get_collation_elements(string_or_code_points), :case_first => @options[:case_first], :maximum_level => method_options[:maximum_level])
37
39
  end
38
40
 
39
41
  def get_collation_elements(string_or_code_points)
@@ -17,7 +17,8 @@ module TwitterCldr
17
17
 
18
18
  LEVEL_SEPARATOR = 1 # separate levels in a sort key '01' bytes
19
19
 
20
- VALID_CASE_FIRST_OPTIONS = [nil, :lower, :upper]
20
+ VALID_CASE_FIRST_OPTIONS = [nil, :lower, :upper]
21
+ VALID_MAXIMUM_LEVEL_OPTIONS = [nil, 1, 2, 3]
21
22
 
22
23
  attr_reader :collation_elements, :case_first
23
24
 
@@ -26,25 +27,36 @@ module TwitterCldr
26
27
  # Arguments:
27
28
  #
28
29
  # collation_elements - an array of collation elements, represented as arrays of integer weights.
29
- # case_first - case-first sorting order setting.
30
+ # options - hash of options:
31
+ # case_first - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
32
+ #  maximum_level - only append weights up to the maximum level specified (1, 2 or 3); can be useful for searching/matching applications
30
33
  #
31
34
  # An instance of the class is created only to prevent passing of @collation_elements and @bytes_array from one
32
35
  # method into another while forming the sort key.
33
36
  #
34
- def self.build(collation_elements, case_first = nil)
35
- new(collation_elements, case_first).bytes_array
37
+ def self.build(collation_elements, options = nil)
38
+ new(collation_elements, options).bytes_array
36
39
  end
37
40
 
38
41
  # Arguments:
39
42
  #
40
43
  # collation_elements - an array of collation elements, represented as arrays of integer weights.
41
- # case_first - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
44
+ # options - hash of options:
45
+ # case_first - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
46
+ #     maximum_level - only append weights up to the maximum level specified (1, 2 or 3); can be useful for searching/matching applications
42
47
  #
43
- def initialize(collation_elements, case_first = nil)
48
+ def initialize(collation_elements, options = {})
49
+ raise ArgumentError, "second argument should be an options hash, not `#{options}`. Do you mean `:case_first => #{options}`?" unless options.kind_of? Hash
50
+
51
+ case_first = options[:case_first]
44
52
  raise ArgumentError, "invalid case-first options '#{case_first.inspect}'" unless VALID_CASE_FIRST_OPTIONS.include?(case_first)
45
53
 
54
+ maximum_level = options[:maximum_level]
55
+ raise ArgumentError, "invalid maximum_level option 'options[:maximum_level]'" unless VALID_MAXIMUM_LEVEL_OPTIONS.include?(maximum_level)
56
+
46
57
  @collation_elements = collation_elements
47
58
  @case_first = case_first
59
+ @maximum_level = maximum_level
48
60
 
49
61
  init_tertiary_constants
50
62
  end
@@ -59,8 +71,8 @@ module TwitterCldr
59
71
  @bytes_array = []
60
72
 
61
73
  append_primary_bytes
62
- append_secondary_bytes
63
- append_tertiary_bytes
74
+ append_secondary_bytes unless @maximum_level && (@maximum_level < 2)
75
+ append_tertiary_bytes unless @maximum_level && (@maximum_level < 3)
64
76
 
65
77
  @bytes_array
66
78
  end
@@ -5,12 +5,13 @@
5
5
 
6
6
  module TwitterCldr
7
7
  module Normalization
8
- autoload :Base, 'twitter_cldr/normalization/base'
9
- autoload :Hangul, 'twitter_cldr/normalization/hangul'
10
- autoload :NFC, 'twitter_cldr/normalization/nfc'
11
- autoload :NFD, 'twitter_cldr/normalization/nfd'
12
- autoload :NFKC, 'twitter_cldr/normalization/nfkc'
13
- autoload :NFKD, 'twitter_cldr/normalization/nfkd'
8
+ autoload :Base, 'twitter_cldr/normalization/base'
9
+ autoload :Hangul, 'twitter_cldr/normalization/hangul'
10
+ autoload :QuickCheck, 'twitter_cldr/normalization/quick_check'
11
+ autoload :NFC, 'twitter_cldr/normalization/nfc'
12
+ autoload :NFD, 'twitter_cldr/normalization/nfd'
13
+ autoload :NFKC, 'twitter_cldr/normalization/nfkc'
14
+ autoload :NFKD, 'twitter_cldr/normalization/nfkd'
14
15
 
15
16
  VALID_NORMALIZERS = [:NFD, :NFKD, :NFC, :NFKC]
16
17
  DEFAULT_NORMALIZER = :NFD
@@ -3,6 +3,8 @@
3
3
  # Copyright 2012 Twitter, Inc
4
4
  # http://www.apache.org/licenses/LICENSE-2.0
5
5
 
6
+ require 'hamster'
7
+
6
8
  module TwitterCldr
7
9
  module Normalization
8
10
  class Base
@@ -16,11 +18,18 @@ module TwitterCldr
16
18
  end
17
19
 
18
20
  def combining_class_for(code_point)
19
- TwitterCldr::Shared::CodePoint.find(code_point).combining_class.to_i
21
+ combining_class_cache[code_point] ||=
22
+ TwitterCldr::Shared::CodePoint.find(code_point).combining_class.to_i
20
23
  rescue NoMethodError
21
24
  0
22
25
  end
23
26
 
27
+ protected
28
+
29
+ def combining_class_cache
30
+ @combining_class_cache ||= {}
31
+ end
32
+
24
33
  end
25
34
 
26
35
  end
@@ -9,6 +9,23 @@ module TwitterCldr
9
9
 
10
10
  class << self
11
11
 
12
+ SBASE = 0xAC00
13
+ LBASE = 0x1100
14
+ VBASE = 0x1161
15
+ TBASE = 0x11A7
16
+
17
+ LCOUNT = 19
18
+ VCOUNT = 21
19
+ TCOUNT = 28
20
+
21
+ NCOUNT = VCOUNT * TCOUNT # 588
22
+ SCOUNT = LCOUNT * NCOUNT # 11172
23
+
24
+ LLIMIT = LBASE + LCOUNT # 0x1113 = 4371
25
+ VLIMIT = VBASE + VCOUNT # 0x1176 = 4470
26
+ TLIMIT = TBASE + TCOUNT # 0x11C3 = 4547
27
+ SLIMIT = SBASE + SCOUNT # 0xD7A4 = 55204
28
+
12
29
  # Special composition for Hangul syllables. Documented in Section 3.12 at
13
30
  # http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
14
31
  #
@@ -24,45 +41,39 @@ module TwitterCldr
24
41
  # Also see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm#Hangul_Implicit_CEs
25
42
  #
26
43
  def decompose(code_point)
27
- l = code_point - SBASE
44
+ decomposition_cache[code_point] ||= begin
45
+ l = code_point - SBASE
28
46
 
29
- t = l % TCOUNT
30
- l /= TCOUNT
31
- v = l % VCOUNT
32
- l /= VCOUNT
47
+ t = l % TCOUNT
48
+ l /= TCOUNT
49
+ v = l % VCOUNT
50
+ l /= VCOUNT
33
51
 
34
- result = []
52
+ result = []
35
53
 
36
- result << LBASE + l
37
- result << VBASE + v
38
- result << TBASE + t if t > 0
54
+ result << LBASE + l
55
+ result << VBASE + v
56
+ result << TBASE + t if t > 0
39
57
 
40
- result
58
+ result
59
+ end
41
60
  end
42
61
 
43
62
  def hangul_syllable?(code_point)
44
63
  (SBASE...SLIMIT).include?(code_point)
45
64
  end
46
65
 
47
- SBASE = 0xAC00
48
- LBASE = 0x1100
49
- VBASE = 0x1161
50
- TBASE = 0x11A7
66
+ private
51
67
 
52
- LCOUNT = 19
53
- VCOUNT = 21
54
- TCOUNT = 28
55
-
56
- NCOUNT = VCOUNT * TCOUNT # 588
57
- SCOUNT = LCOUNT * NCOUNT # 11172
68
+ def syllable_cache
69
+ @syllable_cache ||= {}
70
+ end
58
71
 
59
- LLIMIT = LBASE + LCOUNT # 0x1113 = 4371
60
- VLIMIT = VBASE + VCOUNT # 0x1176 = 4470
61
- TLIMIT = TBASE + TCOUNT # 0x11C3 = 4547
62
- SLIMIT = SBASE + SCOUNT # 0xD7A4 = 55204
72
+ def decomposition_cache
73
+ @decomposition_cache ||= {}
74
+ end
63
75
 
64
76
  end
65
-
66
77
  end
67
78
  end
68
79
  end
@@ -13,6 +13,12 @@ module TwitterCldr
13
13
 
14
14
  class << self
15
15
 
16
+ VALID_HANGUL_SEQUENCES = [
17
+ [0, :lparts],
18
+ [1, :vparts],
19
+ [2, :tparts]
20
+ ]
21
+
16
22
  def normalize_code_points(code_points)
17
23
  compose(TwitterCldr::Normalization::NFKD.normalize_code_points(code_points))
18
24
  end
@@ -49,7 +55,7 @@ module TwitterCldr
49
55
  end
50
56
 
51
57
  def valid_hangul_sequence?(buffer_size, hangul_type)
52
- [[0, :lparts], [1, :vparts], [2, :tparts]].include?([buffer_size, hangul_type])
58
+ VALID_HANGUL_SEQUENCES.include?([buffer_size, hangul_type])
53
59
  end
54
60
 
55
61
  def compose_hangul(code_points)
@@ -16,7 +16,6 @@ module TwitterCldr
16
16
  #
17
17
  class NFKD < Base
18
18
 
19
-
20
19
  class << self
21
20
 
22
21
  def normalize_code_points(code_points)
@@ -26,14 +25,19 @@ module TwitterCldr
26
25
  protected
27
26
 
28
27
  def decompose(code_points)
29
- code_points.map { |code_point| decompose_recursively(code_point) }.flatten
28
+ code_points.inject(Hamster.list) do |ret, code_point|
29
+ decompose_recursively(code_point).each do |decomp_cp|
30
+ ret = ret.cons(decomp_cp)
31
+ end
32
+ ret
33
+ end.reverse.to_a
30
34
  end
31
35
 
32
36
  # Recursively decomposes a given code point with the values in its Decomposition Mapping property.
33
37
  #
34
38
  def decompose_recursively(code_point)
35
39
  unicode_data = TwitterCldr::Shared::CodePoint.find(code_point)
36
- return code_point unless unicode_data
40
+ return [code_point] unless unicode_data
37
41
 
38
42
  if unicode_data.hangul_type == :compositions
39
43
  decompose_hangul(code_point)
@@ -48,7 +52,7 @@ module TwitterCldr
48
52
  if decompose?(unicode_data)
49
53
  unicode_data.decomposition.map { |code_point| decompose_recursively(code_point) }.flatten
50
54
  else
51
- unicode_data.code_point
55
+ [unicode_data.code_point]
52
56
  end
53
57
  end
54
58
 
@@ -82,7 +86,6 @@ module TwitterCldr
82
86
  end
83
87
 
84
88
  result.concat(stable_sort(accum)) unless accum.empty?
85
-
86
89
  result.map { |cp_with_cc| cp_with_cc[0] }
87
90
  end
88
91
 
@@ -0,0 +1,41 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ module TwitterCldr
7
+ module Normalization
8
+
9
+ # This class isn't used anywhere because it was found that it negatively
10
+ # affects normalization performance.
11
+ module QuickCheck
12
+
13
+ class << self
14
+
15
+ def requires_normalization?(code_point, algorithm)
16
+ key = TwitterCldr::Utils.compute_cache_key(code_point, algorithm)
17
+ requires_cache[key] = if requires_cache[key].nil?
18
+ resource_for(algorithm).any? do |range|
19
+ range.include?(code_point)
20
+ end
21
+ else
22
+ requires_cache[key]
23
+ end
24
+ end
25
+
26
+ protected
27
+
28
+ def requires_cache
29
+ @requires_cache ||= {}
30
+ end
31
+
32
+ def resource_for(algorithm)
33
+ @resources ||= {}
34
+ @resources[algorithm] ||= TwitterCldr.get_resource("unicode_data", "#{algorithm.to_s.downcase}_quick_check")
35
+ end
36
+
37
+ end
38
+
39
+ end
40
+ end
41
+ end
@@ -18,5 +18,6 @@ module TwitterCldr
18
18
  autoload :TailoringImporter, 'twitter_cldr/resources/tailoring_importer'
19
19
  autoload :UnicodeDataImporter, 'twitter_cldr/resources/unicode_data_importer'
20
20
  autoload :BidiTestImporter, 'twitter_cldr/resources/bidi_test_importer'
21
+ autoload :NormalizationQuickCheckImporter, 'twitter_cldr/resources/normalization_quick_check_importer'
21
22
  end
22
23
  end
@@ -0,0 +1,86 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'twitter_cldr/resources/download'
7
+
8
+ module TwitterCldr
9
+ module Resources
10
+
11
+ class NormalizationQuickCheckImporter
12
+
13
+ PROPS_FILE_URL = "ftp://ftp.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt"
14
+
15
+ # Arguments:
16
+ #
17
+ # input_path - path to a directory containing DerivedNormalizationProps.txt
18
+ # output_path - output directory for imported YAML files
19
+ #
20
+ def initialize(input_path, output_path)
21
+ @input_path = input_path
22
+ @output_path = output_path
23
+ end
24
+
25
+ def import
26
+ parse_props_file.each_pair do |algorithm, code_point_list|
27
+ File.open(File.join(@output_path, "#{algorithm.downcase}_quick_check.yml"), "w+") do |f|
28
+ f.write(YAML.dump(rangify(partition_prop_list(code_point_list))))
29
+ end
30
+ end
31
+ end
32
+
33
+ private
34
+
35
+ def rangify(lists)
36
+ lists.map { |list| (list.first..list.last) }
37
+ end
38
+
39
+ def partition_prop_list(list)
40
+ last_item = 0
41
+ list.inject([]) do |ret, item|
42
+ (item - last_item == 1) ? ret[-1] << item : ret << [item]
43
+ last_item = item
44
+ ret
45
+ end
46
+ end
47
+
48
+ def parse_props_file
49
+ check_table = {}
50
+ cur_type = nil
51
+
52
+ File.open(props_file) do |input|
53
+ input.each_line do |line|
54
+ cur_type = nil if line =~ /=Maybe/
55
+ type = line.scan(/#\s*Property:\s*(NF[KDC]+)_Quick_Check/).flatten
56
+
57
+ if type.size > 0
58
+ cur_type = type.first
59
+ check_table[cur_type] = []
60
+ end
61
+
62
+ if check_table.size > 0 && line[0...1] != "#" && !line.strip.empty? && cur_type
63
+ start, finish = line.scan(/(\h+(\.\.\h+)?)/).first.first.split("..").map { |num| num.to_i(16) }
64
+
65
+ if finish
66
+ check_table[cur_type] += (start..finish).to_a
67
+ else
68
+ check_table[cur_type] << start
69
+ end
70
+ end
71
+
72
+ break if line =~ /={5,}/ && check_table.size >= 4 && check_table.all? { |key, val| val.size > 0 }
73
+ end
74
+ end
75
+
76
+ check_table
77
+ end
78
+
79
+ def props_file
80
+ TwitterCldr::Resources.download_if_necessary(File.join(@input_path, 'DerivedNormalizationProps.txt'), PROPS_FILE_URL)
81
+ end
82
+
83
+ end
84
+
85
+ end
86
+ end