twitter_cldr 5.2.0 → 5.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (110) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +0 -4
  3. data/Rakefile +19 -8
  4. data/lib/twitter_cldr/normalization.rb +18 -5
  5. data/lib/twitter_cldr/resources.rb +3 -1
  6. data/lib/twitter_cldr/resources/import_resolver.rb +11 -3
  7. data/lib/twitter_cldr/resources/loader.rb +22 -1
  8. data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -9
  9. data/lib/twitter_cldr/resources/postal_codes_importer.rb +19 -23
  10. data/lib/twitter_cldr/resources/segment_dictionaries_importer.rb +75 -0
  11. data/lib/twitter_cldr/resources/segment_tests_importer.rb +130 -13
  12. data/lib/twitter_cldr/segmentation.rb +25 -10
  13. data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb +200 -0
  14. data/lib/twitter_cldr/segmentation/break_iterator.rb +22 -22
  15. data/lib/twitter_cldr/segmentation/burmese_break_engine.rb +83 -0
  16. data/lib/twitter_cldr/segmentation/category_table.rb +5 -1
  17. data/lib/twitter_cldr/segmentation/cj_break_engine.rb +163 -0
  18. data/lib/twitter_cldr/segmentation/cursor.rb +1 -1
  19. data/lib/twitter_cldr/segmentation/dictionary.rb +84 -0
  20. data/lib/twitter_cldr/segmentation/dictionary_break_engine.rb +34 -0
  21. data/lib/twitter_cldr/segmentation/khmer_break_engine.rb +83 -0
  22. data/lib/twitter_cldr/segmentation/korean_break_engine.rb +30 -0
  23. data/lib/twitter_cldr/segmentation/lao_break_engine.rb +85 -0
  24. data/lib/twitter_cldr/segmentation/line_iterator.rb +23 -0
  25. data/lib/twitter_cldr/segmentation/possible_word.rb +74 -0
  26. data/lib/twitter_cldr/segmentation/possible_word_list.rb +23 -0
  27. data/lib/twitter_cldr/segmentation/rule_set.rb +3 -12
  28. data/lib/twitter_cldr/segmentation/segment_iterator.rb +40 -0
  29. data/lib/twitter_cldr/segmentation/state_machine.rb +2 -8
  30. data/lib/twitter_cldr/segmentation/thai_break_engine.rb +141 -0
  31. data/lib/twitter_cldr/segmentation/unhandled_break_engine.rb +21 -0
  32. data/lib/twitter_cldr/segmentation/word_iterator.rb +170 -0
  33. data/lib/twitter_cldr/shared.rb +1 -0
  34. data/lib/twitter_cldr/shared/caser.rb +3 -3
  35. data/lib/twitter_cldr/shared/unicode_set.rb +77 -0
  36. data/lib/twitter_cldr/utils/range_set.rb +10 -1
  37. data/lib/twitter_cldr/version.rb +1 -1
  38. data/resources/collation/tailoring/km.yml +82 -0
  39. data/resources/collation/tailoring/lo.yml +4 -0
  40. data/resources/collation/tailoring/my.yml +940 -0
  41. data/resources/collation/tries/km.dump +0 -0
  42. data/resources/collation/tries/lo.dump +0 -0
  43. data/resources/collation/tries/my.dump +0 -0
  44. data/resources/locales/km/calendars.yml +373 -0
  45. data/resources/locales/km/currencies.yml +654 -0
  46. data/resources/locales/km/day_periods.yml +96 -0
  47. data/resources/locales/km/fields.yml +495 -0
  48. data/resources/locales/km/languages.yml +397 -0
  49. data/resources/locales/km/layout.yml +5 -0
  50. data/resources/locales/km/lists.yml +37 -0
  51. data/resources/locales/km/numbers.yml +402 -0
  52. data/resources/locales/km/plural_rules.yml +6 -0
  53. data/resources/locales/km/plurals.yml +12 -0
  54. data/resources/locales/km/rbnf.yml +131 -0
  55. data/resources/locales/km/territories.yml +267 -0
  56. data/resources/locales/km/timezones.yml +1471 -0
  57. data/resources/locales/km/units.yml +721 -0
  58. data/resources/locales/lo/calendars.yml +368 -0
  59. data/resources/locales/lo/currencies.yml +918 -0
  60. data/resources/locales/lo/day_periods.yml +96 -0
  61. data/resources/locales/lo/fields.yml +437 -0
  62. data/resources/locales/lo/languages.yml +529 -0
  63. data/resources/locales/lo/layout.yml +5 -0
  64. data/resources/locales/lo/lists.yml +42 -0
  65. data/resources/locales/lo/numbers.yml +476 -0
  66. data/resources/locales/lo/plural_rules.yml +7 -0
  67. data/resources/locales/lo/plurals.yml +14 -0
  68. data/resources/locales/lo/rbnf.yml +119 -0
  69. data/resources/locales/lo/territories.yml +265 -0
  70. data/resources/locales/lo/timezones.yml +1513 -0
  71. data/resources/locales/lo/units.yml +750 -0
  72. data/resources/locales/my/calendars.yml +374 -0
  73. data/resources/locales/my/currencies.yml +697 -0
  74. data/resources/locales/my/day_periods.yml +96 -0
  75. data/resources/locales/my/fields.yml +459 -0
  76. data/resources/locales/my/languages.yml +420 -0
  77. data/resources/locales/my/layout.yml +5 -0
  78. data/resources/locales/my/lists.yml +43 -0
  79. data/resources/locales/my/numbers.yml +417 -0
  80. data/resources/locales/my/plural_rules.yml +6 -0
  81. data/resources/locales/my/plurals.yml +12 -0
  82. data/resources/locales/my/rbnf.yml +145 -0
  83. data/resources/locales/my/territories.yml +265 -0
  84. data/resources/locales/my/timezones.yml +1479 -0
  85. data/resources/locales/my/units.yml +759 -0
  86. data/resources/locales/th/plurals.yml +1 -1
  87. data/resources/shared/segments/dictionaries/burmesedict.dump +0 -0
  88. data/resources/shared/segments/dictionaries/cjdict.dump +0 -0
  89. data/resources/shared/segments/dictionaries/khmerdict.dump +0 -0
  90. data/resources/shared/segments/dictionaries/laodict.dump +0 -0
  91. data/resources/shared/segments/dictionaries/thaidict.dump +0 -0
  92. data/resources/shared/segments/tests/dictionary_tests/combined.yml +1253 -0
  93. data/resources/shared/segments/tests/dictionary_tests/km.yml +204 -0
  94. data/resources/shared/segments/tests/dictionary_tests/ko.yml +171 -0
  95. data/resources/shared/segments/tests/dictionary_tests/lo.yml +236 -0
  96. data/resources/shared/segments/tests/dictionary_tests/my.yml +249 -0
  97. data/resources/shared/segments/tests/dictionary_tests/th.yml +201 -0
  98. data/resources/shared/segments/tests/dictionary_tests/zh.yml +206 -0
  99. data/resources/shared/segments/tests/line_break_test.yml +68 -68
  100. data/resources/shared/segments/tests/sentence_break_test.yml +52 -52
  101. data/resources/supported_locales.yml +3 -0
  102. data/spec/formatters/numbers/rbnf/locales/km/rbnf_test.yml +706 -0
  103. data/spec/formatters/numbers/rbnf/locales/lo/rbnf_test.yml +706 -0
  104. data/spec/formatters/numbers/rbnf/locales/my/rbnf_test.yml +706 -0
  105. data/spec/segmentation/dictionary_break_spec.rb +42 -0
  106. data/spec/segmentation/rule_set_spec.rb +3 -1
  107. data/spec/timezones/tests/km.yml +12475 -0
  108. data/spec/timezones/tests/lo.yml +12475 -0
  109. data/spec/timezones/tests/my.yml +12475 -0
  110. metadata +87 -3
@@ -27,7 +27,7 @@ module TwitterCldr
27
27
  position >= text.size
28
28
  end
29
29
 
30
- def codepoint(pos = position)
30
+ def codepoint(pos = @position)
31
31
  codepoints[pos]
32
32
  end
33
33
 
@@ -0,0 +1,84 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ module TwitterCldr
7
+ module Segmentation
8
# Wraps a trie of dictionary words and reports which words start at a
# cursor's current position. Instances are created through the class-level
# accessors, which load and cache one dictionary per name.
class Dictionary

  class << self
    # Dictionary backed by the "burmesedict.dump" resource.
    def burmese
      get('burmese')
    end

    # Dictionary backed by the "cjdict.dump" resource.
    def cj
      get('cj')
    end

    # Dictionary backed by the "khmerdict.dump" resource.
    def khmer
      get('khmer')
    end

    # Dictionary backed by the "laodict.dump" resource.
    def lao
      get('lao')
    end

    # Dictionary backed by the "thaidict.dump" resource.
    def thai
      get('thai')
    end

    # Returns the dictionary registered under +name+. The resource
    # shared/segments/dictionaries/<name>dict.dump is loaded the first time
    # a name is requested; later calls reuse the cached instance.
    def get(name)
      dictionary_cache[name] ||= begin
        resource = TwitterCldr.get_resource(
          'shared', 'segments', 'dictionaries', "#{name}dict.dump"
        )

        new(resource)
      end
    end

    private

    # Per-class cache of Dictionary instances keyed by dictionary name.
    def dictionary_cache
      @dictionary_cache ||= {}
    end
  end

  attr_reader :trie

  # trie - object exposing #root, whose nodes respond to #child(codepoint),
  #        #has_value? and #value (that is all this class calls on it).
  def initialize(trie)
    @trie = trie
  end

  # Walks the trie along the code points that start at cursor.position and
  # collects every dictionary word found on the way.
  #
  # cursor            - responds to #length, #position and #codepoint(pos).
  # max_search_length - stop walking once this many code points were examined.
  # limit             - maximum number of words to record.
  #
  # Always returns a four element array [count, values, lengths, num_chars]:
  #   count     - number of words recorded (never more than limit)
  #   values    - trie value of each recorded word
  #   lengths   - length in code points of each recorded word
  #   num_chars - value of the code point counter when the walk stopped
  #
  # An empty cursor yields [0, [], [], 0]. Returning the same shape in every
  # case keeps destructuring callers from ending up with nil lengths or a
  # nil prefix length.
  def matches(cursor, max_search_length, limit)
    return [0, [], [], 0] if cursor.length == 0

    count = 0
    num_chars = 1
    current = trie.root.child(cursor.codepoint)
    values = []
    lengths = []

    until current.nil?
      # Record a word ending at this node, unless the quota is used up.
      if current.has_value? && count < limit
        values << current.value
        lengths << num_chars
        count += 1
      end

      break if num_chars >= max_search_length

      # Past the end of the text #codepoint is expected to be nil, for
      # which the trie has no child, so the walk stops on its own.
      current = current.child(
        cursor.codepoint(cursor.position + num_chars)
      )

      num_chars += 1
    end

    [count, values, lengths, num_chars]
  end

end
83
+ end
84
+ end
@@ -0,0 +1,34 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ module TwitterCldr
7
+ module Segmentation
8
# Base class for break engines that segment a run of text with the help of
# a dictionary. Subclasses supply #word_set (the code points the engine
# handles) and #divide_up_dictionary_range (the actual segmentation).
class DictionaryBreakEngine

  # Finds the run of code points, starting at cursor.position, that all
  # belong to #word_set and hands that run to #divide_up_dictionary_range,
  # forwarding the given block to it.
  #
  # cursor - responds to #position and #codepoints.
  #
  # Returns an Enumerator when no block is given.
  def each_boundary(cursor, &block)
    return to_enum(__method__, cursor) unless block_given?

    codepoints = cursor.codepoints
    stop = cursor.position

    # Bound the scan by the number of code points. The cursor itself is not
    # advanced in this loop, so testing cursor.eos? here would not stop
    # +stop+ from running past the end of the text.
    while stop < codepoints.length && word_set.include?(codepoints[stop])
      stop += 1
    end

    divide_up_dictionary_range(cursor, stop, &block)
  end

  # The set of code points this engine can segment; must respond to
  # #include?. Raises NotImplementedError unless overridden.
  def word_set(*args)
    raise NotImplementedError, "#{__method__} must be defined in derived classes"
  end

  private

  # Called by #each_boundary with (cursor, stop, &block); stop is the index
  # just past the last code point found in #word_set. Raises
  # NotImplementedError unless overridden.
  def divide_up_dictionary_range(*args)
    raise NotImplementedError, "#{__method__} must be defined in derived classes"
  end

end
33
+ end
34
+ end
@@ -0,0 +1,83 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'singleton'
7
+ require 'forwardable'
8
+
9
+ module TwitterCldr
10
+ module Segmentation
11
+
12
+ # https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/KhmerBreakEngine.java
13
# Singleton word break engine for Khmer. It holds no break logic of its own:
# it configures a BrahmicBreakEngine with Khmer specific settings and
# forwards each_boundary to it.
class KhmerBreakEngine

  include Singleton
  extend Forwardable

  def_delegators :engine, :each_boundary

  class << self
    # Memoized result of converting the UnicodeSet for
    # [[:Khmer:]&[:Line_Break=SA:]] with #to_set.
    def word_set
      return @word_set if @word_set

      uset = TwitterCldr::Shared::UnicodeSet.new
      uset.apply_pattern('[[:Khmer:]&[:Line_Break=SA:]]')
      @word_set = uset.to_set
    end
  end

  private

  # The Brahmic scripts, Khmer among them, share one break algorithm, so this
  # class composes a BrahmicBreakEngine rather than re-implementing it.
  def engine
    @engine ||= BrahmicBreakEngine.new(
      # Number of consecutive words considered "good enough"
      lookahead: 3,

      # A non-word is not merged into a preceding dictionary word that is
      # longer than this
      root_combine_threshold: 3,

      # A non-word sharing at least this much prefix with a dictionary word
      # is not merged into the preceding word
      prefix_combine_threshold: 3,

      # Smallest allowed word size
      min_word: 4,

      # Smallest number of characters for two words (equal to min_word for Khmer)
      min_word_span: 4,

      word_set: self.class.word_set,
      mark_set: mark_set,
      end_word_set: end_word_set,
      begin_word_set: begin_word_set,
      dictionary: Dictionary.khmer,

      # Suffix handling does not apply to Khmer, so this always reports 0
      advance_past_suffix: ->(*) { 0 }
    )
  end

  # UnicodeSet for [[:Khmer:]&[:Line_Break=SA:]&[:M:]] plus U+0020 (space).
  def mark_set
    @mark_set ||= begin
      marks = TwitterCldr::Shared::UnicodeSet.new
      marks.apply_pattern('[[:Khmer:]&[:Line_Break=SA:]&[:M:]]')
      marks.add(0x0020)
      marks
    end
  end

  # Everything in the word set except U+17D2.
  def end_word_set
    @end_word_set ||= begin
      enders = TwitterCldr::Shared::UnicodeSet.new
      enders.add_list(self.class.word_set)
      enders.subtract(0x17D2) # KHMER SIGN COENG that combines some characters
      enders
    end
  end

  # The range U+1780..U+17B3.
  # NOTE(review): presumably the Khmer consonants and independent vowels, as
  # in the ICU engine this class ports - confirm against the Unicode chart.
  def begin_word_set
    @begin_word_set ||= begin
      starters = TwitterCldr::Shared::UnicodeSet.new
      starters.add_range(0x1780..0x17B3)
      starters
    end
  end

end
82
+ end
83
+ end
@@ -0,0 +1,30 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'singleton'
7
+
8
+ module TwitterCldr
9
+ module Segmentation
10
# Singleton break engine for Korean. All behaviour comes from CjBreakEngine;
# this subclass only swaps in its own word set.
class KoreanBreakEngine < CjBreakEngine

  include Singleton

  class << self
    # Memoized result of converting a UnicodeSet holding the range
    # U+AC00..U+D7A3 with #to_set.
    # NOTE(review): that range looks like the Hangul Syllables block -
    # confirm against the Unicode charts.
    def word_set
      return @word_set if @word_set

      hangul = TwitterCldr::Shared::UnicodeSet.new
      hangul.add_range(0xAC00..0xD7A3)
      @word_set = hangul.to_set
    end
  end

  private

  # Instance level access to the class level word set.
  def word_set
    self.class.word_set
  end

end
29
+ end
30
+ end
@@ -0,0 +1,85 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'singleton'
7
+ require 'forwardable'
8
+
9
+ module TwitterCldr
10
+ module Segmentation
11
+
12
+ # See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/LaoBreakEngine.java
13
# Singleton word break engine for Lao. It holds no break logic of its own:
# it configures a BrahmicBreakEngine with Lao specific settings and forwards
# each_boundary to it.
class LaoBreakEngine

  include Singleton
  extend Forwardable

  def_delegators :engine, :each_boundary

  class << self
    # Memoized result of converting the UnicodeSet for
    # [[:Laoo:]&[:Line_Break=SA:]] with #to_set.
    def word_set
      return @word_set if @word_set

      uset = TwitterCldr::Shared::UnicodeSet.new
      uset.apply_pattern('[[:Laoo:]&[:Line_Break=SA:]]')
      @word_set = uset.to_set
    end
  end

  private

  # The Brahmic scripts, Lao among them, share one break algorithm, so this
  # class composes a BrahmicBreakEngine rather than re-implementing it.
  def engine
    @engine ||= BrahmicBreakEngine.new(
      # Number of consecutive words considered "good enough"
      lookahead: 3,

      # A non-word is not merged into a preceding dictionary word that is
      # longer than this
      root_combine_threshold: 3,

      # A non-word sharing at least this much prefix with a dictionary word
      # is not merged into the preceding word
      prefix_combine_threshold: 3,

      # Smallest allowed word size
      min_word: 2,

      # Smallest number of characters for two words (equal to min_word for Lao)
      min_word_span: 2,

      word_set: self.class.word_set,
      mark_set: mark_set,
      end_word_set: end_word_set,
      begin_word_set: begin_word_set,
      dictionary: Dictionary.lao,

      # Suffix handling does not apply to Lao, so this always reports 0
      advance_past_suffix: ->(*) { 0 }
    )
  end

  # UnicodeSet for [[:Laoo:]&[:Line_Break=SA:]&[:M:]] plus U+0020 (space).
  def mark_set
    @mark_set ||= begin
      marks = TwitterCldr::Shared::UnicodeSet.new
      marks.apply_pattern('[[:Laoo:]&[:Line_Break=SA:]&[:M:]]')
      marks.add(0x0020)
      marks
    end
  end

  # Everything in the word set except U+0EC0..U+0EC4.
  def end_word_set
    @end_word_set ||= begin
      enders = TwitterCldr::Shared::UnicodeSet.new
      enders.add_list(self.class.word_set)
      enders.subtract_range(0x0EC0..0x0EC4) # prefix vowels
      enders
    end
  end

  # Union of three ranges of code points that may begin a word.
  def begin_word_set
    @begin_word_set ||= begin
      starters = TwitterCldr::Shared::UnicodeSet.new
      starters.add_range(0x0E81..0x0EAE) # basic consonants (including holes for corresponding Thai characters)
      starters.add_range(0x0EDC..0x0EDD) # digraph consonants (no Thai equivalent)
      starters.add_range(0x0EC0..0x0EC4) # prefix vowels
      starters
    end
  end

end
84
+ end
85
+ end
@@ -0,0 +1,23 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ module TwitterCldr
7
+ module Segmentation
8
# Segment iterator for line boundaries.
class LineIterator < SegmentIterator
  # Yields each line boundary found in +str+; returns an Enumerator when
  # called without a block.
  #
  # The start-of-text position (0) is deliberately not yielded up front:
  # every boundary, the first one included, comes from the rule set's state
  # machine. The original author found that this makes nearly all of the
  # Unicode segmentation tests pass, while noting uncertainty about why the
  # line rules would differ from the usual implicit start-of-text boundary.
  def each_boundary(str, &block)
    unless block_given?
      return to_enum(__method__, str)
    end

    # create_cursor and rule_set are not defined here; presumably they are
    # inherited from SegmentIterator.
    rule_set.each_boundary(create_cursor(str), &block)
  end
end
22
+ end
23
+ end
@@ -0,0 +1,74 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ module TwitterCldr
7
+ module Segmentation
8
# Tracks the dictionary words that could start at one text position and
# lets a break engine step through them from longest to shortest.
class PossibleWord
  # list size, limited by the maximum number of words in the dictionary
  # that form a nested sequence.
  POSSIBLE_WORD_LIST_MAX = 20

  # Starts with no candidates. Every field gets a usable value so that
  # #back_up and #longest_prefix are safe to call before #candidates.
  def initialize
    @lengths = []
    @count = 0
    @offset = -1 # no text position has been examined yet
    @prefix = 0
    @current = -1
    @mark = -1
  end

  # Fills the list of candidates for cursor.position (only when that
  # position differs from the one examined last), selects the longest
  # candidate and returns the number of candidates found.
  #
  # cursor     - responds to #position and #position=.
  # dictionary - responds to #matches(cursor, max_length, limit).
  # end_pos    - index one past the last code point that may be examined.
  #
  # When candidates exist the cursor is left just after the longest one;
  # otherwise it is put back at the starting position.
  def candidates(cursor, dictionary, end_pos)
    start = cursor.position

    if start != @offset
      @offset = start
      count, _, lengths, prefix = dictionary.matches(
        cursor, end_pos - start, POSSIBLE_WORD_LIST_MAX
      )

      # A dictionary may answer with a bare 0 when there is nothing to
      # search; destructuring that leaves lengths and prefix nil, so fall
      # back to empty values instead of storing nil.
      @count = count
      @lengths = lengths || []
      @prefix = prefix || 0

      # dictionary leaves text after longest prefix, not longest word, so back up.
      cursor.position = start if @count <= 0
    end

    cursor.position = start + @lengths[@count - 1] if @count > 0

    @current = @count - 1
    @mark = @current

    @count
  end

  # Selects the currently marked candidate: moves the cursor just past it
  # and returns its length. The stored candidates are left untouched.
  def accept_marked(cursor)
    cursor.position = @offset + @lengths[@mark]
    @lengths[@mark]
  end

  # Backs up from the current candidate to the next shorter one. Returns
  # true and moves the cursor just past that candidate when one exists,
  # false otherwise.
  def back_up(cursor)
    return false unless @current > 0

    @current -= 1
    cursor.position = @offset + @lengths[@current]
    true
  end

  # Fourth element of the dictionary's #matches answer for this position:
  # the length of the longest prefix shared with a dictionary word
  # (0 when the dictionary supplied none).
  def longest_prefix
    @prefix
  end

  # Marks the current candidate as the preferred one.
  def mark_current
    @mark = @current
  end
end
73
+ end
74
+ end
@@ -0,0 +1,23 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ module TwitterCldr
7
+ module Segmentation
8
# Fixed size list of PossibleWord objects addressed like a ring: any index
# is wrapped into the list with a modulo.
class PossibleWordList

  attr_reader :length, :items

  # Builds +length+ fresh PossibleWord instances.
  def initialize(length)
    @length = length
    @items = Array.new(length) { PossibleWord.new }
  end

  # Returns the entry at +idx+ modulo the list length.
  def [](idx)
    slot = idx % length
    items[slot]
  end

end
22
+ end
23
+ end