twitter_cldr 5.1.0 → 5.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/Rakefile +5 -5
  3. data/lib/twitter_cldr.rb +1 -0
  4. data/lib/twitter_cldr/resources.rb +2 -8
  5. data/lib/twitter_cldr/resources/loader.rb +6 -4
  6. data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -1
  7. data/lib/twitter_cldr/resources/segment_rules_importer.rb +202 -0
  8. data/lib/twitter_cldr/resources/segment_tests_importer.rb +3 -1
  9. data/lib/twitter_cldr/segmentation.rb +10 -8
  10. data/lib/twitter_cldr/segmentation/break_iterator.rb +15 -11
  11. data/lib/twitter_cldr/segmentation/category_table.rb +56 -0
  12. data/lib/twitter_cldr/segmentation/cursor.rb +10 -5
  13. data/lib/twitter_cldr/segmentation/metadata.rb +20 -0
  14. data/lib/twitter_cldr/segmentation/null_suppressions.rb +18 -0
  15. data/lib/twitter_cldr/segmentation/rule_set.rb +23 -79
  16. data/lib/twitter_cldr/segmentation/state_machine.rb +125 -0
  17. data/lib/twitter_cldr/segmentation/state_table.rb +46 -0
  18. data/lib/twitter_cldr/segmentation/status_table.rb +30 -0
  19. data/lib/twitter_cldr/segmentation/suppressions.rb +79 -0
  20. data/lib/twitter_cldr/shared/caser.rb +1 -1
  21. data/lib/twitter_cldr/shared/locale.rb +6 -2
  22. data/lib/twitter_cldr/version.rb +1 -1
  23. data/resources/shared/segments/rules/el/sentence.yml +723 -0
  24. data/resources/shared/segments/rules/en-US-POSIX/word.yml +527 -0
  25. data/resources/shared/segments/rules/ja/line.yml +964 -0
  26. data/resources/shared/segments/rules/ja/word.yml +527 -0
  27. data/resources/shared/segments/rules/root/grapheme.yml +463 -0
  28. data/resources/shared/segments/rules/root/line.yml +964 -0
  29. data/resources/shared/segments/rules/root/sentence.yml +723 -0
  30. data/resources/shared/segments/rules/root/word.yml +527 -0
  31. data/resources/shared/segments/rules/zh-Hant/line.yml +964 -0
  32. data/resources/shared/segments/rules/zh/line.yml +964 -0
  33. data/resources/shared/segments/suppressions/de/sentence.yml +5 -0
  34. data/resources/shared/segments/suppressions/en/sentence.yml +5 -0
  35. data/resources/shared/segments/suppressions/es/sentence.yml +5 -0
  36. data/resources/shared/segments/suppressions/fr/sentence.yml +5 -0
  37. data/resources/shared/segments/suppressions/it/sentence.yml +5 -0
  38. data/resources/shared/segments/suppressions/pt/sentence.yml +5 -0
  39. data/resources/shared/segments/suppressions/ru/sentence.yml +5 -0
  40. data/resources/shared/segments/tests/grapheme_break_test.yml +603 -0
  41. data/resources/shared/segments/tests/line_break_test.yml +7348 -0
  42. data/resources/uli/segments/de.yml +5 -230
  43. data/resources/uli/segments/en.yml +3 -154
  44. data/resources/uli/segments/es.yml +5 -145
  45. data/resources/uli/segments/fr.yml +5 -68
  46. data/resources/uli/segments/it.yml +3 -48
  47. data/resources/uli/segments/pt.yml +5 -173
  48. data/resources/uli/segments/ru.yml +3 -10
  49. data/spec/segmentation/rule_set_spec.rb +54 -27
  50. metadata +29 -9
  51. data/lib/twitter_cldr/resources/uli.rb +0 -12
  52. data/lib/twitter_cldr/resources/uli/segment_exceptions_importer.rb +0 -59
  53. data/lib/twitter_cldr/segmentation/parser.rb +0 -71
  54. data/lib/twitter_cldr/segmentation/rule.rb +0 -79
  55. data/lib/twitter_cldr/segmentation/rule_set_builder.rb +0 -142
  56. data/resources/shared/segments/segments_root.yml +0 -869
  57. data/spec/segmentation/parser_spec.rb +0 -104
@@ -6,10 +6,12 @@
6
6
  module TwitterCldr
7
7
  module Segmentation
8
8
  class Cursor
9
- attr_reader :text, :position, :match_cache
9
+ attr_reader :text, :codepoints
10
+ attr_accessor :position
10
11
 
11
12
  def initialize(text)
12
13
  @text = text
14
+ @codepoints = text.codepoints
13
15
  reset
14
16
  end
15
17
 
@@ -19,15 +21,18 @@ module TwitterCldr
19
21
 
20
22
  def reset
21
23
  @position = 0
22
- @match_cache = {}
23
24
  end
24
25
 
25
- def eof?
26
+ def eos?
26
27
  position >= text.size
27
28
  end
28
29
 
29
- def eos?
30
- position >= text.size - 1
30
+ def codepoint(pos = position)
31
+ codepoints[pos]
32
+ end
33
+
34
+ def length
35
+ text.length
31
36
  end
32
37
  end
33
38
  end
@@ -0,0 +1,20 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
module TwitterCldr
  module Segmentation
    # Read-only wrapper around the metadata that accompanies a set of
    # segmentation tables.
    class Metadata
      # The raw metadata object exactly as it was passed to the constructor.
      attr_reader :values

      # @param values [#[]] metadata indexed by symbol key; this class reads
      #   the +:category_count+ entry from it
      def initialize(values)
        @values = values
      end

      # Looks up +:category_count+ in the wrapped metadata. A truthy result
      # is remembered so later calls skip the lookup.
      #
      # @return the value stored under +:category_count+ (nil if absent)
      def category_count
        return @category_count if @category_count

        @category_count = values[:category_count]
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'singleton'
7
+
8
module TwitterCldr
  module Segmentation
    # Null-object stand-in for a suppression set: it answers the same
    # +should_break?+ question but never vetoes a boundary.
    class NullSuppressions
      include Singleton

      # @param _cursor ignored; accepted only so the signature matches other
      #   suppression objects
      # @return [true] unconditionally
      def should_break?(_cursor)
        true
      end
    end
  end
end
@@ -8,109 +8,53 @@ module TwitterCldr
8
8
  class RuleSet
9
9
 
10
10
  class << self
11
- def load(*args)
12
- RuleSetBuilder.load(*args)
11
+ def create(locale, boundary_type, options = {})
12
+ new(locale, StateMachine.instance(boundary_type, locale), options)
13
13
  end
14
14
  end
15
15
 
16
- attr_reader :locale, :rules, :boundary_type
16
+ attr_reader :locale, :state_machine
17
17
  attr_accessor :use_uli_exceptions
18
18
 
19
19
  alias_method :use_uli_exceptions?, :use_uli_exceptions
20
20
 
21
- def initialize(locale, rules, boundary_type, options)
21
+ def initialize(locale, state_machine, options)
22
22
  @locale = locale
23
- @rules = rules
24
- @boundary_type = boundary_type
23
+ @state_machine = state_machine
25
24
  @use_uli_exceptions = options.fetch(
26
25
  :use_uli_exceptions, false
27
26
  )
28
27
  end
29
28
 
30
29
  def each_boundary(str)
31
- if block_given?
32
- cursor = Cursor.new(str)
33
- last_boundary = 0
30
+ return to_enum(__method__, str) unless block_given?
34
31
 
35
- # implicit start of text boundary
36
- yield 0
32
+ cursor = Cursor.new(str)
37
33
 
38
- until cursor.eof?
39
- match = find_match(cursor)
40
- rule = match.rule
34
+ # Let the state machine find the first boundary for the line
35
+ # boundary type. This helps pass nearly all the Unicode
36
+ # segmentation tests, so it must be the right thing to do.
37
+ # Normally the first boundary is the implicit start of text
38
+ # boundary, but potentially not for the line rules?
39
+ yield 0 unless state_machine.boundary_type == 'line'
41
40
 
42
- if rule.break?
43
- yield match.boundary_position
44
- last_boundary = match.boundary_position
45
- end
46
-
47
- if match.boundary_position == cursor.position
48
- cursor.advance
49
- else
50
- cursor.advance(
51
- match.boundary_position - cursor.position
52
- )
53
- end
54
- end
55
-
56
- # implicit end of text boundary
57
- yield str.size unless last_boundary == str.size
58
- else
59
- to_enum(__method__, str)
41
+ until cursor.eos?
42
+ state_machine.handle_next(cursor)
43
+ yield cursor.position if suppressions.should_break?(cursor)
60
44
  end
61
45
  end
62
46
 
63
- private
64
-
65
- def each_rule(&block)
66
- if block_given?
67
- if use_uli_exceptions? && supports_exceptions?
68
- yield exception_rule
69
- end
70
-
71
- rules.each(&block)
72
- else
73
- to_enum(__method__)
74
- end
75
- end
76
-
77
- def exception_rule
78
- @exception_rule ||= RuleSetBuilder.exception_rule_for(
79
- locale, boundary_type
80
- )
81
- end
82
-
83
- def supports_exceptions?
84
- boundary_type == 'sentence'
47
+ def boundary_type
48
+ state_machine.boundary_type
85
49
  end
86
50
 
87
- def find_match(cursor)
88
- match = find_cached_match(cursor)
51
+ private
89
52
 
90
- match || if cursor.eos?
91
- RuleSetBuilder.implicit_end_of_text_rule.match(cursor)
53
+ def suppressions
54
+ @suppressions ||= if use_uli_exceptions?
55
+ Suppressions.instance(boundary_type, locale)
92
56
  else
93
- RuleSetBuilder.implicit_final_rule.match(cursor)
94
- end
95
- end
96
-
97
- def find_cached_match(cursor)
98
- cursor.match_cache.fetch(cursor.position) do
99
- matches = match_all(cursor)
100
-
101
- matches.each do |m|
102
- cursor.match_cache[m.boundary_position - 1] ||= m
103
- end
104
-
105
- matches.first
106
- end
107
- end
108
-
109
- def match_all(cursor)
110
- each_rule.each_with_object([]) do |rule, ret|
111
- if match = rule.match(cursor)
112
- ret << match
113
- end
57
+ NullSuppressions.instance
114
58
  end
115
59
  end
116
60
  end
@@ -0,0 +1,125 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'base64'
7
+ require 'singleton'
8
+
9
module TwitterCldr
  module Segmentation
    # Table-driven state machine that moves a cursor forward to the next
    # boundary. Instances are built from a segmentation rules resource and
    # cached per resource path.
    class StateMachine
      include Singleton

      START_STATE = 1
      STOP_STATE = 0
      # Offset, within a table row, of the first next-state column. The
      # columns before it are per-row header cells.
      NEXT_STATES = 4
      # Offset, within a table row, of the cell that marks an accepting row.
      ACCEPTING = 0

      class << self
        # Returns the state machine for the given boundary type and locale,
        # loading and caching it on first use.
        #
        # NOTE(review): the cache key is the resource path, and locales
        # without their own rules share the 'root' resource, so the cached
        # instance keeps the +locale+ of whichever caller built it first.
        def instance(boundary_type, locale)
          resource_path = find_resource(boundary_type, locale)

          cache[resource_path] ||= begin
            rsrc = TwitterCldr.get_resource(resource_path)

            new(
              boundary_type,
              locale,
              Metadata.new(rsrc[:metadata]),
              StateTable.load16(rsrc[:forward_table]),
              StateTable.load16(rsrc[:backward_table]),
              StatusTable.load(rsrc[:status_table]),
              CategoryTable.load16(rsrc[:category_table])
            )
          end
        end

        private

        # Resolves the rules resource for the locale, falling back to the
        # 'root' rules when the locale has none of its own.
        def find_resource(boundary_type, locale)
          path = TwitterCldr.resource_file_path(
            ['shared', 'segments', 'rules', locale, boundary_type]
          )

          return path if TwitterCldr.resource_exists?(path)

          TwitterCldr.resource_file_path(
            ['shared', 'segments', 'rules', 'root', boundary_type]
          )
        end

        def cache
          @cache ||= {}
        end
      end

      attr_reader :boundary_type, :locale
      attr_reader :metadata, :ftable, :rtable, :status_table, :category_table

      def initialize(boundary_type, locale, metadata, ftable, rtable, status_table, category_table)
        @boundary_type = boundary_type
        @locale = locale
        @metadata = metadata
        @ftable = ftable
        @rtable = rtable
        @status_table = status_table
        @category_table = category_table
      end

      # Runs the forward table from the cursor's current position and leaves
      # the cursor on the last position at which an accepting row was
      # reached. If no accepting row was reached, the cursor is moved one
      # step past where it started so that callers always make progress.
      #
      # @param cursor object responding to +position+, +position=+, +eos?+,
      #   +codepoint+ and +advance+
      # @return the cursor's position after the call
      def handle_next(cursor)
        initial_position = cursor.position
        result = initial_position
        state = START_STATE
        row = row_index_for(state)
        category = 3
        mode = :run

        # Tables that require it are fed category 2 once before any input
        # is consumed.
        if ftable.bof_required?
          category = 2
          mode = :start
        end

        until state == STOP_STATE
          if cursor.eos?
            # End of text: run the table one last time with category 1,
            # then stop on the following pass.
            break if mode == :stop
            mode = :stop
            category = 1
          elsif mode == :run
            # Bit 0x4000 is cleared before the category is used as a column
            # index (presumably ICU's dictionary-character flag - confirm).
            category = category_table.get(cursor.codepoint) & ~0x4000
            cursor.advance
          else
            # :start pass used the category chosen above without consuming
            # input; switch to normal processing from here on.
            mode = :run
          end

          state = ftable[row + NEXT_STATES + category]
          row = row_index_for(state)

          # -1 in the ACCEPTING cell marks a match at the current position
          result = cursor.position if ftable[row + ACCEPTING] == -1
        end

        cursor.position = result

        # don't let cursor get stuck
        cursor.advance if result == initial_position

        # Report where the cursor actually ended up, including the forced
        # single step above, so the return value and cursor always agree.
        cursor.position
      end

      private

      # Index of the first cell of +state+'s row: each row holds
      # NEXT_STATES header cells followed by one cell per category.
      def row_index_for(state)
        state * (metadata.category_count + NEXT_STATES)
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'base64'
7
+
8
module TwitterCldr
  module Segmentation
    # A flat list of integer table cells plus a flags bitmask, with helpers
    # to convert to and from a Base64-encoded packed form.
    class StateTable
      # NOTE(review): 's!' packs native-size, native-endian signed shorts,
      # so a table dumped on a host with a different byte order would decode
      # to different values - confirm resources are produced and consumed on
      # like-endian hosts.
      PACK_FMT_16 = 's!*'.freeze
      BOF_REQUIRED_FLAG = 2

      # Builds a table from a hash of the shape produced by #dump16.
      #
      # @param hash [Hash] +:table+ is the Base64 text of the packed cells,
      #   +:flags+ is the flags bitmask
      def self.load16(hash)
        packed = Base64.decode64(hash[:table])
        new(packed.unpack(PACK_FMT_16), hash[:flags])
      end

      attr_reader :values, :flags

      # @param values [Array<Integer>] table cells
      # @param flags [Integer] bitmask tested by #bof_required?
      def initialize(values, flags)
        @values = values
        @flags = flags
      end

      # @return the cell at +idx+ (nil when out of range)
      def [](idx)
        values[idx]
      end

      # @return [Boolean] whether the BOF_REQUIRED_FLAG bit is set in flags
      def bof_required?
        (flags & BOF_REQUIRED_FLAG) != 0
      end

      # Serializes the table to the hash shape accepted by .load16.
      def dump16
        encoded = Base64.encode64(values.pack(PACK_FMT_16)).strip
        { table: encoded, flags: flags }
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'base64'
7
+
8
module TwitterCldr
  module Segmentation
    # A list of integer status values with helpers to convert to and from a
    # Base64-encoded packed form.
    class StatusTable
      # NOTE(review): 'I!' packs native-size, native-endian unsigned ints,
      # so the encoded form depends on the host that produced it - confirm
      # resources are produced and consumed on like-endian hosts.
      PACK_FMT = 'I!*'.freeze

      # Builds a table from a hash of the shape produced by #dump.
      #
      # @param hash [Hash] +:table+ is the Base64 text of the packed values
      def self.load(hash)
        packed = Base64.decode64(hash[:table])
        new(packed.unpack(PACK_FMT))
      end

      attr_reader :values

      # @param values [Array<Integer>] status values
      def initialize(values)
        @values = values
      end

      # Serializes the table to the hash shape accepted by .load.
      def dump
        packed = values.pack(PACK_FMT)
        { table: Base64.encode64(packed).strip }
      end
    end
  end
end
@@ -0,0 +1,79 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'singleton'
7
+
8
module TwitterCldr
  module Segmentation
    # Decides whether a candidate boundary should be kept, using a pair of
    # tries loaded from a locale's suppressions resource. Instances are
    # cached per resource path.
    class Suppressions
      include Singleton

      class << self
        # Returns the suppressions for the given boundary type and locale,
        # or the NullSuppressions instance when the locale has no
        # suppressions resource for that boundary type.
        def instance(boundary_type, locale)
          resource_path = find_resource(boundary_type, locale)
          return NullSuppressions.instance unless resource_path

          cache[resource_path] ||= begin
            rsrc = TwitterCldr.get_resource(resource_path)

            # NOTE(review): Marshal.load can instantiate arbitrary objects;
            # this is only acceptable because the data comes from resource
            # files shipped with the library. Never feed it external input.
            new(
              Marshal.load(rsrc[:forwards_trie]),
              Marshal.load(rsrc[:backwards_trie])
            )
          end
        end

        private

        # @return the resource path, or nil when no such resource exists
        def find_resource(boundary_type, locale)
          path = TwitterCldr.resource_file_path(
            ['shared', 'segments', 'suppressions', locale, boundary_type]
          )

          path if TwitterCldr.resource_exists?(path)
        end

        def cache
          @cache ||= {}
        end
      end

      attr_reader :forward_trie, :backward_trie

      # @param forward_trie trie whose nodes respond to +child+ and +value+,
      #   walked left-to-right over the text
      # @param backward_trie trie of the same shape, walked right-to-left
      def initialize(forward_trie, backward_trie)
        @forward_trie = forward_trie
        @backward_trie = backward_trie
      end

      # Returns false when the text around the cursor matches a suppression
      # (so the candidate boundary at the cursor should be dropped), true
      # otherwise.
      #
      # @param cursor object responding to +position+, +codepoint(pos)+ and
      #   +length+
      def should_break?(cursor)
        idx = cursor.position

        # When the codepoint just before the cursor is a space (32), start
        # the backward walk two positions back, e.g. on the '.' of
        # "Mr. Brown". Only look behind when something precedes the cursor:
        # at position 0, idx - 1 is -1, which would wrap around and read the
        # last codepoint of the text.
        idx -= 2 if idx > 0 && cursor.codepoint(idx - 1) == 32
        node = backward_trie.root

        # Walk leftwards through the backward trie until a node carrying a
        # value is reached; idx is left on the position where that happened.
        found = loop do
          break false if idx < 0 || idx >= cursor.length
          node = node.child(cursor.codepoint(idx))
          break false unless node
          break true if node.value
          idx -= 1
        end

        return true unless found

        node = forward_trie.root

        # Walk rightwards from that position through the forward trie; only
        # reaching a node that carries a value suppresses the break.
        loop do
          return true if idx >= cursor.length
          node = node.child(cursor.codepoint(idx))
          return true unless node
          return false if node.value
          idx += 1
        end
      end
    end
  end
end