twitter_cldr 5.1.0 → 5.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/Rakefile +5 -5
  3. data/lib/twitter_cldr.rb +1 -0
  4. data/lib/twitter_cldr/resources.rb +2 -8
  5. data/lib/twitter_cldr/resources/loader.rb +6 -4
  6. data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -1
  7. data/lib/twitter_cldr/resources/segment_rules_importer.rb +202 -0
  8. data/lib/twitter_cldr/resources/segment_tests_importer.rb +3 -1
  9. data/lib/twitter_cldr/segmentation.rb +10 -8
  10. data/lib/twitter_cldr/segmentation/break_iterator.rb +15 -11
  11. data/lib/twitter_cldr/segmentation/category_table.rb +56 -0
  12. data/lib/twitter_cldr/segmentation/cursor.rb +10 -5
  13. data/lib/twitter_cldr/segmentation/metadata.rb +20 -0
  14. data/lib/twitter_cldr/segmentation/null_suppressions.rb +18 -0
  15. data/lib/twitter_cldr/segmentation/rule_set.rb +23 -79
  16. data/lib/twitter_cldr/segmentation/state_machine.rb +125 -0
  17. data/lib/twitter_cldr/segmentation/state_table.rb +46 -0
  18. data/lib/twitter_cldr/segmentation/status_table.rb +30 -0
  19. data/lib/twitter_cldr/segmentation/suppressions.rb +79 -0
  20. data/lib/twitter_cldr/shared/caser.rb +1 -1
  21. data/lib/twitter_cldr/shared/locale.rb +6 -2
  22. data/lib/twitter_cldr/version.rb +1 -1
  23. data/resources/shared/segments/rules/el/sentence.yml +723 -0
  24. data/resources/shared/segments/rules/en-US-POSIX/word.yml +527 -0
  25. data/resources/shared/segments/rules/ja/line.yml +964 -0
  26. data/resources/shared/segments/rules/ja/word.yml +527 -0
  27. data/resources/shared/segments/rules/root/grapheme.yml +463 -0
  28. data/resources/shared/segments/rules/root/line.yml +964 -0
  29. data/resources/shared/segments/rules/root/sentence.yml +723 -0
  30. data/resources/shared/segments/rules/root/word.yml +527 -0
  31. data/resources/shared/segments/rules/zh-Hant/line.yml +964 -0
  32. data/resources/shared/segments/rules/zh/line.yml +964 -0
  33. data/resources/shared/segments/suppressions/de/sentence.yml +5 -0
  34. data/resources/shared/segments/suppressions/en/sentence.yml +5 -0
  35. data/resources/shared/segments/suppressions/es/sentence.yml +5 -0
  36. data/resources/shared/segments/suppressions/fr/sentence.yml +5 -0
  37. data/resources/shared/segments/suppressions/it/sentence.yml +5 -0
  38. data/resources/shared/segments/suppressions/pt/sentence.yml +5 -0
  39. data/resources/shared/segments/suppressions/ru/sentence.yml +5 -0
  40. data/resources/shared/segments/tests/grapheme_break_test.yml +603 -0
  41. data/resources/shared/segments/tests/line_break_test.yml +7348 -0
  42. data/resources/uli/segments/de.yml +5 -230
  43. data/resources/uli/segments/en.yml +3 -154
  44. data/resources/uli/segments/es.yml +5 -145
  45. data/resources/uli/segments/fr.yml +5 -68
  46. data/resources/uli/segments/it.yml +3 -48
  47. data/resources/uli/segments/pt.yml +5 -173
  48. data/resources/uli/segments/ru.yml +3 -10
  49. data/spec/segmentation/rule_set_spec.rb +54 -27
  50. metadata +29 -9
  51. data/lib/twitter_cldr/resources/uli.rb +0 -12
  52. data/lib/twitter_cldr/resources/uli/segment_exceptions_importer.rb +0 -59
  53. data/lib/twitter_cldr/segmentation/parser.rb +0 -71
  54. data/lib/twitter_cldr/segmentation/rule.rb +0 -79
  55. data/lib/twitter_cldr/segmentation/rule_set_builder.rb +0 -142
  56. data/resources/shared/segments/segments_root.yml +0 -869
  57. data/spec/segmentation/parser_spec.rb +0 -104
@@ -6,10 +6,12 @@
6
6
  module TwitterCldr
7
7
  module Segmentation
8
8
  class Cursor
9
- attr_reader :text, :position, :match_cache
9
+ attr_reader :text, :codepoints
10
+ attr_accessor :position
10
11
 
11
12
  def initialize(text)
12
13
  @text = text
14
+ @codepoints = text.codepoints
13
15
  reset
14
16
  end
15
17
 
@@ -19,15 +21,18 @@ module TwitterCldr
19
21
 
20
22
  def reset
21
23
  @position = 0
22
- @match_cache = {}
23
24
  end
24
25
 
25
- def eof?
26
+ def eos?
26
27
  position >= text.size
27
28
  end
28
29
 
29
- def eos?
30
- position >= text.size - 1
30
+ def codepoint(pos = position)
31
+ codepoints[pos]
32
+ end
33
+
34
+ def length
35
+ text.length
31
36
  end
32
37
  end
33
38
  end
@@ -0,0 +1,20 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ module TwitterCldr
7
+ module Segmentation
8
+ class Metadata
9
+ attr_reader :values
10
+
11
+ def initialize(values)
12
+ @values = values
13
+ end
14
+
15
+ def category_count
16
+ @category_count ||= values[:category_count]
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,18 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'singleton'
7
+
8
+ module TwitterCldr
9
+ module Segmentation
10
+ class NullSuppressions
11
+ include Singleton
12
+
13
+ def should_break?(_cursor)
14
+ true
15
+ end
16
+ end
17
+ end
18
+ end
@@ -8,109 +8,53 @@ module TwitterCldr
8
8
  class RuleSet
9
9
 
10
10
  class << self
11
- def load(*args)
12
- RuleSetBuilder.load(*args)
11
+ def create(locale, boundary_type, options = {})
12
+ new(locale, StateMachine.instance(boundary_type, locale), options)
13
13
  end
14
14
  end
15
15
 
16
- attr_reader :locale, :rules, :boundary_type
16
+ attr_reader :locale, :state_machine
17
17
  attr_accessor :use_uli_exceptions
18
18
 
19
19
  alias_method :use_uli_exceptions?, :use_uli_exceptions
20
20
 
21
- def initialize(locale, rules, boundary_type, options)
21
+ def initialize(locale, state_machine, options)
22
22
  @locale = locale
23
- @rules = rules
24
- @boundary_type = boundary_type
23
+ @state_machine = state_machine
25
24
  @use_uli_exceptions = options.fetch(
26
25
  :use_uli_exceptions, false
27
26
  )
28
27
  end
29
28
 
30
29
  def each_boundary(str)
31
- if block_given?
32
- cursor = Cursor.new(str)
33
- last_boundary = 0
30
+ return to_enum(__method__, str) unless block_given?
34
31
 
35
- # implicit start of text boundary
36
- yield 0
32
+ cursor = Cursor.new(str)
37
33
 
38
- until cursor.eof?
39
- match = find_match(cursor)
40
- rule = match.rule
34
+ # Let the state machine find the first boundary for the line
35
+ # boundary type. This helps pass nearly all the Unicode
36
+ # segmentation tests, so it must be the right thing to do.
37
+ # Normally the first boundary is the implicit start of text
38
+ # boundary, but potentially not for the line rules?
39
+ yield 0 unless state_machine.boundary_type == 'line'
41
40
 
42
- if rule.break?
43
- yield match.boundary_position
44
- last_boundary = match.boundary_position
45
- end
46
-
47
- if match.boundary_position == cursor.position
48
- cursor.advance
49
- else
50
- cursor.advance(
51
- match.boundary_position - cursor.position
52
- )
53
- end
54
- end
55
-
56
- # implicit end of text boundary
57
- yield str.size unless last_boundary == str.size
58
- else
59
- to_enum(__method__, str)
41
+ until cursor.eos?
42
+ state_machine.handle_next(cursor)
43
+ yield cursor.position if suppressions.should_break?(cursor)
60
44
  end
61
45
  end
62
46
 
63
- private
64
-
65
- def each_rule(&block)
66
- if block_given?
67
- if use_uli_exceptions? && supports_exceptions?
68
- yield exception_rule
69
- end
70
-
71
- rules.each(&block)
72
- else
73
- to_enum(__method__)
74
- end
75
- end
76
-
77
- def exception_rule
78
- @exception_rule ||= RuleSetBuilder.exception_rule_for(
79
- locale, boundary_type
80
- )
81
- end
82
-
83
- def supports_exceptions?
84
- boundary_type == 'sentence'
47
+ def boundary_type
48
+ state_machine.boundary_type
85
49
  end
86
50
 
87
- def find_match(cursor)
88
- match = find_cached_match(cursor)
51
+ private
89
52
 
90
- match || if cursor.eos?
91
- RuleSetBuilder.implicit_end_of_text_rule.match(cursor)
53
+ def suppressions
54
+ @suppressions ||= if use_uli_exceptions?
55
+ Suppressions.instance(boundary_type, locale)
92
56
  else
93
- RuleSetBuilder.implicit_final_rule.match(cursor)
94
- end
95
- end
96
-
97
- def find_cached_match(cursor)
98
- cursor.match_cache.fetch(cursor.position) do
99
- matches = match_all(cursor)
100
-
101
- matches.each do |m|
102
- cursor.match_cache[m.boundary_position - 1] ||= m
103
- end
104
-
105
- matches.first
106
- end
107
- end
108
-
109
- def match_all(cursor)
110
- each_rule.each_with_object([]) do |rule, ret|
111
- if match = rule.match(cursor)
112
- ret << match
113
- end
57
+ NullSuppressions.instance
114
58
  end
115
59
  end
116
60
  end
@@ -0,0 +1,125 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'base64'
7
+ require 'singleton'
8
+
9
+ module TwitterCldr
10
+ module Segmentation
11
+ class StateMachine
12
+ include Singleton
13
+
14
+ START_STATE = 1
15
+ STOP_STATE = 0
16
+ NEXT_STATES = 4
17
+ ACCEPTING = 0
18
+
19
+ class << self
20
+ def instance(boundary_type, locale)
21
+ resource_path = find_resource(boundary_type, locale)
22
+
23
+ cache[resource_path] ||= begin
24
+ rsrc = TwitterCldr.get_resource(resource_path)
25
+
26
+ new(
27
+ boundary_type,
28
+ locale,
29
+ Metadata.new(rsrc[:metadata]),
30
+ StateTable.load16(rsrc[:forward_table]),
31
+ StateTable.load16(rsrc[:backward_table]),
32
+ StatusTable.load(rsrc[:status_table]),
33
+ CategoryTable.load16(rsrc[:category_table])
34
+ )
35
+ end
36
+ end
37
+
38
+ private
39
+
40
+ def find_resource(boundary_type, locale)
41
+ path = TwitterCldr.resource_file_path(
42
+ ['shared', 'segments', 'rules', locale, boundary_type]
43
+ )
44
+
45
+ return path if TwitterCldr.resource_exists?(path)
46
+
47
+ TwitterCldr.resource_file_path(
48
+ ['shared', 'segments', 'rules', 'root', boundary_type]
49
+ )
50
+ end
51
+
52
+ def cache
53
+ @cache ||= {}
54
+ end
55
+ end
56
+
57
+ attr_reader :boundary_type, :locale
58
+ attr_reader :metadata, :ftable, :rtable, :status_table, :category_table
59
+
60
+ def initialize(boundary_type, locale, metadata, ftable, rtable, status_table, category_table)
61
+ @boundary_type = boundary_type
62
+ @locale = locale
63
+ @metadata = metadata
64
+ @ftable = ftable
65
+ @rtable = rtable
66
+ @status_table = status_table
67
+ @category_table = category_table
68
+ end
69
+
70
+ def handle_next(cursor)
71
+ result = initial_position = cursor.position
72
+ state = START_STATE
73
+ row = row_index_for(state)
74
+ category = 3
75
+ mode = :run
76
+
77
+ if ftable.bof_required?
78
+ category = 2
79
+ mode = :start
80
+ end
81
+
82
+ until state == STOP_STATE
83
+ if cursor.eos?
84
+ break if mode == :stop
85
+ mode = :stop
86
+ category = 1
87
+ elsif mode == :run
88
+ category = category_table.get(cursor.codepoint)
89
+
90
+ if (category & 0x4000) != 0
91
+ category &= ~0x4000
92
+ end
93
+
94
+ cursor.advance
95
+ else
96
+ mode = :run
97
+ end
98
+
99
+ state = ftable[row + NEXT_STATES + category]
100
+ row = row_index_for(state)
101
+
102
+ if ftable[row + ACCEPTING] == -1
103
+ # match found
104
+ result = cursor.position
105
+ end
106
+ end
107
+
108
+ cursor.position = result
109
+
110
+ # don't let cursor get stuck
111
+ if cursor.position == initial_position
112
+ cursor.advance
113
+ end
114
+
115
+ result
116
+ end
117
+
118
+ private
119
+
120
+ def row_index_for(state)
121
+ state * (metadata.category_count + 4)
122
+ end
123
+ end
124
+ end
125
+ end
@@ -0,0 +1,46 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'base64'
7
+
8
+ module TwitterCldr
9
+ module Segmentation
10
+ class StateTable
11
+ PACK_FMT_16 = 's!*'.freeze
12
+ BOF_REQUIRED_FLAG = 2
13
+
14
+ class << self
15
+ def load16(hash)
16
+ new(
17
+ Base64.decode64(hash[:table]).unpack(PACK_FMT_16),
18
+ hash[:flags]
19
+ )
20
+ end
21
+ end
22
+
23
+ attr_reader :values, :flags
24
+
25
+ def initialize(values, flags)
26
+ @values = values
27
+ @flags = flags
28
+ end
29
+
30
+ def [](idx)
31
+ values[idx]
32
+ end
33
+
34
+ def bof_required?
35
+ flags & BOF_REQUIRED_FLAG != 0
36
+ end
37
+
38
+ def dump16
39
+ {
40
+ table: Base64.encode64(values.pack(PACK_FMT_16)).strip,
41
+ flags: flags
42
+ }
43
+ end
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,30 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'base64'
7
+
8
+ module TwitterCldr
9
+ module Segmentation
10
+ class StatusTable
11
+ PACK_FMT = 'I!*'.freeze
12
+
13
+ class << self
14
+ def load(hash)
15
+ new(Base64.decode64(hash[:table]).unpack(PACK_FMT))
16
+ end
17
+ end
18
+
19
+ attr_reader :values
20
+
21
+ def initialize(values)
22
+ @values = values
23
+ end
24
+
25
+ def dump
26
+ { table: Base64.encode64(values.pack(PACK_FMT)).strip }
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,79 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'singleton'
7
+
8
+ module TwitterCldr
9
+ module Segmentation
10
+ class Suppressions
11
+ include Singleton
12
+
13
+ class << self
14
+ def instance(boundary_type, locale)
15
+ resource_path = find_resource(boundary_type, locale)
16
+ return NullSuppressions.instance unless resource_path
17
+
18
+ cache[resource_path] ||= begin
19
+ rsrc = TwitterCldr.get_resource(resource_path)
20
+
21
+ new(
22
+ Marshal.load(rsrc[:forwards_trie]),
23
+ Marshal.load(rsrc[:backwards_trie])
24
+ )
25
+ end
26
+ end
27
+
28
+ private
29
+
30
+ def find_resource(boundary_type, locale)
31
+ path = TwitterCldr.resource_file_path(
32
+ ['shared', 'segments', 'suppressions', locale, boundary_type]
33
+ )
34
+
35
+ path if TwitterCldr.resource_exists?(path)
36
+ end
37
+
38
+ def cache
39
+ @cache ||= {}
40
+ end
41
+ end
42
+
43
+ attr_reader :forward_trie, :backward_trie
44
+
45
+ def initialize(forward_trie, backward_trie)
46
+ @forward_trie = forward_trie
47
+ @backward_trie = backward_trie
48
+ end
49
+
50
+ def should_break?(cursor)
51
+ idx = cursor.position
52
+
53
+ # consider the case when a space follows the '.' (so we handle e.g. "Mr. Brown")
54
+ idx -= 2 if cursor.codepoint(idx - 1) == 32
55
+ node = backward_trie.root
56
+
57
+ found = loop do
58
+ break false if idx < 0 || idx >= cursor.length
59
+ node = node.child(cursor.codepoint(idx))
60
+ break false unless node
61
+ break true if node.value
62
+ idx -= 1
63
+ end
64
+
65
+ return true unless found
66
+
67
+ node = forward_trie.root
68
+
69
+ loop do
70
+ return true if idx >= cursor.length
71
+ node = node.child(cursor.codepoint(idx))
72
+ return true unless node
73
+ return false if node.value
74
+ idx += 1
75
+ end
76
+ end
77
+ end
78
+ end
79
+ end