twitter_cldr 5.1.0 → 5.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/Rakefile +5 -5
  3. data/lib/twitter_cldr.rb +1 -0
  4. data/lib/twitter_cldr/resources.rb +2 -8
  5. data/lib/twitter_cldr/resources/loader.rb +6 -4
  6. data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -1
  7. data/lib/twitter_cldr/resources/segment_rules_importer.rb +202 -0
  8. data/lib/twitter_cldr/resources/segment_tests_importer.rb +3 -1
  9. data/lib/twitter_cldr/segmentation.rb +10 -8
  10. data/lib/twitter_cldr/segmentation/break_iterator.rb +15 -11
  11. data/lib/twitter_cldr/segmentation/category_table.rb +56 -0
  12. data/lib/twitter_cldr/segmentation/cursor.rb +10 -5
  13. data/lib/twitter_cldr/segmentation/metadata.rb +20 -0
  14. data/lib/twitter_cldr/segmentation/null_suppressions.rb +18 -0
  15. data/lib/twitter_cldr/segmentation/rule_set.rb +23 -79
  16. data/lib/twitter_cldr/segmentation/state_machine.rb +125 -0
  17. data/lib/twitter_cldr/segmentation/state_table.rb +46 -0
  18. data/lib/twitter_cldr/segmentation/status_table.rb +30 -0
  19. data/lib/twitter_cldr/segmentation/suppressions.rb +79 -0
  20. data/lib/twitter_cldr/shared/caser.rb +1 -1
  21. data/lib/twitter_cldr/shared/locale.rb +6 -2
  22. data/lib/twitter_cldr/version.rb +1 -1
  23. data/resources/shared/segments/rules/el/sentence.yml +723 -0
  24. data/resources/shared/segments/rules/en-US-POSIX/word.yml +527 -0
  25. data/resources/shared/segments/rules/ja/line.yml +964 -0
  26. data/resources/shared/segments/rules/ja/word.yml +527 -0
  27. data/resources/shared/segments/rules/root/grapheme.yml +463 -0
  28. data/resources/shared/segments/rules/root/line.yml +964 -0
  29. data/resources/shared/segments/rules/root/sentence.yml +723 -0
  30. data/resources/shared/segments/rules/root/word.yml +527 -0
  31. data/resources/shared/segments/rules/zh-Hant/line.yml +964 -0
  32. data/resources/shared/segments/rules/zh/line.yml +964 -0
  33. data/resources/shared/segments/suppressions/de/sentence.yml +5 -0
  34. data/resources/shared/segments/suppressions/en/sentence.yml +5 -0
  35. data/resources/shared/segments/suppressions/es/sentence.yml +5 -0
  36. data/resources/shared/segments/suppressions/fr/sentence.yml +5 -0
  37. data/resources/shared/segments/suppressions/it/sentence.yml +5 -0
  38. data/resources/shared/segments/suppressions/pt/sentence.yml +5 -0
  39. data/resources/shared/segments/suppressions/ru/sentence.yml +5 -0
  40. data/resources/shared/segments/tests/grapheme_break_test.yml +603 -0
  41. data/resources/shared/segments/tests/line_break_test.yml +7348 -0
  42. data/resources/uli/segments/de.yml +5 -230
  43. data/resources/uli/segments/en.yml +3 -154
  44. data/resources/uli/segments/es.yml +5 -145
  45. data/resources/uli/segments/fr.yml +5 -68
  46. data/resources/uli/segments/it.yml +3 -48
  47. data/resources/uli/segments/pt.yml +5 -173
  48. data/resources/uli/segments/ru.yml +3 -10
  49. data/spec/segmentation/rule_set_spec.rb +54 -27
  50. metadata +29 -9
  51. data/lib/twitter_cldr/resources/uli.rb +0 -12
  52. data/lib/twitter_cldr/resources/uli/segment_exceptions_importer.rb +0 -59
  53. data/lib/twitter_cldr/segmentation/parser.rb +0 -71
  54. data/lib/twitter_cldr/segmentation/rule.rb +0 -79
  55. data/lib/twitter_cldr/segmentation/rule_set_builder.rb +0 -142
  56. data/resources/shared/segments/segments_root.yml +0 -869
  57. data/spec/segmentation/parser_spec.rb +0 -104
@@ -1,104 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- require 'spec_helper'
7
-
8
- describe 'Segmentation' do
9
- let(:parser) { TwitterCldr::Segmentation::Parser.new }
10
-
11
- def parse(tokens, options = {})
12
- parser.parse(tokens, options)
13
- end
14
-
15
- describe TwitterCldr::Segmentation::Parser do
16
- let(:symbol_table) do
17
- TwitterCldr::Parsers::SymbolTable.new({
18
- "$FOO" => parser.tokenize_regex("[abc]")
19
- })
20
- end
21
-
22
- describe "#parse" do
23
- it "should parse a rule with a break" do
24
- rule = parse("[a-z] ÷ [0-9]")
25
- expect(rule.left.to_regexp_str).to eq("(?:[\\u{0061}-\\u{007a}])")
26
- expect(rule.right.to_regexp_str).to eq("(?:[\\u{0030}-\\u{0039}])")
27
- expect(rule.boundary_symbol).to eq(:break)
28
- end
29
-
30
- it "should parse a rule with a non-break" do
31
- rule = parse("[a-z] × [0-9]")
32
-
33
- expect(rule.left.to_regexp_str).to eq(
34
- "(?:[\\u{0061}-\\u{007a}])"
35
- )
36
-
37
- expect(rule.right.to_regexp_str).to eq(
38
- "(?:[\\u{0030}-\\u{0039}])"
39
- )
40
-
41
- expect(rule.boundary_symbol).to eq(:no_break)
42
- end
43
-
44
- it "should parse a rule containing a variable" do
45
- rule = parse("$FOO × bar", symbol_table: symbol_table)
46
-
47
- expect(rule.left.to_regexp_str).to eq(
48
- "(?:[\\u{0061}-\\u{0063}])"
49
- )
50
-
51
- expect(rule.right.to_regexp_str).to eq(
52
- "(?:\\u{0062})(?:\\u{0061})(?:\\u{0072})"
53
- )
54
-
55
- expect(rule.boundary_symbol).to eq(:no_break)
56
- end
57
- end
58
- end
59
-
60
- describe TwitterCldr::Segmentation::BreakRule do
61
- describe "#match" do
62
- let(:rule) { parse("[a-z] ÷ [0-9]") }
63
-
64
- it "rule should be the right type" do
65
- expect(rule).to be_a(TwitterCldr::Segmentation::BreakRule)
66
- end
67
-
68
- it "should match and return the right offset and text" do
69
- cursor = TwitterCldr::Segmentation::Cursor.new("c7")
70
- match = rule.match(cursor)
71
- expect(match.boundary_offset).to eq([0, 2])
72
- expect(match.boundary_position).to eq(1)
73
- end
74
-
75
- it "should not match if the input string doesn't contain a matching right- and/or left-hand side" do
76
- expect(rule.match(TwitterCldr::Segmentation::Cursor.new("C7"))).to be_nil
77
- expect(rule.match(TwitterCldr::Segmentation::Cursor.new("cc"))).to be_nil
78
- expect(rule.match(TwitterCldr::Segmentation::Cursor.new("CC"))).to be_nil
79
- end
80
- end
81
- end
82
-
83
- describe TwitterCldr::Segmentation::NoBreakRule do
84
- describe "#match" do
85
- let(:rule) { parse("[a-z] × [0-9]") }
86
-
87
- it "rule should be the right type" do
88
- expect(rule).to be_a(TwitterCldr::Segmentation::NoBreakRule)
89
- end
90
-
91
- it "should match and return the right offset and text" do
92
- match = rule.match(TwitterCldr::Segmentation::Cursor.new("c7"))
93
- expect(match.boundary_offset).to eq([0, 2])
94
- expect(match.boundary_position).to eq(1)
95
- end
96
-
97
- it "should not match if the input string doesn't contain matching text" do
98
- expect(rule.match(TwitterCldr::Segmentation::Cursor.new("C7"))).to be_nil
99
- expect(rule.match(TwitterCldr::Segmentation::Cursor.new("cc"))).to be_nil
100
- expect(rule.match(TwitterCldr::Segmentation::Cursor.new("CC"))).to be_nil
101
- end
102
- end
103
- end
104
- end