langusta 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +7 -0
- data/Gemfile +10 -7
- data/Gemfile.lock +12 -16
- data/{README.rdoc → README.md} +27 -10
- data/Rakefile +3 -10
- data/VERSION +1 -1
- data/langusta.gemspec +23 -47
- data/lib/langusta.rb +36 -10
- data/lib/langusta/codepoints.rb +19 -0
- data/lib/langusta/command.rb +3 -3
- data/lib/langusta/detector.rb +16 -13
- data/lib/langusta/detector_factory.rb +11 -5
- data/lib/langusta/guard.rb +22 -0
- data/lib/langusta/inspector.rb +7 -0
- data/lib/langusta/java_property_reader.rb +2 -3
- data/lib/langusta/lang_profile.rb +12 -18
- data/lib/langusta/language_detection_facade.rb +2 -2
- data/lib/langusta/n_gram.rb +20 -25
- data/lib/langusta/regex_helper.rb +15 -10
- data/lib/langusta/tag_extractor.rb +5 -5
- data/lib/langusta/unicode_block.rb +34 -34
- data/test/helper.rb +12 -3
- data/test/quality/test_falsified.rb +3 -3
- data/test/test_command.rb +1 -0
- data/test/test_detector.rb +18 -17
- data/test/test_detector_factory.rb +17 -5
- data/test/test_java_property_reader.rb +2 -1
- data/test/test_lang_profile.rb +37 -31
- data/test/test_language.rb +1 -0
- data/test/test_language_detection_facade.rb +1 -1
- data/test/test_langusta.rb +6 -6
- data/test/test_n_gram.rb +87 -75
- data/test/test_tag_extractor.rb +19 -18
- data/test/test_unicode_block.rb +2 -1
- metadata +54 -156
- data/lib/langusta/ucs2_string.rb +0 -70
- data/test/test_ucs2_string.rb +0 -9
@@ -8,8 +8,8 @@ class FalsifiedTest < Test::Unit::TestCase
|
|
8
8
|
profiles = Dir[File.join(PROFILES_PATH, '*')].map do |filename|
|
9
9
|
LangProfile.load_from_file(filename)
|
10
10
|
end
|
11
|
-
profiles.
|
12
|
-
factory.add_profile(profile
|
11
|
+
profiles.each do |profile|
|
12
|
+
factory.add_profile(profile)
|
13
13
|
end
|
14
14
|
|
15
15
|
incorrect_guesses = 0.0
|
@@ -18,7 +18,7 @@ class FalsifiedTest < Test::Unit::TestCase
|
|
18
18
|
Dir['test/test_data/*'].each do |filename|
|
19
19
|
language = filename.split(/\//).last
|
20
20
|
|
21
|
-
ucs2_content =
|
21
|
+
ucs2_content = Langusta.utf82cp(File.open(filename).read)
|
22
22
|
detector = factory.create
|
23
23
|
detector.append(ucs2_content)
|
24
24
|
|
data/test/test_command.rb
CHANGED
data/test/test_detector.rb
CHANGED
@@ -1,58 +1,59 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
require 'test/helper'
|
2
3
|
|
3
4
|
class DetectorTest < Test::Unit::TestCase
|
4
|
-
TRAINING_EN =
|
5
|
-
TRAINING_FR =
|
6
|
-
TRAINING_JP =
|
5
|
+
TRAINING_EN = [0x0061, 0x0061, 0x0061, 0x0062, 0x0062, 0x0063, 0x0063, 0x0064, 0x0065]
|
6
|
+
TRAINING_FR = [0x0061, 0x0062, 0x0062, 0x0063, 0x0063, 0x0063, 0x0063, 0x0064, 0x0064]
|
7
|
+
TRAINING_JP = [0x3042, 0x3042, 0x3042, 0x3044, 0x3046, 0x3048, 0x3048]
|
7
8
|
|
8
9
|
def setup
|
9
10
|
@factory = DetectorFactory.new
|
10
11
|
profile_en = LangProfile.new("en")
|
11
|
-
TRAINING_EN.
|
12
|
-
profile_en.add(
|
12
|
+
TRAINING_EN.each do |w|
|
13
|
+
profile_en.add([w])
|
13
14
|
end
|
14
|
-
@factory.add_profile(profile_en
|
15
|
+
@factory.add_profile(profile_en)
|
15
16
|
|
16
17
|
profile_fr = LangProfile.new("fr")
|
17
|
-
TRAINING_FR.
|
18
|
-
profile_fr.add(
|
18
|
+
TRAINING_FR.each do |w|
|
19
|
+
profile_fr.add([w])
|
19
20
|
end
|
20
|
-
@factory.add_profile(profile_fr
|
21
|
+
@factory.add_profile(profile_fr)
|
21
22
|
|
22
23
|
profile_jp = LangProfile.new("jp")
|
23
|
-
TRAINING_JP.
|
24
|
-
profile_jp.add(
|
24
|
+
TRAINING_JP.each do |w|
|
25
|
+
profile_jp.add([w])
|
25
26
|
end
|
26
|
-
@factory.add_profile(profile_jp
|
27
|
+
@factory.add_profile(profile_jp)
|
27
28
|
end
|
28
29
|
|
29
30
|
def test_detector1
|
30
31
|
detector = @factory.create()
|
31
|
-
detector.append(
|
32
|
+
detector.append([0x0061]) # "a"
|
32
33
|
assert_equal("en", detector.detect())
|
33
34
|
end
|
34
35
|
|
35
36
|
def test_detector2
|
36
37
|
detector = @factory.create()
|
37
|
-
detector.append(
|
38
|
+
detector.append([0x0062, 0x0020, 0x0064]) # "b d"
|
38
39
|
assert_equal("fr", detector.detect())
|
39
40
|
end
|
40
41
|
|
41
42
|
def test_detector3
|
42
43
|
detector = @factory.create()
|
43
|
-
detector.append(
|
44
|
+
detector.append([0x0064, 0x0020, 0x0065]) # "d e"
|
44
45
|
assert_equal("en", detector.detect())
|
45
46
|
end
|
46
47
|
|
47
48
|
def test_detector4
|
48
49
|
detector = @factory.create()
|
49
|
-
detector.append(
|
50
|
+
detector.append([0x3042, 0x3042, 0x3042, 0x3042, 0x0061])
|
50
51
|
assert_equal("jp", detector.detect())
|
51
52
|
end
|
52
53
|
|
53
54
|
def test_exceptions
|
54
55
|
detector = @factory.create()
|
55
|
-
detector.append(
|
56
|
+
detector.append([])
|
56
57
|
assert_raises(NoFeaturesInTextError) do
|
57
58
|
detector.detect()
|
58
59
|
end
|
@@ -1,28 +1,40 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
require 'test/helper'
|
2
3
|
|
3
4
|
class DetectorFactoryTest < Test::Unit::TestCase
|
4
5
|
def test_add_profile
|
5
|
-
profile = LangProfile.new
|
6
|
+
profile = LangProfile.new('sample')
|
6
7
|
factory = DetectorFactory.new
|
7
8
|
|
8
|
-
factory.add_profile(profile
|
9
|
+
factory.add_profile(profile)
|
9
10
|
|
10
11
|
detector = factory.create(0.123)
|
11
12
|
assert_equal(0.123, detector.alpha)
|
12
13
|
end
|
13
14
|
|
14
15
|
def test_exceptions
|
15
|
-
profile = LangProfile.new
|
16
|
+
profile = LangProfile.new('sample')
|
16
17
|
factory = DetectorFactory.new
|
17
18
|
|
18
19
|
assert_raises(NoProfilesLoadedError) do
|
19
20
|
factory.create()
|
20
21
|
end
|
21
22
|
|
22
|
-
factory.add_profile(profile
|
23
|
+
factory.add_profile(profile)
|
23
24
|
|
24
25
|
assert_raises(DuplicateProfilesError) do
|
25
|
-
factory.add_profile(profile
|
26
|
+
factory.add_profile(profile)
|
26
27
|
end
|
27
28
|
end
|
29
|
+
|
30
|
+
def test_inspect
|
31
|
+
profile = LangProfile.new('sample')
|
32
|
+
factory = DetectorFactory.new
|
33
|
+
|
34
|
+
factory.add_profile(profile)
|
35
|
+
|
36
|
+
assert_match(Regexp.new(factory.object_ptr), factory.inspect)
|
37
|
+
assert_match(/1 profile\(s\)/, factory.inspect)
|
38
|
+
assert_match(Regexp.new(factory.class.name), factory.inspect)
|
39
|
+
end
|
28
40
|
end
|
@@ -1,8 +1,9 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
require 'test/helper'
|
2
3
|
|
3
4
|
class JavaPropertyReaderTest < Test::Unit::TestCase
|
4
5
|
def test_parse
|
5
6
|
jpr = JavaPropertyReader.new(MESSAGES_PROPERTIES)
|
6
|
-
assert_equal(
|
7
|
+
assert_equal([0x4f7c, 0x6934], jpr["NGram.KANJI_1_0"])
|
7
8
|
end
|
8
9
|
end
|
data/test/test_lang_profile.rb
CHANGED
@@ -3,8 +3,12 @@ require 'test/helper'
|
|
3
3
|
|
4
4
|
class LangProfileTest < Test::Unit::TestCase
|
5
5
|
def test_lang_profile
|
6
|
-
|
7
|
-
|
6
|
+
assert_raises(ArgumentError) do
|
7
|
+
LangProfile.new
|
8
|
+
end
|
9
|
+
assert_raises(TypeError) do
|
10
|
+
LangProfile.new(nil)
|
11
|
+
end
|
8
12
|
end
|
9
13
|
|
10
14
|
def test_lang_profile_string_int
|
@@ -14,52 +18,54 @@ class LangProfileTest < Test::Unit::TestCase
|
|
14
18
|
|
15
19
|
def test_add
|
16
20
|
profile = LangProfile.new('en')
|
17
|
-
profile.add(
|
18
|
-
assert_equal(1, profile.freq[
|
19
|
-
profile.add(
|
20
|
-
assert_equal(2, profile.freq[
|
21
|
+
profile.add(utf82cp("a"))
|
22
|
+
assert_equal(1, profile.freq[utf82cp("a")])
|
23
|
+
profile.add(utf82cp("a"))
|
24
|
+
assert_equal(2, profile.freq[utf82cp("a")])
|
21
25
|
profile.omit_less_freq()
|
22
26
|
end
|
23
27
|
|
24
28
|
def test_add_illegally_1
|
25
|
-
profile = LangProfile.new
|
26
|
-
profile.add(
|
27
|
-
|
29
|
+
profile = LangProfile.new('sample')
|
30
|
+
profile.add(utf82cp("a"))
|
31
|
+
assert_equal(1, profile.freq[utf82cp("a")])
|
28
32
|
end
|
29
33
|
|
30
34
|
def test_add_illegally_2
|
31
35
|
profile = LangProfile.new('en')
|
32
|
-
profile.add(
|
33
|
-
profile.add(
|
34
|
-
profile.add(
|
35
|
-
assert_equal(1, profile.freq[
|
36
|
-
assert_nil(profile.freq[
|
37
|
-
assert_nil(profile.freq[
|
36
|
+
profile.add(utf82cp("a"))
|
37
|
+
profile.add(utf82cp(""))
|
38
|
+
profile.add(utf82cp("abcd"))
|
39
|
+
assert_equal(1, profile.freq[utf82cp("a")])
|
40
|
+
assert_nil(profile.freq[utf82cp("")])
|
41
|
+
assert_nil(profile.freq[utf82cp("abcd")])
|
38
42
|
end
|
39
43
|
|
40
44
|
def test_omit_less_freq
|
41
45
|
profile = LangProfile.new('en')
|
42
|
-
grams =
|
46
|
+
grams = [0x0061, 0x0062, 0x0063, 0x3042, 0x3044, 0x3046, 0x3048,
|
47
|
+
0x304a, 0x304b, 0x304c, 0x304d, 0x304e, 0x304f]
|
43
48
|
5.times do
|
44
49
|
grams.each do |gram|
|
45
|
-
profile.add(
|
50
|
+
profile.add([gram])
|
46
51
|
end
|
47
52
|
end
|
48
|
-
profile.add(
|
53
|
+
profile.add([0x3050])
|
49
54
|
|
50
|
-
assert_equal(5, profile.freq[
|
51
|
-
assert_equal(5, profile.freq[
|
52
|
-
assert_equal(1, profile.freq[
|
55
|
+
assert_equal(5, profile.freq[utf82cp("a")])
|
56
|
+
assert_equal(5, profile.freq[[0x3042]])
|
57
|
+
assert_equal(1, profile.freq[[0x3050]])
|
53
58
|
|
54
59
|
profile.omit_less_freq()
|
55
|
-
|
56
|
-
|
57
|
-
|
60
|
+
|
61
|
+
assert_nil(profile.freq[utf82cp("a")])
|
62
|
+
assert_equal(5, profile.freq[[0x3042]])
|
63
|
+
assert_nil(profile.freq[[0x3050]])
|
58
64
|
end
|
59
65
|
|
60
66
|
def test_omit_less_freq_illegally
|
61
|
-
profile = LangProfile.new
|
62
|
-
profile.omit_less_freq()
|
67
|
+
profile = LangProfile.new('sample')
|
68
|
+
assert_nil(profile.omit_less_freq())
|
63
69
|
end
|
64
70
|
|
65
71
|
def test_load_from_file
|
@@ -67,11 +73,11 @@ class LangProfileTest < Test::Unit::TestCase
|
|
67
73
|
profile = LangProfile.load_from_file(filename)
|
68
74
|
assert_equal(filename.split(/\//).last, profile.name)
|
69
75
|
has_content = [
|
70
|
-
profile.freq[
|
71
|
-
profile.freq[
|
72
|
-
profile.freq[
|
73
|
-
profile.freq[
|
74
|
-
profile.freq[
|
76
|
+
profile.freq[utf82cp(" A")], # Latin
|
77
|
+
profile.freq[[0x060c]], # Arabic
|
78
|
+
profile.freq[[0x0a85]], # Gujarati
|
79
|
+
profile.freq[[0x0905]], # Hindi
|
80
|
+
profile.freq[[0x3001]], # Japanese
|
75
81
|
].any?
|
76
82
|
assert(has_content, profile.inspect)
|
77
83
|
end
|
data/test/test_language.rb
CHANGED
@@ -4,6 +4,6 @@ require 'test/helper'
|
|
4
4
|
class LanguageDetectionFacadeTest < Test::Unit::TestCase
|
5
5
|
def test_initialize_and_detect
|
6
6
|
facade = LanguageDetectionFacade.new
|
7
|
-
assert_equal("pl", facade.detect(
|
7
|
+
assert_equal("pl", facade.detect(Langusta.utf82cp("Ich dalekopis fałszuje, gdy próby XQV nie wytrzymuje")))
|
8
8
|
end
|
9
9
|
end
|
data/test/test_langusta.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
require 'test/helper'
|
2
3
|
|
3
4
|
class LangustaTest < Test::Unit::TestCase
|
4
5
|
|
5
6
|
FACTORY = DetectorFactory.new
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
FACTORY.add_profile(profile, index, profiles.length)
|
7
|
+
|
8
|
+
Dir[File.join(PROFILES_PATH, '*')].each do |filename|
|
9
|
+
profile = LangProfile.load_from_file(filename)
|
10
|
+
FACTORY.add_profile(profile)
|
11
11
|
end
|
12
12
|
|
13
13
|
Dir['test/test_data/*'].each do |filename|
|
@@ -15,7 +15,7 @@ class LangustaTest < Test::Unit::TestCase
|
|
15
15
|
define_method(("test_%s_language" % [language]).to_sym) do
|
16
16
|
detector = FACTORY.create
|
17
17
|
|
18
|
-
ucs2_content =
|
18
|
+
ucs2_content = Langusta.utf82cp(File.open(filename).read)
|
19
19
|
detector = FACTORY.create
|
20
20
|
detector.append(ucs2_content)
|
21
21
|
|
data/test/test_n_gram.rb
CHANGED
@@ -7,50 +7,50 @@ class NGramTest < Test::Unit::TestCase
|
|
7
7
|
end
|
8
8
|
|
9
9
|
def test_normalize_with_latin
|
10
|
-
assert_equal(
|
11
|
-
assert_equal(
|
12
|
-
assert_equal(
|
13
|
-
assert_equal(
|
14
|
-
assert_equal(
|
15
|
-
assert_equal(
|
16
|
-
assert_equal(
|
17
|
-
assert_equal(
|
18
|
-
assert_equal(
|
19
|
-
assert_equal(
|
20
|
-
assert_equal(
|
21
|
-
assert_equal(
|
22
|
-
assert_equal(
|
23
|
-
assert_equal(
|
24
|
-
assert_equal(
|
25
|
-
assert_equal(
|
10
|
+
assert_equal(0x20, NGram.normalize(0x00)) # \0
|
11
|
+
assert_equal(0x20, NGram.normalize(0x09)) # <control>
|
12
|
+
assert_equal(0x20, NGram.normalize(0x20)) # space
|
13
|
+
assert_equal(0x20, NGram.normalize(0x30)) # 0
|
14
|
+
assert_equal(0x20, NGram.normalize(0x40)) # @
|
15
|
+
assert_equal(0x41, NGram.normalize(0x41)) # A
|
16
|
+
assert_equal(0x5a, NGram.normalize(0x5a)) # Z
|
17
|
+
assert_equal(0x20, NGram.normalize(0x5b)) # [
|
18
|
+
assert_equal(0x20, NGram.normalize(0x60)) # `
|
19
|
+
assert_equal(0x61, NGram.normalize(0x61)) # a
|
20
|
+
assert_equal(0x7a, NGram.normalize(0x7a)) # z
|
21
|
+
assert_equal(0x20, NGram.normalize(0x7b)) # {
|
22
|
+
assert_equal(0x20, NGram.normalize(0x7f)) # <control>
|
23
|
+
assert_equal(0x80, NGram.normalize(0x80)) # <control>
|
24
|
+
assert_equal(0x20, NGram.normalize(0xa0)) # <control>
|
25
|
+
assert_equal(0xa1, NGram.normalize(0xa1)) # <control>
|
26
26
|
end
|
27
27
|
|
28
28
|
def test_normalize_with_cjk_kanji
|
29
|
-
assert_equal(
|
30
|
-
assert_equal(
|
31
|
-
assert_equal(
|
32
|
-
assert_equal(
|
33
|
-
assert_equal(
|
34
|
-
assert_equal(
|
35
|
-
assert_equal(
|
36
|
-
assert_equal(
|
37
|
-
assert_equal(
|
38
|
-
assert_equal(
|
39
|
-
assert_equal(
|
40
|
-
assert_equal(
|
41
|
-
assert_equal(
|
42
|
-
assert_equal(
|
43
|
-
assert_equal(
|
44
|
-
assert_equal(
|
45
|
-
assert_equal(
|
46
|
-
assert_equal(
|
47
|
-
assert_equal(
|
48
|
-
assert_equal(
|
49
|
-
assert_equal(
|
50
|
-
assert_equal(
|
51
|
-
assert_equal(
|
52
|
-
assert_equal(
|
53
|
-
assert_equal(
|
29
|
+
assert_equal(0x4e00, NGram.normalize(0x4e00))
|
30
|
+
assert_equal(0x4e01, NGram.normalize(0x4e01))
|
31
|
+
assert_equal(0x4e02, NGram.normalize(0x4e02))
|
32
|
+
assert_equal(0x4e01, NGram.normalize(0x4e03))
|
33
|
+
assert_equal(0x4e04, NGram.normalize(0x4e04))
|
34
|
+
assert_equal(0x4e05, NGram.normalize(0x4e05))
|
35
|
+
assert_equal(0x4e06, NGram.normalize(0x4e06))
|
36
|
+
assert_equal(0x4e07, NGram.normalize(0x4e07))
|
37
|
+
assert_equal(0x4e08, NGram.normalize(0x4e08))
|
38
|
+
assert_equal(0x4e09, NGram.normalize(0x4e09))
|
39
|
+
assert_equal(0x4e10, NGram.normalize(0x4e10))
|
40
|
+
assert_equal(0x4e11, NGram.normalize(0x4e11))
|
41
|
+
assert_equal(0x4e12, NGram.normalize(0x4e12))
|
42
|
+
assert_equal(0x4e13, NGram.normalize(0x4e13))
|
43
|
+
assert_equal(0x4e14, NGram.normalize(0x4e14))
|
44
|
+
assert_equal(0x4e15, NGram.normalize(0x4e15))
|
45
|
+
assert_equal(0x4e1e, NGram.normalize(0x4e1e))
|
46
|
+
assert_equal(0x4e1f, NGram.normalize(0x4e1f))
|
47
|
+
assert_equal(0x4e20, NGram.normalize(0x4e20))
|
48
|
+
assert_equal(0x4e21, NGram.normalize(0x4e21))
|
49
|
+
assert_equal(0x4e22, NGram.normalize(0x4e22))
|
50
|
+
assert_equal(0x4e23, NGram.normalize(0x4e23))
|
51
|
+
assert_equal(0x4e13, NGram.normalize(0x4e24))
|
52
|
+
assert_equal(0x4e13, NGram.normalize(0x4e25))
|
53
|
+
assert_equal(0x4e30, NGram.normalize(0x4e30))
|
54
54
|
end
|
55
55
|
|
56
56
|
def test_ngram
|
@@ -58,46 +58,58 @@ class NGramTest < Test::Unit::TestCase
|
|
58
58
|
(0..4).each do |n|
|
59
59
|
assert_nil(ngram.get(n))
|
60
60
|
end
|
61
|
-
ngram.add_char(
|
61
|
+
ngram.add_char(0x20)
|
62
62
|
(1..3).each do |n|
|
63
63
|
assert_nil(ngram.get(n))
|
64
64
|
end
|
65
|
-
|
66
|
-
|
67
|
-
assert_equal(
|
65
|
+
|
66
|
+
ngram.add_char(0x0041)
|
67
|
+
assert_equal([0x0041], ngram.get(1))
|
68
|
+
assert_equal([0x0020, 0x0041], ngram.get(2))
|
68
69
|
assert_nil(ngram.get(3))
|
69
|
-
ngram.add_char("\x06\xcc")
|
70
|
-
assert_equal(UCS2String.new("\x06\x4a"), ngram.get(1))
|
71
|
-
assert_equal(UCS2String.new("\x00A\x06\x4a"), ngram.get(2))
|
72
|
-
assert_equal(UCS2String.new("\x00 \x00A\x06\x4a"), ngram.get(3))
|
73
|
-
ngram.add_char("\x1e\xa0")
|
74
|
-
assert_equal(UCS2String.new("\x1e\xc3"), ngram.get(1))
|
75
|
-
assert_equal(UCS2String.new("\x06\x4a\x1e\xc3"), ngram.get(2))
|
76
|
-
assert_equal(UCS2String.new("\x00A\x06\x4a\x1e\xc3"), ngram.get(3))
|
77
|
-
ngram.add_char("\x30\x44")
|
78
|
-
assert_equal(UCS2String.new("\x30\x42"), ngram.get(1))
|
79
|
-
assert_equal(UCS2String.new("\x1e\xc3\x30\x42"), ngram.get(2))
|
80
|
-
assert_equal(UCS2String.new("\x06\x4a\x1e\xc3\x30\x42"), ngram.get(3))
|
81
70
|
|
82
|
-
ngram.add_char(
|
83
|
-
assert_equal(
|
84
|
-
assert_equal(
|
85
|
-
assert_equal(
|
86
|
-
|
87
|
-
|
88
|
-
assert_equal(
|
89
|
-
assert_equal(
|
90
|
-
ngram.
|
91
|
-
|
92
|
-
|
93
|
-
assert_equal(
|
94
|
-
ngram.
|
71
|
+
ngram.add_char(0x06cc)
|
72
|
+
assert_equal([0x064a], ngram.get(1))
|
73
|
+
assert_equal([0x0041, 0x64a], ngram.get(2))
|
74
|
+
assert_equal([0x0020, 0x0041, 0x064a], ngram.get(3))
|
75
|
+
|
76
|
+
ngram.add_char(0x1ea0)
|
77
|
+
assert_equal([0x1ec3], ngram.get(1))
|
78
|
+
assert_equal([0x064a, 0x1ec3], ngram.get(2))
|
79
|
+
assert_equal([0x0041, 0x064a, 0x1ec3], ngram.get(3))
|
80
|
+
|
81
|
+
ngram.add_char(0x3044)
|
82
|
+
assert_equal([0x3042], ngram.get(1))
|
83
|
+
assert_equal([0x1ec3, 0x3042], ngram.get(2))
|
84
|
+
assert_equal([0x064a, 0x1ec3, 0x3042], ngram.get(3))
|
85
|
+
|
86
|
+
ngram.add_char(0x30a4)
|
87
|
+
assert_equal([0x30a2], ngram.get(1))
|
88
|
+
assert_equal([0x3042, 0x30a2], ngram.get(2))
|
89
|
+
assert_equal([0x1ec3, 0x3042, 0x30a2], ngram.get(3))
|
90
|
+
|
91
|
+
ngram.add_char(0x3106)
|
92
|
+
assert_equal([0x3105], ngram.get(1))
|
93
|
+
assert_equal([0x30a2, 0x3105], ngram.get(2))
|
94
|
+
assert_equal([0x3042, 0x30a2, 0x3105], ngram.get(3))
|
95
|
+
|
96
|
+
ngram.add_char(0xac01)
|
97
|
+
assert_equal([0xac00], ngram.get(1))
|
98
|
+
assert_equal([0x3105, 0xac00], ngram.get(2))
|
99
|
+
assert_equal([0x30a2, 0x3105, 0xac00], ngram.get(3))
|
100
|
+
|
101
|
+
ngram.add_char(0x2010)
|
95
102
|
assert_nil(ngram.get(1))
|
96
|
-
assert_equal(
|
97
|
-
assert_equal(
|
98
|
-
|
99
|
-
|
100
|
-
assert_equal(
|
103
|
+
assert_equal([0xac00, 0x0020], ngram.get(2))
|
104
|
+
assert_equal([0x3105, 0xac00, 0x0020], ngram.get(3))
|
105
|
+
|
106
|
+
ngram.add_char(0x0041)
|
107
|
+
assert_equal([0x0041], ngram.get(1))
|
108
|
+
assert_equal([0x0020, 0x0041], ngram.get(2))
|
101
109
|
assert_nil(ngram.get(3))
|
102
110
|
end
|
111
|
+
|
112
|
+
def array_of_codepoints
|
113
|
+
array_of_codepoints.pack('n*')
|
114
|
+
end
|
103
115
|
end
|