RubyGems - langusta - Versions diffs - 0.1.1 → 0.2.0 - Mend

langusta 0.1.1 → 0.2.0

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (37)

data/.travis.yml +7 -0
data/Gemfile +10 -7
data/Gemfile.lock +12 -16
data/{README.rdoc → README.md} +27 -10
data/Rakefile +3 -10
data/VERSION +1 -1
data/langusta.gemspec +23 -47
data/lib/langusta.rb +36 -10
data/lib/langusta/codepoints.rb +19 -0
data/lib/langusta/command.rb +3 -3
data/lib/langusta/detector.rb +16 -13
data/lib/langusta/detector_factory.rb +11 -5
data/lib/langusta/guard.rb +22 -0
data/lib/langusta/inspector.rb +7 -0
data/lib/langusta/java_property_reader.rb +2 -3
data/lib/langusta/lang_profile.rb +12 -18
data/lib/langusta/language_detection_facade.rb +2 -2
data/lib/langusta/n_gram.rb +20 -25
data/lib/langusta/regex_helper.rb +15 -10
data/lib/langusta/tag_extractor.rb +5 -5
data/lib/langusta/unicode_block.rb +34 -34
data/test/helper.rb +12 -3
data/test/quality/test_falsified.rb +3 -3
data/test/test_command.rb +1 -0
data/test/test_detector.rb +18 -17
data/test/test_detector_factory.rb +17 -5
data/test/test_java_property_reader.rb +2 -1
data/test/test_lang_profile.rb +37 -31
data/test/test_language.rb +1 -0
data/test/test_language_detection_facade.rb +1 -1
data/test/test_langusta.rb +6 -6
data/test/test_n_gram.rb +87 -75
data/test/test_tag_extractor.rb +19 -18
data/test/test_unicode_block.rb +2 -1
metadata +54 -156
data/lib/langusta/ucs2_string.rb +0 -70
data/test/test_ucs2_string.rb +0 -9

data/test/quality/test_falsified.rb CHANGED Viewed

@@ -8,8 +8,8 @@ class FalsifiedTest < Test::Unit::TestCase
     profiles = Dir[File.join(PROFILES_PATH, '*')].map do |filename|
       LangProfile.load_from_file(filename)
     end
-    profiles.each_with_index do |profile, index|
-      factory.add_profile(profile, index, profiles.length)
+    profiles.each do |profile|
+      factory.add_profile(profile)
     end
     incorrect_guesses = 0.0
@@ -18,7 +18,7 @@ class FalsifiedTest < Test::Unit::TestCase
       Dir['test/test_data/*'].each do |filename|
         language = filename.split(/\//).last
-        ucs2_content = UCS2String.from_utf8(File.open(filename).read)
+        ucs2_content = Langusta.utf82cp(File.open(filename).read)
         detector = factory.create
         detector.append(ucs2_content)

data/test/test_command.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 require 'test/helper'
 class CommandTest < Test::Unit::TestCase

data/test/test_detector.rb CHANGED Viewed

@@ -1,58 +1,59 @@
+# -*- coding: utf-8 -*-
 require 'test/helper'
 class DetectorTest < Test::Unit::TestCase
-  TRAINING_EN = "\x00a \x00a \x00a \x00b \x00b \x00c \x00c \x00d \x00e"
-  TRAINING_FR = "\x00a \x00b \x00b \x00c \x00c \x00c \x00d \x00d \x00d"
-  TRAINING_JP = "\x30\x42 \x30\x42 \x30\x42 \x30\x44 \x30\x46 \x30\x48 \x30\x48"
+  TRAINING_EN = [0x0061, 0x0061, 0x0061, 0x0062, 0x0062, 0x0063, 0x0063, 0x0064, 0x0065]
+  TRAINING_FR = [0x0061, 0x0062, 0x0062, 0x0063, 0x0063, 0x0063, 0x0063, 0x0064, 0x0064]
+  TRAINING_JP = [0x3042, 0x3042, 0x3042, 0x3044, 0x3046, 0x3048, 0x3048]
   def setup
     @factory = DetectorFactory.new
     profile_en = LangProfile.new("en")
-    TRAINING_EN.split(/ /).each do |w|
-      profile_en.add(UCS2String.new(w))
+    TRAINING_EN.each do |w|
+      profile_en.add([w])
     end
-    @factory.add_profile(profile_en, 0, 3)
+    @factory.add_profile(profile_en)
     profile_fr = LangProfile.new("fr")
-    TRAINING_FR.split(/ /).each do |w|
-      profile_fr.add(UCS2String.new(w))
+    TRAINING_FR.each do |w|
+      profile_fr.add([w])
     end
-    @factory.add_profile(profile_fr, 1, 3)
+    @factory.add_profile(profile_fr)
     profile_jp = LangProfile.new("jp")
-    TRAINING_JP.split(/ /).each do |w|
-      profile_jp.add(UCS2String.new(w))
+    TRAINING_JP.each do |w|
+      profile_jp.add([w])
     end
-    @factory.add_profile(profile_jp, 2, 3)
+    @factory.add_profile(profile_jp)
   end
   def test_detector1
     detector = @factory.create()
-    detector.append(UCS2String.new("\x00a"))
+    detector.append([0x0061]) # "a"
     assert_equal("en", detector.detect())
   end
   def test_detector2
     detector = @factory.create()
-    detector.append(UCS2String.new("\x00b\x00\x20\x00d"))
+    detector.append([0x0062, 0x0020, 0x0064]) # "b d"
     assert_equal("fr", detector.detect())
   end
   def test_detector3
     detector = @factory.create()
-    detector.append(UCS2String.new("\x00d\x00 \x00e"))
+    detector.append([0x0064, 0x0020, 0x0065]) # "d e"
     assert_equal("en", detector.detect())
   end
   def test_detector4
     detector = @factory.create()
-    detector.append(UCS2String.new("\x30\x42\x30\x42\x30\x42\x30\x42\x00a"))
+    detector.append([0x3042, 0x3042, 0x3042, 0x3042, 0x0061])
     assert_equal("jp", detector.detect())
   end
   def test_exceptions
     detector = @factory.create()
-    detector.append(UCS2String.new(''))
+    detector.append([])
     assert_raises(NoFeaturesInTextError) do
       detector.detect()
     end

data/test/test_detector_factory.rb CHANGED Viewed

@@ -1,28 +1,40 @@
+# -*- coding: utf-8 -*-
 require 'test/helper'
 class DetectorFactoryTest < Test::Unit::TestCase
   def test_add_profile
-    profile = LangProfile.new
+    profile = LangProfile.new('sample')
     factory = DetectorFactory.new
-    factory.add_profile(profile, 0, 1)
+    factory.add_profile(profile)
     detector = factory.create(0.123)
     assert_equal(0.123, detector.alpha)
   end
   def test_exceptions
-    profile = LangProfile.new
+    profile = LangProfile.new('sample')
     factory = DetectorFactory.new
     assert_raises(NoProfilesLoadedError) do
       factory.create()
     end
-    factory.add_profile(profile, 0, 2)
+    factory.add_profile(profile)
     assert_raises(DuplicateProfilesError) do
-      factory.add_profile(profile, 1, 2)
+      factory.add_profile(profile)
     end
   end
+  def test_inspect
+    profile = LangProfile.new('sample')
+    factory = DetectorFactory.new
+    factory.add_profile(profile)
+    assert_match(Regexp.new(factory.object_ptr), factory.inspect)
+    assert_match(/1 profile\(s\)/, factory.inspect)
+    assert_match(Regexp.new(factory.class.name), factory.inspect)
+  end
 end

data/test/test_java_property_reader.rb CHANGED Viewed

@@ -1,8 +1,9 @@
+# -*- coding: utf-8 -*-
 require 'test/helper'
 class JavaPropertyReaderTest < Test::Unit::TestCase
   def test_parse
     jpr = JavaPropertyReader.new(MESSAGES_PROPERTIES)
-    assert_equal("\x4f\x7c\x69\x34", jpr["NGram.KANJI_1_0"])
+    assert_equal([0x4f7c, 0x6934], jpr["NGram.KANJI_1_0"])
   end
 end

data/test/test_lang_profile.rb CHANGED Viewed

@@ -3,8 +3,12 @@ require 'test/helper'
 class LangProfileTest < Test::Unit::TestCase
   def test_lang_profile
-    profile = LangProfile.new
-    assert_nil(profile.name)
+    assert_raises(ArgumentError) do
+      LangProfile.new
+    end
+    assert_raises(TypeError) do
+      LangProfile.new(nil)
+    end
   end
   def test_lang_profile_string_int
@@ -14,52 +18,54 @@ class LangProfileTest < Test::Unit::TestCase
   def test_add
     profile = LangProfile.new('en')
-    profile.add(UCS2String.from_utf8("a"))
-    assert_equal(1, profile.freq[UCS2String.from_utf8("a")])
-    profile.add(UCS2String.from_utf8("a"))
-    assert_equal(2, profile.freq[UCS2String.from_utf8("a")])
+    profile.add(utf82cp("a"))
+    assert_equal(1, profile.freq[utf82cp("a")])
+    profile.add(utf82cp("a"))
+    assert_equal(2, profile.freq[utf82cp("a")])
     profile.omit_less_freq()
   end
   def test_add_illegally_1
-    profile = LangProfile.new
-    profile.add(UCS2String.from_utf8("a"))
-    assert_nil(profile.freq[UCS2String.from_utf8("a")])
+    profile = LangProfile.new('sample')
+    profile.add(utf82cp("a"))
+    assert_equal(1, profile.freq[utf82cp("a")])
   end
   def test_add_illegally_2
     profile = LangProfile.new('en')
-    profile.add(UCS2String.from_utf8("a"))
-    profile.add(UCS2String.from_utf8(""))
-    profile.add(UCS2String.from_utf8("abcd"))
-    assert_equal(1, profile.freq[UCS2String.from_utf8("a")])
-    assert_nil(profile.freq[UCS2String.from_utf8("")])
-    assert_nil(profile.freq[UCS2String.from_utf8("abcd")])
+    profile.add(utf82cp("a"))
+    profile.add(utf82cp(""))
+    profile.add(utf82cp("abcd"))
+    assert_equal(1, profile.freq[utf82cp("a")])
+    assert_nil(profile.freq[utf82cp("")])
+    assert_nil(profile.freq[utf82cp("abcd")])
   end
   def test_omit_less_freq
     profile = LangProfile.new('en')
-    grams = "\x00a \x00b \x00c \x30\x42 \x30\x44 \x30\x46 \x30\x48 \x30\x4a \x30\x4b \x30\x4c \x30\x4d \x30\x4e \x30\x4f".split(/ /)
+    grams = [0x0061, 0x0062, 0x0063, 0x3042, 0x3044, 0x3046, 0x3048,
+             0x304a, 0x304b, 0x304c, 0x304d, 0x304e, 0x304f]
     5.times do
       grams.each do |gram|
-        profile.add(UCS2String.new(gram))
+        profile.add([gram])
       end
     end
-    profile.add(UCS2String.new("\x30\x50"))
+    profile.add([0x3050])
-    assert_equal(5, profile.freq[UCS2String.from_utf8("a")])
-    assert_equal(5, profile.freq[UCS2String.new("\x30\x42")])
-    assert_equal(1, profile.freq[UCS2String.new("\x30\x50")])
+    assert_equal(5, profile.freq[utf82cp("a")])
+    assert_equal(5, profile.freq[[0x3042]])
+    assert_equal(1, profile.freq[[0x3050]])
     profile.omit_less_freq()
-    assert_nil(profile.freq[UCS2String.from_utf8("a")])
-    assert_equal(5, profile.freq[UCS2String.new("\x30\x42")])
-    assert_nil(profile.freq[UCS2String.new("\x30\x50")])
+    assert_nil(profile.freq[utf82cp("a")])
+    assert_equal(5, profile.freq[[0x3042]])
+    assert_nil(profile.freq[[0x3050]])
   end
   def test_omit_less_freq_illegally
-    profile = LangProfile.new
-    profile.omit_less_freq()
+    profile = LangProfile.new('sample')
+    assert_nil(profile.omit_less_freq())
   end
   def test_load_from_file
@@ -67,11 +73,11 @@ class LangProfileTest < Test::Unit::TestCase
       profile = LangProfile.load_from_file(filename)
       assert_equal(filename.split(/\//).last, profile.name)
       has_content = [
-       profile.freq[UCS2String.from_utf8(" A")], # Latin
-       profile.freq[UCS2String.new("\x06\x0c")], # Arabic
-       profile.freq[UCS2String.new("\x0a\x85")], # Gujarati
-       profile.freq[UCS2String.new("\x09\x05")], # Hindi
-       profile.freq[UCS2String.new("\x30\x01")], # Japanese
+       profile.freq[utf82cp(" A")], # Latin
+       profile.freq[[0x060c]], # Arabic
+       profile.freq[[0x0a85]], # Gujarati
+       profile.freq[[0x0905]], # Hindi
+       profile.freq[[0x3001]], # Japanese
       ].any?
       assert(has_content, profile.inspect)
     end

data/test/test_language.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 require 'test/helper'
 class LanguageTest < Test::Unit::TestCase

data/test/test_language_detection_facade.rb CHANGED Viewed

@@ -4,6 +4,6 @@ require 'test/helper'
 class LanguageDetectionFacadeTest < Test::Unit::TestCase
   def test_initialize_and_detect
     facade = LanguageDetectionFacade.new
-    assert_equal("pl", facade.detect(UCS2String.from_utf8("Ich dalekopis fałszuje, gdy próby XQV nie wytrzymuje")))
+    assert_equal("pl", facade.detect(Langusta.utf82cp("Ich dalekopis fałszuje, gdy próby XQV nie wytrzymuje")))
   end
 end

data/test/test_langusta.rb CHANGED Viewed

@@ -1,13 +1,13 @@
+# -*- coding: utf-8 -*-
 require 'test/helper'
 class LangustaTest < Test::Unit::TestCase
   FACTORY = DetectorFactory.new
-  profiles = Dir[File.join(PROFILES_PATH, '*')].map do |filename|
-    LangProfile.load_from_file(filename)
-  end
-  profiles.each_with_index do |profile, index|
-    FACTORY.add_profile(profile, index, profiles.length)
+  Dir[File.join(PROFILES_PATH, '*')].each do |filename|
+    profile = LangProfile.load_from_file(filename)
+    FACTORY.add_profile(profile)
   end
   Dir['test/test_data/*'].each do |filename|
@@ -15,7 +15,7 @@ class LangustaTest < Test::Unit::TestCase
     define_method(("test_%s_language" % [language]).to_sym) do
       detector = FACTORY.create
-      ucs2_content = UCS2String.from_utf8(File.open(filename).read)
+      ucs2_content = Langusta.utf82cp(File.open(filename).read)
       detector = FACTORY.create
       detector.append(ucs2_content)

data/test/test_n_gram.rb CHANGED Viewed

@@ -7,50 +7,50 @@ class NGramTest < Test::Unit::TestCase
   end
   def test_normalize_with_latin
-    assert_equal("\x00 ", NGram.normalize("\x00\x00")) # \0
-    assert_equal("\x00 ", NGram.normalize("\x00\x09")) # <control>
-    assert_equal("\x00 ", NGram.normalize("\x00\x20")) # space
-    assert_equal("\x00 ", NGram.normalize("\x00\x30")) # 0
-    assert_equal("\x00 ", NGram.normalize("\x00\x40")) # @
-    assert_equal("\x00\x41", NGram.normalize("\x00\x41")) # A
-    assert_equal("\x00\x5a", NGram.normalize("\x00\x5a")) # Z
-    assert_equal("\x00 ", NGram.normalize("\x00\x5b")) # [
-    assert_equal("\x00 ", NGram.normalize("\x00\x60")) # `
-    assert_equal("\x00\x61", NGram.normalize("\x00\x61")) # a
-    assert_equal("\x00\x7a", NGram.normalize("\x00\x7a")) # z
-    assert_equal("\x00 ", NGram.normalize("\x00\x7b")) # {
-    assert_equal("\x00 ", NGram.normalize("\x00\x7f")) # <control>
-    assert_equal("\x00\x80", NGram.normalize("\x00\x80")) # <control>
-    assert_equal("\x00 ", NGram.normalize("\x00\xa0")) # <control>
-    assert_equal("\x00\xa1", NGram.normalize("\x00\xa1")) # <control>
+    assert_equal(0x20, NGram.normalize(0x00)) # \0
+    assert_equal(0x20, NGram.normalize(0x09)) # <control>
+    assert_equal(0x20, NGram.normalize(0x20)) # space
+    assert_equal(0x20, NGram.normalize(0x30)) # 0
+    assert_equal(0x20, NGram.normalize(0x40)) # @
+    assert_equal(0x41, NGram.normalize(0x41)) # A
+    assert_equal(0x5a, NGram.normalize(0x5a)) # Z
+    assert_equal(0x20, NGram.normalize(0x5b)) # [
+    assert_equal(0x20, NGram.normalize(0x60)) # `
+    assert_equal(0x61, NGram.normalize(0x61)) # a
+    assert_equal(0x7a, NGram.normalize(0x7a)) # z
+    assert_equal(0x20, NGram.normalize(0x7b)) # {
+    assert_equal(0x20, NGram.normalize(0x7f)) # <control>
+    assert_equal(0x80, NGram.normalize(0x80)) # <control>
+    assert_equal(0x20, NGram.normalize(0xa0)) # <control>
+    assert_equal(0xa1, NGram.normalize(0xa1)) # <control>
   end
   def test_normalize_with_cjk_kanji
-    assert_equal("\x4e\x00", NGram.normalize("\x4e\x00"))
-    assert_equal("\x4e\x01", NGram.normalize("\x4e\x01"))
-    assert_equal("\x4e\x02", NGram.normalize("\x4e\x02"))
-    assert_equal("\x4e\x01", NGram.normalize("\x4e\x03"))
-    assert_equal("\x4e\x04", NGram.normalize("\x4e\x04"))
-    assert_equal("\x4e\x05", NGram.normalize("\x4e\x05"))
-    assert_equal("\x4e\x06", NGram.normalize("\x4e\x06"))
-    assert_equal("\x4e\x07", NGram.normalize("\x4e\x07"))
-    assert_equal("\x4e\x08", NGram.normalize("\x4e\x08"))
-    assert_equal("\x4e\x09", NGram.normalize("\x4e\x09"))
-    assert_equal("\x4e\x10", NGram.normalize("\x4e\x10"))
-    assert_equal("\x4e\x11", NGram.normalize("\x4e\x11"))
-    assert_equal("\x4e\x12", NGram.normalize("\x4e\x12"))
-    assert_equal("\x4e\x13", NGram.normalize("\x4e\x13"))
-    assert_equal("\x4e\x14", NGram.normalize("\x4e\x14"))
-    assert_equal("\x4e\x15", NGram.normalize("\x4e\x15"))
-    assert_equal("\x4e\x1e", NGram.normalize("\x4e\x1e"))
-    assert_equal("\x4e\x1f", NGram.normalize("\x4e\x1f"))
-    assert_equal("\x4e\x20", NGram.normalize("\x4e\x20"))
-    assert_equal("\x4e\x21", NGram.normalize("\x4e\x21"))
-    assert_equal("\x4e\x22", NGram.normalize("\x4e\x22"))
-    assert_equal("\x4e\x23", NGram.normalize("\x4e\x23"))
-    assert_equal("\x4e\x13", NGram.normalize("\x4e\x24"))
-    assert_equal("\x4e\x13", NGram.normalize("\x4e\x25"))
-    assert_equal("\x4e\x30", NGram.normalize("\x4e\x30"))
+    assert_equal(0x4e00, NGram.normalize(0x4e00))
+    assert_equal(0x4e01, NGram.normalize(0x4e01))
+    assert_equal(0x4e02, NGram.normalize(0x4e02))
+    assert_equal(0x4e01, NGram.normalize(0x4e03))
+    assert_equal(0x4e04, NGram.normalize(0x4e04))
+    assert_equal(0x4e05, NGram.normalize(0x4e05))
+    assert_equal(0x4e06, NGram.normalize(0x4e06))
+    assert_equal(0x4e07, NGram.normalize(0x4e07))
+    assert_equal(0x4e08, NGram.normalize(0x4e08))
+    assert_equal(0x4e09, NGram.normalize(0x4e09))
+    assert_equal(0x4e10, NGram.normalize(0x4e10))
+    assert_equal(0x4e11, NGram.normalize(0x4e11))
+    assert_equal(0x4e12, NGram.normalize(0x4e12))
+    assert_equal(0x4e13, NGram.normalize(0x4e13))
+    assert_equal(0x4e14, NGram.normalize(0x4e14))
+    assert_equal(0x4e15, NGram.normalize(0x4e15))
+    assert_equal(0x4e1e, NGram.normalize(0x4e1e))
+    assert_equal(0x4e1f, NGram.normalize(0x4e1f))
+    assert_equal(0x4e20, NGram.normalize(0x4e20))
+    assert_equal(0x4e21, NGram.normalize(0x4e21))
+    assert_equal(0x4e22, NGram.normalize(0x4e22))
+    assert_equal(0x4e23, NGram.normalize(0x4e23))
+    assert_equal(0x4e13, NGram.normalize(0x4e24))
+    assert_equal(0x4e13, NGram.normalize(0x4e25))
+    assert_equal(0x4e30, NGram.normalize(0x4e30))
   end
   def test_ngram
@@ -58,46 +58,58 @@ class NGramTest < Test::Unit::TestCase
     (0..4).each do |n|
       assert_nil(ngram.get(n))
     end
-    ngram.add_char("\x00 ")
+    ngram.add_char(0x20)
     (1..3).each do |n|
       assert_nil(ngram.get(n))
     end
-    ngram.add_char("\x00A")
-    assert_equal(UCS2String.new("\x00A"), ngram.get(1))
-    assert_equal(UCS2String.new("\x00 \x00A"), ngram.get(2))
+    ngram.add_char(0x0041)
+    assert_equal([0x0041], ngram.get(1))
+    assert_equal([0x0020, 0x0041], ngram.get(2))
     assert_nil(ngram.get(3))
-    ngram.add_char("\x06\xcc")
-    assert_equal(UCS2String.new("\x06\x4a"), ngram.get(1))
-    assert_equal(UCS2String.new("\x00A\x06\x4a"), ngram.get(2))
-    assert_equal(UCS2String.new("\x00 \x00A\x06\x4a"), ngram.get(3))
-    ngram.add_char("\x1e\xa0")
-    assert_equal(UCS2String.new("\x1e\xc3"), ngram.get(1))
-    assert_equal(UCS2String.new("\x06\x4a\x1e\xc3"), ngram.get(2))
-    assert_equal(UCS2String.new("\x00A\x06\x4a\x1e\xc3"), ngram.get(3))
-    ngram.add_char("\x30\x44")
-    assert_equal(UCS2String.new("\x30\x42"), ngram.get(1))
-    assert_equal(UCS2String.new("\x1e\xc3\x30\x42"), ngram.get(2))
-    assert_equal(UCS2String.new("\x06\x4a\x1e\xc3\x30\x42"), ngram.get(3))
-    ngram.add_char("\x30\xa4")
-    assert_equal(UCS2String.new("\x30\xa2"), ngram.get(1))
-    assert_equal(UCS2String.new("\x30\x42\x30\xa2"), ngram.get(2))
-    assert_equal(UCS2String.new("\x1e\xc3\x30\x42\x30\xa2"), ngram.get(3))
-    ngram.add_char("\x31\x06")
-    assert_equal(UCS2String.new("\x31\x05"), ngram.get(1))
-    assert_equal(UCS2String.new("\x30\xa2\x31\x05"), ngram.get(2))
-    assert_equal(UCS2String.new("\x30\x42\x30\xa2\x31\x05"), ngram.get(3))
-    ngram.add_char("\xac\x01")
-    assert_equal(UCS2String.new("\xac\x00"), ngram.get(1))
-    assert_equal(UCS2String.new("\x31\x05\xac\x00"), ngram.get(2))
-    assert_equal(UCS2String.new("\x30\xa2\x31\x05\xac\x00"), ngram.get(3))
-    ngram.add_char("\x20\x10")
+    ngram.add_char(0x06cc)
+    assert_equal([0x064a], ngram.get(1))
+    assert_equal([0x0041, 0x64a], ngram.get(2))
+    assert_equal([0x0020, 0x0041, 0x064a], ngram.get(3))
+    ngram.add_char(0x1ea0)
+    assert_equal([0x1ec3], ngram.get(1))
+    assert_equal([0x064a, 0x1ec3], ngram.get(2))
+    assert_equal([0x0041, 0x064a, 0x1ec3], ngram.get(3))
+    ngram.add_char(0x3044)
+    assert_equal([0x3042], ngram.get(1))
+    assert_equal([0x1ec3, 0x3042], ngram.get(2))
+    assert_equal([0x064a, 0x1ec3, 0x3042], ngram.get(3))
+    ngram.add_char(0x30a4)
+    assert_equal([0x30a2], ngram.get(1))
+    assert_equal([0x3042, 0x30a2], ngram.get(2))
+    assert_equal([0x1ec3, 0x3042, 0x30a2], ngram.get(3))
+    ngram.add_char(0x3106)
+    assert_equal([0x3105], ngram.get(1))
+    assert_equal([0x30a2, 0x3105], ngram.get(2))
+    assert_equal([0x3042, 0x30a2, 0x3105], ngram.get(3))
+    ngram.add_char(0xac01)
+    assert_equal([0xac00], ngram.get(1))
+    assert_equal([0x3105, 0xac00], ngram.get(2))
+    assert_equal([0x30a2, 0x3105, 0xac00], ngram.get(3))
+    ngram.add_char(0x2010)
     assert_nil(ngram.get(1))
-    assert_equal(UCS2String.new("\xac\x00\x00 "), ngram.get(2))
-    assert_equal(UCS2String.new("\x31\x05\xac\x00\x00 "), ngram.get(3))
-    ngram.add_char("\x00a")
-    assert_equal(UCS2String.new("\x00a"), ngram.get(1))
-    assert_equal(UCS2String.new("\x00 \x00a"), ngram.get(2))
+    assert_equal([0xac00, 0x0020], ngram.get(2))
+    assert_equal([0x3105, 0xac00, 0x0020], ngram.get(3))
+    ngram.add_char(0x0041)
+    assert_equal([0x0041], ngram.get(1))
+    assert_equal([0x0020, 0x0041], ngram.get(2))
     assert_nil(ngram.get(3))
   end
+  def array_of_codepoints
+    array_of_codepoints.pack('n*')
+  end
 end