RubyGems - langue-japanese - Versions diffs - 0.0.2 - Mend

langue-japanese 0.0.2

Files changed (38) hide show

data/.gitignore +17 -0
data/Gemfile +6 -0
data/LICENSE +22 -0
data/README.md +53 -0
data/Rakefile +2 -0
data/langue-japanese.gemspec +22 -0
data/lib/langue/japanese/language.rb +36 -0
data/lib/langue/japanese/logging.rb +21 -0
data/lib/langue/japanese/parser.rb +77 -0
data/lib/langue/japanese/shaper.rb +70 -0
data/lib/langue/japanese/structurer.rb +74 -0
data/lib/langue/japanese/version.rb +5 -0
data/lib/langue/japanese/words/adjective.rb +67 -0
data/lib/langue/japanese/words/adjective_noun.rb +76 -0
data/lib/langue/japanese/words/attribute.rb +100 -0
data/lib/langue/japanese/words/classifier.rb +107 -0
data/lib/langue/japanese/words/morpheme_filter.rb +26 -0
data/lib/langue/japanese/words/noun.rb +61 -0
data/lib/langue/japanese/words/period.rb +55 -0
data/lib/langue/japanese/words/prefix.rb +19 -0
data/lib/langue/japanese/words/pronoun.rb +16 -0
data/lib/langue/japanese/words/verb.rb +100 -0
data/lib/langue/japanese.rb +2 -0
data/lib/langue-japanese.rb +1 -0
data/spec/langue/japanese/data.yaml +169 -0
data/spec/langue/japanese/language_spec.rb +120 -0
data/spec/langue/japanese/parser_spec.rb +147 -0
data/spec/langue/japanese/shaper_spec.rb +34 -0
data/spec/langue/japanese/structurer_spec.rb +116 -0
data/spec/langue/japanese/words/adjective_noun_spec.rb +76 -0
data/spec/langue/japanese/words/adjective_spec.rb +123 -0
data/spec/langue/japanese/words/noun_spec.rb +79 -0
data/spec/langue/japanese/words/period_spec.rb +69 -0
data/spec/langue/japanese/words/pronoun_spec.rb +24 -0
data/spec/langue/japanese/words/verb_spec.rb +242 -0
data/spec/langue/japanese_spec.rb +7 -0
data/spec/spec_helper.rb +75 -0
metadata +131 -0

data/spec/langue/japanese/parser_spec.rb ADDED Viewed

@@ -0,0 +1,147 @@
+require 'spec_helper'
+require 'langue/japanese/parser'
+describe Langue::Japanese::Parser, '#initialize' do
+  it 'sets an empty hash to mecab_options attribute' do
+    parser = described_class.new
+    mecab_options = parser.mecab_options
+    mecab_options.should be_a(Hash)
+    mecab_options.should be_empty
+  end
+  it 'sets an instance of Langue::Japanese::Logging::NullLogger to @logger' do
+    parser = described_class.new
+    logger = parser.instance_eval { @logger }
+    logger.should be_a(Langue::Japanese::Logging::NullLogger)
+  end
+  it 'sets an empty hash to @taggers' do
+    parser = described_class.new
+    taggers = parser.instance_eval { @taggers }
+    taggers.should be_a(Hash)
+    taggers.should be_empty
+  end
+  context 'with mecab_options option' do
+    it 'sets the value of mecab_options option to mecab_options attribute' do
+      parser = described_class.new(:mecab_options => {:key => 'value'})
+      parser.mecab_options.should == {:key => 'value'}
+    end
+  end
+  context 'with logger option' do
+    it 'sets the value of logger option to @logger' do
+      parser = described_class.new(:logger => 'logger')
+      logger = parser.instance_eval { @logger }
+      logger.should == 'logger'
+    end
+  end
+end
+describe Langue::Japanese::Parser, '#parse' do
+  before do
+    tagger_stub(3)
+    @morphemes = described_class.new.parse('text')
+  end
+  it 'returns an instance of Langue::Morphemes' do
+    @morphemes.should be_a(Langue::Morphemes)
+  end
+  it 'returns an array with the number of morpheme' do
+    @morphemes.should have(3).items
+  end
+  it 'returns an array containing the contents of the morpheme' do
+    @morphemes[0].text.should == '1'
+    @morphemes[1].text.should == '2'
+    @morphemes[2].text.should == '3'
+  end
+end
+describe Langue::Japanese::Parser, '#tagger' do
+  before do
+    @parser = described_class.new
+  end
+  it 'calls MeCab::Tagger.new with mecab_options attribute' do
+    MeCab::Tagger.should_receive(:new).with('mecab_options')
+    @parser.stub!(:mecab_options_as_string).and_return('mecab_options')
+    @parser.send(:tagger)
+  end
+  it 'returns an instance of MeCab::Tagger' do
+    tagger = tagger_stub
+    @parser.send(:tagger).should == tagger
+  end
+end
+describe Langue::Japanese::Parser, '#mecab_options_as_string' do
+  it 'returns an empty string if it does not give options' do
+    parser = described_class.new
+    mecab_options_as_string = parser.send(:mecab_options_as_string)
+    mecab_options_as_string.should be_a(String)
+    mecab_options_as_string.should be_empty
+  end
+  context 'with sysdic option' do
+    it 'returns a string included d option' do
+      parser = described_class.new(:mecab_options => {:sysdic => 'sysdic'})
+      parser.send(:mecab_options_as_string).should == '-d sysdic'
+    end
+  end
+  context 'with userdic option' do
+    it 'returns a string included u option' do
+      parser = described_class.new(:mecab_options => {:userdic => 'userdic'})
+      parser.send(:mecab_options_as_string).should == '-u userdic'
+    end
+  end
+  context 'with an unsupported option' do
+    it 'logs that an option is unsupported' do
+      parser = described_class.new(:mecab_options => {:unsupported => 'value'})
+      parser.instance_eval { @logger }.should_receive(:post).with('langue.japanese.parser', {
+        :level   => 'warn',
+        :message => "'unsupported' option is unsupported",
+        :key     => :unsupported
+      })
+      parser.send(:mecab_options_as_string)
+    end
+  end
+end
+describe Langue::Japanese::Parser, '#create_morpheme' do
+  before do
+    @parser = described_class.new
+  end
+  it 'returns an expected morpheme' do
+    surface = 'surface'
+    feature = 'part_of_speech,category1,category2,category3,inflection,inflection_type,root_form,yomi,pronunciation'
+    morpheme = @parser.send(:create_morpheme, surface, feature)
+    morpheme.text.should == 'surface'
+    morpheme.part_of_speech.should == 'part_of_speech'
+    morpheme.categories.should == %w(category1 category2 category3)
+    morpheme.inflection.should == 'inflection'
+    morpheme.inflection_type.should == 'inflection_type'
+    morpheme.root_form.should == 'root_form'
+    morpheme.yomi.should == 'yomi'
+    morpheme.pronunciation.should == 'pronunciation'
+  end
+  it 'replaces to nil from the asterisk' do
+    surface = 'surface'
+    feature = '*,*,*,*,*,*,*,*,*'
+    morpheme = @parser.send(:create_morpheme, surface, feature)
+    morpheme.part_of_speech.should be_nil
+    morpheme.categories.should be_empty
+    morpheme.inflection.should be_nil
+    morpheme.inflection_type.should be_nil
+    morpheme.root_form.should be_nil
+    morpheme.yomi.should be_nil
+    morpheme.pronunciation.should be_nil
+  end
+end

data/spec/langue/japanese/shaper_spec.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+require 'spec_helper'
+require 'langue/japanese/shaper'
+describe Langue::Japanese::Shaper, '#initialize' do
+  it 'sets an instance of Langue::Japanese::Logging::NullLogger to @logger' do
+    parser = described_class.new
+    logger = parser.instance_eval { @logger }
+    logger.should be_a(Langue::Japanese::Logging::NullLogger)
+  end
+  context 'with logger option' do
+    it 'sets the value of logger option to @logger' do
+      parser = described_class.new(:logger => 'logger')
+      logger = parser.instance_eval { @logger }
+      logger.should == 'logger'
+    end
+  end
+end
+describe Langue::Japanese::Shaper, '#shape_person_name' do
+  it 'forms the morphemes to a person name' do
+    shaper = described_class.new
+    {
+      'あたしの名前は天道あかねよ' => '天道あかね',
+      'オレの名前は早乙女乱馬だ'   => '早乙女乱馬'
+    }.each do |text, name|
+      morphemes = parser.parse(text)
+      morpheme = shaper.shape_person_name(morphemes, name).find { |m| m.classified?(*%w(名詞 固有名詞 人名)) }
+      morpheme.text.should == name
+    end
+  end
+end

data/spec/langue/japanese/structurer_spec.rb ADDED Viewed

@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+require 'spec_helper'
+require 'langue/japanese/structurer'
+require 'yaml'
+describe Langue::Japanese::Structurer, '::WORD_CLASSES' do
+  before do
+    @word_classes = Langue::Japanese::Structurer::WORD_CLASSES
+  end
+  it 'has the word classes' do
+    @word_classes.should == [
+      Langue::Japanese::Period,
+      Langue::Japanese::Verb,
+      Langue::Japanese::Adjective,
+      Langue::Japanese::AdjectiveNoun,
+      Langue::Japanese::Pronoun,
+      Langue::Japanese::Noun
+    ]
+  end
+  it 'has take method in all the word classes' do
+    @word_classes.each do |word_class|
+      word_class.should be_respond_to(:take)
+    end
+  end
+end
+describe Langue::Japanese::Structurer, '#initialize' do
+  it 'sets an instance of Langue::Japanese::Logging::NullLogger to @logger' do
+    structurer = described_class.new
+    logger = structurer.instance_eval { @logger }
+    logger.should be_a(Langue::Japanese::Logging::NullLogger)
+  end
+  context 'with logger option' do
+    it 'sets the value of logger option to @logger' do
+      structurer = described_class.new(:logger => 'logger')
+      logger = structurer.instance_eval { @logger }
+      logger.should == 'logger'
+    end
+  end
+end
+describe Langue::Japanese::Structurer, '#structure' do
+  before :all do
+    @parser = parser
+    @morphemes = @parser.parse('今日は妹と一緒にお買い物してきたよ。楽しかった〜')
+    @word_classes = Langue::Japanese::Structurer::WORD_CLASSES
+  end
+  before do
+    @structurer = described_class.new
+  end
+  it 'returns an instance of Langue::Text' do
+    text = @structurer.structure(@morphemes)
+    text.should be_a Langue::Text
+  end
+  it 'returns valid text' do
+    text = @structurer.structure(@morphemes)
+    text.should be_valid
+  end
+  it 'returns sentences in the text' do
+    text = @structurer.structure(@morphemes)
+    text.should have(2).items
+  end
+  it 'returns words in the sentences' do
+    text = @structurer.structure(@morphemes)
+    text[0].should have(9).items
+    text[1].should have(2).items
+  end
+  YAML.load_file(File.join(File.dirname(__FILE__), 'data.yaml')).each do |data|
+    input = data['text']
+    sentences = data['sentences']
+    it "extracts expected words from #{input.size < 10 ? input : input[0..7] + '...'}" do
+      morphemes = @parser.parse(input)
+      text = @structurer.structure(morphemes)
+      text.should have(sentences.size).items
+      text.each_with_index do |sentence, index|
+        sentence = sentence.select { |word| !word.instance_of?(Langue::Word) }
+        words = sentences[index]
+        sentence.should have(words.size).items
+        words.zip(sentence).each do |pair|
+          pair[1].text.should == pair[0][0]
+          pair[1].class.name.split('::').last.should == pair[0][1]
+          next unless pair[0][2]
+          pair[0][2].each do |name, value|
+            if name.downcase == 'attributes'
+              value.each do |attribute|
+                pair[1].__send__("#{attribute}?").should be_true
+              end
+            else
+              got = pair[1].__send__(name)
+              if TrueClass === value
+                got.should be_true
+              elsif FalseClass === value
+                got.should be_false
+              else
+                got.should == value
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/spec/langue/japanese/words/adjective_noun_spec.rb ADDED Viewed

@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+require 'spec_helper'
+require 'langue/japanese/words/adjective_noun'
+describe Langue::Japanese::AdjectiveNoun, '.take' do
+  after do
+    @pairs.each do |text, size|
+      morphemes = parser.parse(text)
+      described_class.take(morphemes, 0).should == size
+    end
+  end
+  it 'takes an adjective noun' do
+    @pairs = {
+      '大丈夫だ'       => 1,
+      '健康だ'         => 1,
+      'かっこいいこと' => 0,
+      '会話だ'         => 0,
+      '話すこと'       => 0
+    }
+  end
+  it 'takes an adjective noun with prefix' do
+    @pairs = {
+      '超大丈夫だ'       => 2,
+      '反健康だ'         => 2,
+      '超それだ'         => 0,
+      '超若干だ'         => 0,
+      '超可愛いこと'     => 0,
+      '反かっこいいこと' => 0
+    }
+  end
+  it 'takes an adjective noun with suffix' do
+    @pairs = {
+      '病気がちだ' => 2,
+      '犬好きだ'   => 2,
+      '犬だ'       => 0,
+      'それがちだ' => 0,
+      '若干がちだ' => 0
+    }
+  end
+  it 'takes a successive adjective noun' do
+    @pairs = {
+      '健康大丈夫だ'         => 2,
+      '健康大丈夫がちだ'     => 2,
+      '健康大丈夫ラーメンだ' => 0
+    }
+  end
+  it 'takes a complex adjective' do
+    @pairs = {
+      '超病気がちだ'           => 3,
+      '超漆黒病気がちだ'       => 4,
+      '反超健康大丈夫だ'       => 4,
+      '反超健康大丈夫がちだ'   => 4,
+      '超犬だ'                 => 0,
+      '超健康大丈夫ラーメンだ' => 0,
+      '精神的疾患だ'           => 0
+    }
+  end
+end
+describe Langue::Japanese::AdjectiveNoun, '#prefix' do
+  it 'returns the prefix' do
+    adjective_noun('反超病気がち').prefix.should == '反超'
+  end
+end
+describe Langue::Japanese::AdjectiveNoun, '#body' do
+  it 'returns the text with the prefix' do
+    adjective_noun('反超病気がち').body.should == '病気がち'
+  end
+end

data/spec/langue/japanese/words/adjective_spec.rb ADDED Viewed

@@ -0,0 +1,123 @@
+# -*- coding: utf-8 -*-
+require 'spec_helper'
+require 'langue/japanese/words/adjective'
+describe Langue::Japanese::Adjective, '.take' do
+  after do
+    @pairs.each do |text, size|
+      morphemes = parser.parse(text)
+      described_class.take(morphemes, 0).should == size
+    end
+  end
+  it 'takes an adjective' do
+    @pairs = {
+      '可愛いこと'     => 1,
+      'かっこいいこと' => 1,
+      '会話だ'         => 0,
+      '話すこと'       => 0
+    }
+  end
+  it 'takes an adjective with prefix' do
+    @pairs = {
+      'くそ可愛いこと'       => 2,
+      'くそくそかっこいいこと' => 3,
+      'くそ会話だ'             => 0
+    }
+  end
+  it 'takes an adjective with suffix' do
+    @pairs = {
+      '可愛いっぽいこと' => 2
+    }
+  end
+  it 'takes a successive adjective' do
+    @pairs = {
+      '可愛がたいこと' => 2
+    }
+  end
+  it 'takes a negative adjective' do
+    @pairs = {
+      '可愛くないこと'     => 2,
+      'かっこよくないこと' => 2
+    }
+  end
+  it 'takes a perfective adjective' do
+    @pairs = {
+      '可愛かったこと'     => 2,
+      'かっこよかったこと' => 2
+    }
+  end
+  it 'takes a complex adjective' do
+    @pairs = {
+      'くそ可愛がたくなかったこと' => 5,
+      'クソかっこよくないこと'     => 3,
+      '美しくなかったこと'         => 3,
+      '厳しいっぽくなかったこと'   => 4
+    }
+  end
+  it 'takes an adjective by other' do
+    @pairs = {
+      '可愛いでしょう' => 3
+    }
+  end
+end
+describe Langue::Japanese::Adjective, '#key_morpheme' do
+  it 'returns the categorematic adjective or the noncategorematic adjective' do
+    {
+      '可愛い'     => 0,
+      '可愛っぽい' => 0,
+      '可愛くない' => 0,
+      '可愛がたい' => 1
+    }.each do |text, index|
+      word = adjective(text)
+      word.key_morpheme.should == word[index]
+    end
+  end
+  context 'with an empty word' do
+    it 'returns nil' do
+      word = described_class.new
+      word.key_morpheme.should be_nil
+    end
+  end
+end
+describe Langue::Japanese::Adjective, '#prefix' do
+  it 'returns the prefix' do
+    adjective('くそくそ可愛っぽくない').prefix.should == 'くそくそ'
+  end
+end
+describe Langue::Japanese::Adjective, '#body' do
+  it 'returns the text with the prefix' do
+    adjective('くそくそ可愛っぽくない').body.should == '可愛い'
+  end
+end
+describe Langue::Japanese::Adjective, '#negative?' do
+  it 'returns true if it is negative' do
+    adjective('可愛くない').should be_negative
+  end
+  it 'returns false if it is not negative' do
+    adjective('可愛い').should_not be_negative
+  end
+end
+describe Langue::Japanese::Adjective, '#perfective?' do
+  it 'returns true if it is perfective' do
+    adjective('可愛かった').should be_perfective
+  end
+  it 'returns false if it is not perfective' do
+    adjective('可愛い').should_not be_perfective
+  end
+end

data/spec/langue/japanese/words/noun_spec.rb ADDED Viewed

@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+require 'spec_helper'
+require 'langue/japanese/words/noun'
+describe Langue::Japanese::Noun, '.take' do
+  after do
+    @pairs.each do |text, size|
+      morphemes = parser.parse(text)
+      described_class.take(morphemes, 0).should == size
+    end
+  end
+  it 'takes a noun' do
+    @pairs = {
+      '会話だ'         => 1,
+      'かっこいいこと' => 0,
+      '話すこと'       => 0,
+      '大丈夫だ'       => 0,
+      '健康だ'         => 0
+    }
+  end
+  it 'takes a noun with prefix' do
+    @pairs = {
+      '超会話だ'         => 2,
+      '超反会話だ'       => 3,
+      '超大丈夫だ'       => 0,
+      '反健康だ'         => 0,
+      '超それだ'         => 0,
+      '超若干だ'         => 0,
+      '超可愛いこと'     => 0,
+      '反かっこいいこと' => 0
+    }
+  end
+  it 'takes a successive noun' do
+    @pairs = {
+      '緊急連絡網だ'         => 3,
+      '健康大丈夫ラーメンだ' => 3,
+      '精神的疾患だ'         => 3,
+      '緊急大丈夫だ'         => 0
+    }
+  end
+  it 'takes an adverbable noun' do
+    @pairs = {
+      '一挙だ'             => 1,
+      '一挙ラーメン永年だ' => 1,
+      '一挙永年ラーメンだ' => 2,
+      'ラーメン永年だ'     => 1
+    }
+  end
+  it 'does not take noun conjunct to suru-verb' do
+    @pairs = {
+      '連絡する'       => 0,
+      '緊急連絡する'   => 0,
+      '緊急連絡網する' => 3
+    }
+  end
+  it 'does not take noun if starts with special char' do
+    @pairs = {
+      'ぁ犬だ' => 0,
+      'ァ犬だ' => 0,
+      'ぃ犬だ' => 0,
+      'ィ犬だ' => 0,
+      'ぅ犬だ' => 0,
+      'ゥ犬だ' => 0,
+      'ぇ犬だ' => 0,
+      'ェ犬だ' => 0,
+      'ぉ犬だ' => 0,
+      'ォ犬だ' => 0,
+      'っ犬だ' => 0,
+      'ッ犬だ' => 0,
+      'ー犬だ' => 0
+    }
+  end
+end

data/spec/langue/japanese/words/period_spec.rb ADDED Viewed

@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+require 'spec_helper'
+require 'langue/japanese/words/period'
+describe Langue::Japanese::Period, '.take' do
+  after do
+    @pairs.each do |text, size|
+      morphemes = parser.parse(text)
+      described_class.take(morphemes, 0).should == size
+    end
+  end
+  it 'takes a period' do
+    @pairs = {
+      '。さて'       => 1,
+      '…… さて'    => 2,
+      '‥・ さて'    => 2,
+      '、、、 さて'  => 3,
+      '。。。 さて'  => 3,
+      '. さて'       => 1,
+      '... さて'     => 3,
+      '．さて'       => 1,
+      '．．さて'     => 2,
+      '.。．さて'    => 3,
+      '、。さて'     => 2,
+      '! さて'       => 1,
+      '！ さて'      => 1,
+      '!! さて'      => 2,
+      '！！ さて'    => 2,
+      '!！ さて'     => 2,
+      '? さて'       => 1,
+      '？ さて'      => 1,
+      '?? さて'      => 2,
+      '？？ さて'    => 2,
+      '!? さて'      => 2,
+      '！？ さて'    => 2,
+      '!?! さて'     => 3,
+      '！?!？ さて'  => 4,
+      '!??!!?! さて' => 7,
+      '、さて'       => 0,
+      '，さて'       => 0,
+      ', さて'       => 0
+    }
+  end
+end
+describe Langue::Japanese::Period, '#exclamation?' do
+  it 'returns true if include exclamation mark' do
+    period('!').should  be_exclamation
+    period('！').should be_exclamation
+    period('?!').should be_exclamation
+  end
+  it 'returns false if do not include exclamation marks' do
+    period('?').should_not be_exclamation
+  end
+end
+describe Langue::Japanese::Period, '#question?' do
+  it 'returns true if include question mark' do
+    period('?').should  be_question
+    period('？').should be_question
+    period('!?').should be_question
+  end
+  it 'returns false if do not include question marks' do
+    period('!').should_not be_question
+  end
+end

data/spec/langue/japanese/words/pronoun_spec.rb ADDED Viewed

@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+require 'spec_helper'
+require 'langue/japanese/words/pronoun'
+describe Langue::Japanese::Pronoun, '.take' do
+  after do
+    @pairs.each do |text, size|
+      morphemes = parser.parse(text)
+      described_class.take(morphemes, 0).should == size
+    end
+  end
+  it 'takes a pronoun' do
+    @pairs = {
+      'それだ'         => 1,
+      '僕だ'           => 1,
+      'それ僕だ'       => 1,
+      '会話それだ'     => 0,
+      'かっこいいこと' => 0,
+      '会話だ'         => 0,
+      '話すこと'       => 0
+    }
+  end
+end