RubyGems - langue-japanese - Versions diffs - 0.0.2 - Mend

langue-japanese 0.0.2

Files changed (38) hide show

data/.gitignore +17 -0
data/Gemfile +6 -0
data/LICENSE +22 -0
data/README.md +53 -0
data/Rakefile +2 -0
data/langue-japanese.gemspec +22 -0
data/lib/langue/japanese/language.rb +36 -0
data/lib/langue/japanese/logging.rb +21 -0
data/lib/langue/japanese/parser.rb +77 -0
data/lib/langue/japanese/shaper.rb +70 -0
data/lib/langue/japanese/structurer.rb +74 -0
data/lib/langue/japanese/version.rb +5 -0
data/lib/langue/japanese/words/adjective.rb +67 -0
data/lib/langue/japanese/words/adjective_noun.rb +76 -0
data/lib/langue/japanese/words/attribute.rb +100 -0
data/lib/langue/japanese/words/classifier.rb +107 -0
data/lib/langue/japanese/words/morpheme_filter.rb +26 -0
data/lib/langue/japanese/words/noun.rb +61 -0
data/lib/langue/japanese/words/period.rb +55 -0
data/lib/langue/japanese/words/prefix.rb +19 -0
data/lib/langue/japanese/words/pronoun.rb +16 -0
data/lib/langue/japanese/words/verb.rb +100 -0
data/lib/langue/japanese.rb +2 -0
data/lib/langue-japanese.rb +1 -0
data/spec/langue/japanese/data.yaml +169 -0
data/spec/langue/japanese/language_spec.rb +120 -0
data/spec/langue/japanese/parser_spec.rb +147 -0
data/spec/langue/japanese/shaper_spec.rb +34 -0
data/spec/langue/japanese/structurer_spec.rb +116 -0
data/spec/langue/japanese/words/adjective_noun_spec.rb +76 -0
data/spec/langue/japanese/words/adjective_spec.rb +123 -0
data/spec/langue/japanese/words/noun_spec.rb +79 -0
data/spec/langue/japanese/words/period_spec.rb +69 -0
data/spec/langue/japanese/words/pronoun_spec.rb +24 -0
data/spec/langue/japanese/words/verb_spec.rb +242 -0
data/spec/langue/japanese_spec.rb +7 -0
data/spec/spec_helper.rb +75 -0
metadata +131 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/Gemfile ADDED Viewed

@@ -0,0 +1,6 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in langue-japanese.gemspec
+gemspec
+gem 'mecab-ruby', :git => 'git://github.com/takkkun/mecab-ruby.git'

data/LICENSE ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2012 Takahiro Kondo
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,53 @@
+What is langue-japanese
+=======================
+It provides operations for working with Japanese text.
+Installation
+------------
+Add these lines to your application's Gemfile:
+    gem 'langue'
+    gem 'langue-japanese'
+    # When doing morphological analysis
+    gem 'mecab-ruby', :git => 'path to mecab-ruby repository'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install langue
+    $ gem install langue-japanese
+The langue-japanese gem runs on top of the langue gem, so it depends on it.
+It also uses MeCab for morphological analysis, so it additionally depends on
+the mecab-ruby gem if you use that feature.
+Usage
+-----
+    # coding: utf-8
+    require 'langue-japanese'
+    # Get a language class
+    language = Langue['japanese'].new
+    # Split a text into morphemes
+    morphemes = language.parse('今日は妹と一緒にお買い物してきたよ。楽しかった〜')
+    # Create a structured text from the morphemes
+    text = language.structure(morphemes)
+Contributing
+------------
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Added some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

data/Rakefile ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ #!/usr/bin/env rake
2	+ require "bundler/gem_tasks"

data/langue-japanese.gemspec ADDED Viewed

@@ -0,0 +1,22 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path('../lib/langue/japanese/version', __FILE__)
+Gem::Specification.new do |gem|
+  gem.authors       = ["Takahiro Kondo"]
+  gem.email         = ["kondo@atedesign.net"]
+  gem.description   = %q{It provides the operations to Japanese.}
+  gem.summary       = %q{The foundation for Japanese}
+  gem.homepage      = ""
+  gem.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  gem.files         = `git ls-files`.split("\n")
+  gem.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  gem.name          = "langue-japanese"
+  gem.require_paths = ["lib"]
+  gem.version       = Langue::Japanese::VERSION
+  gem.add_runtime_dependency 'langue'
+  gem.add_runtime_dependency 'activesupport'
+  gem.add_development_dependency 'rspec'
+end

data/lib/langue/japanese/language.rb ADDED Viewed

@@ -0,0 +1,36 @@
+require 'langue'
+module Langue
+  module Japanese
+    class Language < Langue::Language
+      def parser
+        @parser ||= Parser.new(@options)
+      end
+      depend_to :parser, 'langue/japanese/parser'
+      def shaper
+        @shaper ||= Shaper.new(@options)
+      end
+      depend_to :shaper, 'langue/japanese/shaper'
+      def structurer
+        @structurer ||= Structurer.new(@options)
+      end
+      depend_to :structurer, 'langue/japanese/structurer'
+      def parse(text)
+        parser.parse(text)
+      end
+      def shape_person_name(morphemes, person_name)
+        shaper.shape_person_name(morphemes, person_name)
+      end
+      def structure(morphemes)
+        structurer.structure(morphemes)
+      end
+    end
+  end
+  support(Japanese::Language)
+end

data/lib/langue/japanese/logging.rb ADDED Viewed

@@ -0,0 +1,21 @@
+module Langue
+  module Japanese
+    module Logging
+      def null_logger
+        return NullLogger.new unless Object.const_defined?(:Fluent)
+        return NullLogger.new unless Fluent.const_defined?(:Logger)
+        Fluent::Logger::NullLogger.open
+      end
+      class NullLogger
+        def post(tag, map)
+          post_with_time(tag, map, nil)
+        end
+        def post_with_time(tag, map, time)
+          false
+        end
+      end
+    end
+  end
+end

data/lib/langue/japanese/parser.rb ADDED Viewed

@@ -0,0 +1,77 @@
+require 'MeCab'
+require 'langue/morpheme'
+require 'langue/morphemes'
+require 'langue/japanese/logging'
+module Langue
+  module Japanese
+    class Parser
+      include Logging
+      def initialize(options = {})
+        @mecab_options = options[:mecab_options] || {}
+        @logger = options[:logger] || null_logger
+        @taggers = {}
+      end
+      attr_accessor :mecab_options
+      def parse(text)
+        morphemes = Morphemes.new
+        node = tagger.parseToNode(text)
+        while node
+          surface = node.surface.force_encoding('utf-8')
+          unless surface.empty?
+            feature = node.feature.force_encoding('utf-8')
+            morphemes << create_morpheme(surface, feature)
+          end
+          node = node.next
+        end
+        morphemes
+      end
+      private
+      def tagger
+        @taggers[Thread.current] ||= MeCab::Tagger.new(mecab_options_as_string)
+      end
+      def mecab_options_as_string
+        options = @mecab_options.inject([]) do |o, pair|
+          key = pair[0].to_sym
+          value = pair[1]
+          case key
+          when :sysdic
+            o << '-d' << value
+          when :userdic
+            o << '-u' << value
+          else
+            map = {
+              :level   => 'warn',
+              :message => "'#{key}' option is unsupported",
+              :key     => key
+            }
+            @logger.post('langue.japanese.parser', map)
+            o
+          end
+        end
+        options.join(' ')
+      end
+      def create_morpheme(surface, feature)
+        values = feature.split(',').map { |v| v == '*' ? nil : v }
+        values[1..3] = [values[1..3].take_while {|value| !value.nil?}]
+        values.unshift(surface.downcase)
+        Morpheme.new(Hash[Morpheme::KEYS.zip(values)])
+      end
+    end
+  end
+end

data/lib/langue/japanese/shaper.rb ADDED Viewed

@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+require 'langue/morpheme'
+require 'langue/morphemes'
+require 'langue/japanese/logging'
+module Langue
+  module Japanese
+    class Shaper
+      include Logging
+      def initialize(options = {})
+        @logger = options[:logger] || null_logger
+      end
+      def shape_person_name(morphemes, person_name)
+        new_morphemes = Morphemes.new
+        name_morphemes = []
+        start_index = 0
+        person_name_size = person_name.size
+        morphemes.each do |morpheme|
+          text = morpheme.text
+          index = person_name.index(text, start_index)
+          if index == start_index
+            name_morphemes << morpheme
+            start_index += text.size
+            if start_index == person_name_size
+              new_morphemes << join_as_person_name(name_morphemes)
+              name_morphemes.clear
+              start_index = 0
+            end
+          else
+            new_morphemes += name_morphemes + [morpheme]
+            name_morphemes.clear
+            start_index = 0
+          end
+        end
+        new_morphemes
+      end
+      private
+      def join_as_person_name(morphemes)
+        text = morphemes.map(&:text).join
+        yomi = morphemes.inject('') do |yomi, morpheme|
+          t = morpheme.text
+          y = morpheme.yomi
+          yomi + (y || t != 'ー' ? y : t)
+        end
+        pronunciation = morphemes.inject('') do |pronunciation, morpheme|
+          pronunciation + (morpheme.pronunciation)
+        end
+        Morpheme.new(
+          :text           => text,
+          :part_of_speech => '名詞',
+          :categories     => %w(固有名詞 人名),
+          :root_form      => text,
+          :yomi           => yomi,
+          :pronunciation  => pronunciation
+        )
+      end
+    end
+  end
+end

data/lib/langue/japanese/structurer.rb ADDED Viewed

@@ -0,0 +1,74 @@
+require 'active_support/core_ext/string/inflections'
+require 'langue/text'
+require 'langue/sentence'
+require 'langue/word'
+require 'langue/japanese/logging'
+module Langue
+  module Japanese
+    class Structurer
+      include Logging
+      WORD_CLASSES = %w(
+        period
+        verb
+        adjective
+        adjective_noun
+        pronoun
+        noun
+      ).map do |word_name|
+        require "langue/japanese/words/#{word_name}"
+        Langue::Japanese.const_get(word_name.camelize)
+      end
+      def initialize(options = {})
+        @logger = options[:logger] || null_logger
+      end
+      def structure(morphemes)
+        sentences = []
+        words = []
+        arrived = false
+        index = 0
+        length = morphemes.length
+        while index < length
+          word_class = nil
+          size = 0
+          WORD_CLASSES.each do |wc|
+            s = wc.take(morphemes, index)
+            if s > 0
+              word_class = wc
+              size = s
+              break
+            end
+          end
+          if word_class.nil?
+            word_class = Word
+            size = 1
+          end
+          word = word_class.new(morphemes[index, size])
+          if arrived && !word.instance_of?(Period)
+            sentences << Sentence.new(words)
+            words.clear
+            arrived = false
+          elsif word.instance_of?(Period)
+            arrived = true
+          end
+          words << word
+          index += size
+        end
+        sentences << Sentence.new(words) unless words.empty?
+        Text.new(sentences)
+      end
+    end
+  end
+end

data/lib/langue/japanese/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module Langue
+  module Japanese
+    VERSION = '0.0.2' # Gem release version; langue-japanese.gemspec reads this constant.
+  end
+end

data/lib/langue/japanese/words/adjective.rb ADDED Viewed

@@ -0,0 +1,67 @@
+require 'langue/word'
+require 'langue/japanese/words/prefix'
+require 'langue/japanese/words/attribute'
+require 'langue/japanese/words/classifier'
+module Langue
+  module Japanese
+    class Adjective < Word
+      include Prefix
+      include Attribute
+      has :negative, :perfective
+      class << self
+        include Classifier
+        def take(morphemes, index)
+          if first_adjective?(morphemes, index)
+            take_adjective(morphemes, index)
+          elsif adjective_prefix?(morphemes, index)
+            take_adjective_with_prefix(morphemes, index)
+          else
+            0
+          end
+        end
+        def take_adjective(morphemes, index)
+          return 0 unless first_adjective?(morphemes, index)
+          size = 1
+          size += 1 while following_adjective?(morphemes, index + size) || conjunctive_particle?(morphemes, index + size) && following_adjective?(morphemes, index + size + 1)
+          size += 1 while auxiliary_verb?(morphemes, index + size)
+          size
+        end
+        def take_adjective_with_prefix(morphemes, index)
+          size = 0
+          size += 1 while adjective_prefix?(morphemes, index + size)
+          return 0 unless size > 0
+          next_size = take_adjective(morphemes, index + size)
+          next_size > 0 ? size + next_size : 0
+        end
+      end
+      def key_morpheme
+        unless instance_variable_defined?(:@key_morpheme)
+          @key_morpheme = if empty?
+                            nil
+                          else
+                            index = size - 1
+                            index -= 1 while !self.class.body_adjective?(morphemes, index)
+                            self[index]
+                          end
+        end
+        @key_morpheme
+      end
+      def prefix_morphemes
+        @prefix_morphemes ||= begin
+                                size = 0
+                                size += 1 while self.class.adjective_prefix?(morphemes, size)
+                                morphemes[0, size]
+                              end
+      end
+    end
+  end
+end

data/lib/langue/japanese/words/adjective_noun.rb ADDED Viewed

@@ -0,0 +1,76 @@
+require 'langue/japanese/words/noun'
+require 'langue/japanese/words/prefix'
+require 'langue/japanese/words/classifier'
+module Langue
+  module Japanese
+    class AdjectiveNoun < Noun
+      include Prefix
+      class << self
+        include Classifier
+        def take(morphemes, index)
+          if adjective_stem_noun?(morphemes, index)
+            take_adjective_stem_noun(morphemes, index)
+          elsif first_noun?(morphemes, index)
+            take_noun_with_suffix(morphemes, index)
+          elsif noun_prefix?(morphemes, index)
+            take_noun_with_prefix(morphemes, index)
+          else
+            0
+          end
+        end
+        def take_adjective_stem_noun(morphemes, index)
+          size = 0
+          size += 1 while adjective_stem_noun?(morphemes, index + size)
+          return 0 unless size > 0
+          if adjective_stem_suffix?(morphemes, index + size)
+            size
+          elsif following_noun?(morphemes, index + size)
+            0
+          else
+            size
+          end
+        end
+        def take_noun_with_suffix(morphemes, index)
+          return 0 unless first_noun?(morphemes, index)
+          size = 1
+          size += 1 while following_noun?(morphemes, index + size) && !adjective_stem_suffix?(morphemes, index + size)
+          return 0 unless adjective_stem_suffix?(morphemes, index + size)
+          size += 1 while adjective_stem_suffix?(morphemes, index + size)
+          if following_noun?(morphemes, index + size)
+            0
+          else
+            size
+          end
+        end
+        def take_noun_with_prefix(morphemes, index)
+          size = 0
+          size += 1 while noun_prefix?(morphemes, index + size)
+          return 0 unless size > 0
+          next_size = take(morphemes, index + size)
+          next_size > 0 ? size + next_size : 0
+        end
+      end
+      def prefix_morphemes
+        @prefix_morphemes ||= begin
+                                size = 0
+                                size += 1 while self.class.noun_prefix?(morphemes, size)
+                                morphemes[0, size]
+                              end
+      end
+      def body
+        @body = body_morphemes.empty? ? nil : body_morphemes.map(&:text).join unless instance_variable_defined?(:@body)
+        @body
+      end
+    end
+  end
+end

data/lib/langue/japanese/words/attribute.rb ADDED Viewed

@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+require 'langue/japanese/words/morpheme_filter'
+module Langue
+  module Japanese
+    module Attribute
+      def self.included(object)
+        object.class_eval do
+          include MorphemeFilter
+          filter { |word, morphemes| word.empty? ? morphemes : morphemes[0..morphemes.index(word.key_morpheme)] }
+          def self.has(*attrs)
+            attrs.each do |attr|
+              define_method("#{attr}?") do
+                @attrs ||= {}
+                @attrs[attr] = !!__send__("include_#{attr}?") unless @attrs.key?(attr)
+                @attrs[attr]
+              end
+            end
+          end
+        end
+      end
+      def body
+        unless instance_variable_defined?(:@body)
+          @body = if body_morphemes.empty?
+                    nil
+                  else
+                    morphemes = body_morphemes.dup
+                    last_morpheme = morphemes.pop
+                    morphemes.map(&:text).join + last_morpheme.root_form
+                  end
+        end
+        @body
+      end
+      if RUBY_VERSION.to_f < 1.9
+        def index(value = nil)
+          if value
+            super
+          else
+            each_with_index { |morpheme, index| return index if yield morpheme }
+            nil
+          end
+        end
+      end
+      private
+      def include_progressive?
+        if noncategorematic_verb_index(%w(てる でる とる どる))
+          true
+        elsif index = noncategorematic_verb_index(['いる'])
+          morphemes.at(index - 1) { |m| m.classified?('助詞', '接続助詞') && %w(て で).include?(m.root_form) }
+        end
+      end
+      def include_passive?
+        verb_suffix_index(%w(れる られる))
+      end
+      def include_aggressive?
+        auxiliary_verb_index('特殊・タイ')
+      end
+      def include_negative?
+        if auxiliary_verb_index('特殊・ナイ')
+          true
+        elsif index = auxiliary_verb_index('特殊・ヌ')
+          morphemes.at(index - 1) { |m| m.inflection_type == '未然形' }
+        end
+      end
+      def include_perfective?
+        if auxiliary_verb_index('特殊・タ')
+          true
+        elsif index = index { |m| m.classified?('助動詞') && m.root_form == 'ぬ' }
+          morphemes.at(index - 1) { |m| m.inflection_type == '連用形' }
+        end
+      end
+      def include_imperative?
+        self[-1].inflection_type =~ /^命令/
+      end
+      def noncategorematic_verb_index(root_forms)
+        index { |m| m.classified?('動詞', '非自立') && root_forms.include?(m.root_form) }
+      end
+      def auxiliary_verb_index(inflection)
+        index { |m| m.classified?('助動詞') && m.inflected?(inflection) }
+      end
+      def verb_suffix_index(root_forms)
+        index { |m| m.classified?('動詞', '接尾') && root_forms.include?(m.root_form) }
+      end
+    end
+  end
+end