langue-japanese 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +6 -0
  3. data/LICENSE +22 -0
  4. data/README.md +53 -0
  5. data/Rakefile +2 -0
  6. data/langue-japanese.gemspec +22 -0
  7. data/lib/langue/japanese/language.rb +36 -0
  8. data/lib/langue/japanese/logging.rb +21 -0
  9. data/lib/langue/japanese/parser.rb +77 -0
  10. data/lib/langue/japanese/shaper.rb +70 -0
  11. data/lib/langue/japanese/structurer.rb +74 -0
  12. data/lib/langue/japanese/version.rb +5 -0
  13. data/lib/langue/japanese/words/adjective.rb +67 -0
  14. data/lib/langue/japanese/words/adjective_noun.rb +76 -0
  15. data/lib/langue/japanese/words/attribute.rb +100 -0
  16. data/lib/langue/japanese/words/classifier.rb +107 -0
  17. data/lib/langue/japanese/words/morpheme_filter.rb +26 -0
  18. data/lib/langue/japanese/words/noun.rb +61 -0
  19. data/lib/langue/japanese/words/period.rb +55 -0
  20. data/lib/langue/japanese/words/prefix.rb +19 -0
  21. data/lib/langue/japanese/words/pronoun.rb +16 -0
  22. data/lib/langue/japanese/words/verb.rb +100 -0
  23. data/lib/langue/japanese.rb +2 -0
  24. data/lib/langue-japanese.rb +1 -0
  25. data/spec/langue/japanese/data.yaml +169 -0
  26. data/spec/langue/japanese/language_spec.rb +120 -0
  27. data/spec/langue/japanese/parser_spec.rb +147 -0
  28. data/spec/langue/japanese/shaper_spec.rb +34 -0
  29. data/spec/langue/japanese/structurer_spec.rb +116 -0
  30. data/spec/langue/japanese/words/adjective_noun_spec.rb +76 -0
  31. data/spec/langue/japanese/words/adjective_spec.rb +123 -0
  32. data/spec/langue/japanese/words/noun_spec.rb +79 -0
  33. data/spec/langue/japanese/words/period_spec.rb +69 -0
  34. data/spec/langue/japanese/words/pronoun_spec.rb +24 -0
  35. data/spec/langue/japanese/words/verb_spec.rb +242 -0
  36. data/spec/langue/japanese_spec.rb +7 -0
  37. data/spec/spec_helper.rb +75 -0
  38. metadata +131 -0
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
source 'https://rubygems.org'

# Specify your gem's dependencies in langue-japanese.gemspec
gemspec

# mecab-ruby is fetched straight from its git repository. The URL uses
# HTTPS because GitHub no longer serves the unauthenticated git:// protocol,
# so the previous git:// URL can no longer be cloned.
gem 'mecab-ruby', :git => 'https://github.com/takkkun/mecab-ruby.git'
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Takahiro Kondo
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,53 @@
1
+ What is langue-japanese
2
+ =======================
3
+
4
+ It provides operations for working with Japanese text.
5
+
6
+ Installation
7
+ ------------
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'langue'
12
+ gem 'langue-japanese'
13
+
14
+ # When doing morphological analysis
15
+ gem 'mecab-ruby', :git => 'path to mecab-ruby repository'
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install langue
24
+ $ gem install langue-japanese
25
+
26
+ The langue-japanese gem runs on top of the langue gem, so it depends on the langue gem.
27
+
28
+ It also uses MeCab for morphological analysis, so it additionally depends on
29
+ the mecab-ruby gem if you use that feature.
30
+
31
+ Usage
32
+ -----
33
+
34
+ # coding: utf-8
35
+ require 'langue-japanese'
36
+
37
+ # Get a language class
38
+ language = Langue['japanese'].new
39
+
40
+ # Split a text into morphemes
41
+ morphemes = language.parse('今日は妹と一緒にお買い物してきたよ。楽しかった〜')
42
+
43
+ # Create a structured text from the morphemes
44
+ text = language.structure(morphemes)
45
+
46
+ Contributing
47
+ ------------
48
+
49
+ 1. Fork it
50
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
51
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
52
+ 4. Push to the branch (`git push origin my-new-feature`)
53
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/langue/japanese/version', __FILE__)
3
+
4
# Gem specification for langue-japanese.
#
# The version is read from Langue::Japanese::VERSION and the file lists are
# produced by `git ls-files`, so this has to be evaluated from inside the
# git checkout of the gem.
Gem::Specification.new do |spec|
  spec.name          = "langue-japanese"
  spec.version       = Langue::Japanese::VERSION
  spec.authors       = ["Takahiro Kondo"]
  spec.email         = ["kondo@atedesign.net"]
  spec.summary       = %q{The foundation for Japanese}
  spec.description   = %q{It provides the operations to Japanese.}
  spec.homepage      = ""

  spec.require_paths = ["lib"]
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  # Executables are exposed by base name only (bin/foo -> foo).
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }

  spec.add_runtime_dependency 'langue'
  spec.add_runtime_dependency 'activesupport'

  spec.add_development_dependency 'rspec'
end
@@ -0,0 +1,36 @@
1
+ require 'langue'
2
+
3
module Langue
  module Japanese
    # Entry point for Japanese processing. Each public operation delegates
    # to a collaborator (parser, shaper, structurer) that is created on
    # first use from the options held in @options.
    class Language < Langue::Language
      # Parsing ------------------------------------------------------------

      # Memoized Parser instance.
      def parser
        @parser ||= Parser.new(@options)
      end
      # NOTE(review): depend_to comes from Langue::Language; presumably it
      # loads the given file before the accessor is first used - confirm.
      depend_to :parser, 'langue/japanese/parser'

      # Splits +text+ into morphemes by delegating to the parser.
      def parse(text)
        parser.parse(text)
      end

      # Shaping ------------------------------------------------------------

      # Memoized Shaper instance.
      def shaper
        @shaper ||= Shaper.new(@options)
      end
      depend_to :shaper, 'langue/japanese/shaper'

      # Delegates to Shaper#shape_person_name.
      def shape_person_name(morphemes, person_name)
        shaper.shape_person_name(morphemes, person_name)
      end

      # Structuring --------------------------------------------------------

      # Memoized Structurer instance.
      def structurer
        @structurer ||= Structurer.new(@options)
      end
      depend_to :structurer, 'langue/japanese/structurer'

      # Builds a structured text from +morphemes+ via the structurer.
      def structure(morphemes)
        structurer.structure(morphemes)
      end
    end
  end

  # Registers the Japanese language class with Langue.
  support(Japanese::Language)
end
@@ -0,0 +1,21 @@
1
module Langue
  module Japanese
    # Mixin that supplies a do-nothing logger for components that were not
    # given a :logger option.
    module Logging
      # Returns a logger that discards everything.
      #
      # When fluent-logger is loaded (Fluent::Logger is defined) its own
      # null logger is used; otherwise the local NullLogger is returned.
      #
      # @return [Object] an object responding to #post and #post_with_time
      def null_logger
        return NullLogger.new unless Object.const_defined?(:Fluent)
        # inherit = false: Fluent is a module, so a plain const_defined?
        # would also search Object and be fooled by the stdlib ::Logger,
        # after which Fluent::Logger::NullLogger raises NameError.
        return NullLogger.new unless Fluent.const_defined?(:Logger, false)
        Fluent::Logger::NullLogger.open
      end

      # Minimal stand-in exposing the two posting methods used by this gem.
      class NullLogger
        # Discards the record.
        #
        # @return [false]
        def post(tag, map)
          post_with_time(tag, map, nil)
        end

        # Discards the record; all arguments are ignored.
        #
        # @return [false]
        def post_with_time(tag, map, time)
          false
        end
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
+ require 'MeCab'
2
+
3
+ require 'langue/morpheme'
4
+ require 'langue/morphemes'
5
+ require 'langue/japanese/logging'
6
+
7
module Langue
  module Japanese
    # Splits text into morphemes using MeCab.
    class Parser
      include Logging

      attr_accessor :mecab_options

      # @param options [Hash] :mecab_options (hash of MeCab settings) and
      #   :logger (object responding to #post) are read; both are optional.
      def initialize(options = {})
        @mecab_options = options[:mecab_options] || {}
        @logger = options[:logger] || null_logger
        @taggers = {}
      end

      # Runs MeCab over +text+ and collects one morpheme per node whose
      # surface is not empty.
      #
      # @return [Morphemes]
      def parse(text)
        result = Morphemes.new
        current = tagger.parseToNode(text)

        while current
          surface = current.surface.force_encoding('utf-8')

          unless surface.empty?
            feature = current.feature.force_encoding('utf-8')
            result << create_morpheme(surface, feature)
          end

          current = current.next
        end

        result
      end

      private

      # One tagger per thread, created on first use in that thread.
      def tagger
        @taggers[Thread.current] ||= MeCab::Tagger.new(mecab_options_as_string)
      end

      # Translates @mecab_options into a MeCab argument string. Only
      # :sysdic (-d) and :userdic (-u) are understood; any other key is
      # reported to the logger and left out.
      def mecab_options_as_string
        arguments = []

        @mecab_options.each do |name, value|
          key = name.to_sym

          case key
          when :sysdic
            arguments << '-d' << value
          when :userdic
            arguments << '-u' << value
          else
            warning = {
              :level   => 'warn',
              :message => "'#{key}' option is unsupported",
              :key     => key
            }

            @logger.post('langue.japanese.parser', warning)
          end
        end

        arguments.join(' ')
      end

      # Builds a Morpheme from a surface string and a comma-separated
      # MeCab feature string.
      def create_morpheme(surface, feature)
        # '*' marks an absent field in the feature string.
        fields = feature.split(',').map { |field| field unless field == '*' }
        # Fields 1..3 are collapsed into one array that stops at the first
        # absent entry.
        fields[1..3] = [fields[1..3].take_while { |field| !field.nil? }]
        fields.unshift(surface.downcase)
        Morpheme.new(Hash[Morpheme::KEYS.zip(fields)])
      end
    end
  end
end
@@ -0,0 +1,70 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'langue/morpheme'
3
+ require 'langue/morphemes'
4
+ require 'langue/japanese/logging'
5
+
6
module Langue
  module Japanese
    # Post-processes parsed morphemes.
    class Shaper
      include Logging

      # @param options [Hash] only :logger is read; defaults to a null logger.
      def initialize(options = {})
        @logger = options[:logger] || null_logger
      end

      # Replaces every run of consecutive morphemes whose texts spell out
      # +person_name+ exactly with a single person-name morpheme.
      #
      # All other morphemes are kept in their original order, including a
      # run that only partially matches the name (also when the input ends
      # in the middle of such a run).
      #
      # @param morphemes [#each] morphemes responding to #text, #yomi and
      #   #pronunciation
      # @param person_name [String] the full name to look for
      # @return [Morphemes]
      def shape_person_name(morphemes, person_name)
        new_morphemes = Morphemes.new
        name_morphemes = []
        start_index = 0
        person_name_size = person_name.size

        morphemes.each do |morpheme|
          text = morpheme.text

          if person_name.index(text, start_index) == start_index
            # The morpheme continues the name at the expected offset.
            name_morphemes << morpheme
            start_index += text.size

            if start_index == person_name_size
              new_morphemes << join_as_person_name(name_morphemes)
              name_morphemes.clear
              start_index = 0
            end
          else
            # Not (or no longer) the name: give back what was collected.
            flush_unjoined(new_morphemes, name_morphemes)
            new_morphemes << morpheme
            start_index = 0
          end
        end

        # The input may end in the middle of a partial match; those
        # morphemes must not be lost.
        flush_unjoined(new_morphemes, name_morphemes)
        new_morphemes
      end

      private

      # Moves the pending partial-match morphemes into +target+ one by one.
      # Appending with << (instead of +=) keeps +target+ the same Morphemes
      # object; Array#+ would yield a plain Array if Morphemes is an Array
      # subclass.
      def flush_unjoined(target, pending)
        pending.each { |morpheme| target << morpheme }
        pending.clear
      end

      # Merges +morphemes+ into one proper-noun (person name) morpheme.
      # Readings are concatenated; a morpheme without a reading contributes
      # nothing, except that a bare long-vowel mark keeps its own text in
      # the yomi.
      def join_as_person_name(morphemes)
        text = morphemes.map(&:text).join

        yomi = morphemes.inject('') do |joined, morpheme|
          reading = morpheme.yomi
          reading = morpheme.text if reading.nil? && morpheme.text == 'ー'
          joined + reading.to_s
        end

        pronunciation = morphemes.inject('') do |joined, morpheme|
          joined + morpheme.pronunciation.to_s
        end

        Morpheme.new(
          :text           => text,
          :part_of_speech => '名詞',
          :categories     => %w(固有名詞 人名),
          :root_form      => text,
          :yomi           => yomi,
          :pronunciation  => pronunciation
        )
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
+ require 'active_support/core_ext/string/inflections'
2
+
3
+ require 'langue/text'
4
+ require 'langue/sentence'
5
+ require 'langue/word'
6
+ require 'langue/japanese/logging'
7
+
8
module Langue
  module Japanese
    # Groups morphemes into words and words into sentences.
    class Structurer
      include Logging

      # Word classes in the order they are tried; each file is required
      # here and the class is looked up under Langue::Japanese.
      WORD_CLASSES = %w(period verb adjective adjective_noun pronoun noun).map do |name|
        require "langue/japanese/words/#{name}"
        Langue::Japanese.const_get(name.camelize)
      end

      # @param options [Hash] only :logger is read; defaults to a null logger.
      def initialize(options = {})
        @logger = options[:logger] || null_logger
      end

      # Builds a Text from +morphemes+.
      #
      # At each position the first word class whose .take returns a
      # positive size consumes that many morphemes; if none does, a plain
      # Word consumes one. A sentence is closed when the first non-Period
      # word follows one or more Period words.
      #
      # @return [Text]
      def structure(morphemes)
        sentences = []
        pending = []
        after_period = false
        position = 0
        total = morphemes.length

        until position >= total
          taken = 0
          klass = WORD_CLASSES.find do |candidate|
            (taken = candidate.take(morphemes, position)) > 0
          end

          unless klass
            klass = Word
            taken = 1
          end

          word = klass.new(morphemes[position, taken])
          is_period = word.instance_of?(Period)

          if is_period
            after_period = true
          elsif after_period
            sentences << Sentence.new(pending)
            pending.clear
            after_period = false
          end

          pending << word
          position += taken
        end

        sentences << Sentence.new(pending) unless pending.empty?
        Text.new(sentences)
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module Langue
  module Japanese
    # Release version of the langue-japanese gem.
    VERSION = '0.0.2'
  end
end
@@ -0,0 +1,67 @@
1
+ require 'langue/word'
2
+ require 'langue/japanese/words/prefix'
3
+ require 'langue/japanese/words/attribute'
4
+ require 'langue/japanese/words/classifier'
5
+
6
module Langue
  module Japanese
    # Adjective word: optional prefixes, one or more chained adjectives and
    # trailing auxiliary verbs. The morpheme predicates used below come
    # from Classifier.
    class Adjective < Word
      include Prefix
      include Attribute

      has :negative, :perfective

      class << self
        include Classifier

        # Number of morphemes, starting at +index+, that form an adjective
        # word; 0 when no adjective starts there.
        def take(morphemes, index)
          return take_adjective(morphemes, index) if first_adjective?(morphemes, index)
          return take_adjective_with_prefix(morphemes, index) if adjective_prefix?(morphemes, index)
          0
        end

        # Size of an adjective that begins directly at +index+.
        def take_adjective(morphemes, index)
          return 0 unless first_adjective?(morphemes, index)
          count = 1

          # Extend over following adjectives; a conjunctive particle is
          # consumed only when an adjective follows it.
          loop do
            cursor = index + count
            chained = following_adjective?(morphemes, cursor) ||
                      (conjunctive_particle?(morphemes, cursor) &&
                       following_adjective?(morphemes, cursor + 1))
            break unless chained
            count += 1
          end

          count += 1 while auxiliary_verb?(morphemes, index + count)
          count
        end

        # Size of a run of adjective prefixes plus the adjective after it;
        # 0 when the prefixes are not followed by an adjective.
        def take_adjective_with_prefix(morphemes, index)
          prefix_count = 0
          prefix_count += 1 while adjective_prefix?(morphemes, index + prefix_count)
          return 0 unless prefix_count > 0

          body_count = take_adjective(morphemes, index + prefix_count)
          return 0 unless body_count > 0
          prefix_count + body_count
        end
      end

      # The last morpheme of the word for which body_adjective? holds, or
      # nil for an empty word. The result is cached, nil included.
      def key_morpheme
        return @key_morpheme if instance_variable_defined?(:@key_morpheme)

        @key_morpheme =
          if empty?
            nil
          else
            position = size - 1
            position -= 1 until self.class.body_adjective?(morphemes, position)
            self[position]
          end
      end

      # Leading morphemes that are adjective prefixes (cached).
      def prefix_morphemes
        return @prefix_morphemes if @prefix_morphemes

        count = 0
        count += 1 while self.class.adjective_prefix?(morphemes, count)
        @prefix_morphemes = morphemes[0, count]
      end
    end
  end
end
@@ -0,0 +1,76 @@
1
+ require 'langue/japanese/words/noun'
2
+ require 'langue/japanese/words/prefix'
3
+ require 'langue/japanese/words/classifier'
4
+
5
module Langue
  module Japanese
    # Noun that acts as an adjective stem, optionally with prefixes or an
    # adjective-stem suffix. The morpheme predicates used below come from
    # Classifier.
    class AdjectiveNoun < Noun
      include Prefix

      class << self
        include Classifier

        # Number of morphemes, starting at +index+, that form an
        # adjective noun; 0 when none starts there.
        def take(morphemes, index)
          return take_adjective_stem_noun(morphemes, index) if adjective_stem_noun?(morphemes, index)
          return take_noun_with_suffix(morphemes, index) if first_noun?(morphemes, index)
          return take_noun_with_prefix(morphemes, index) if noun_prefix?(morphemes, index)
          0
        end

        # Size of a run of adjective-stem nouns. The run is rejected (0)
        # when it is followed by a noun that is not an adjective-stem
        # suffix.
        def take_adjective_stem_noun(morphemes, index)
          count = 0
          count += 1 while adjective_stem_noun?(morphemes, index + count)
          return 0 unless count > 0

          return count if adjective_stem_suffix?(morphemes, index + count)
          following_noun?(morphemes, index + count) ? 0 : count
        end

        # Size of a noun run that ends in one or more adjective-stem
        # suffixes; 0 when no such suffix is present or another noun
        # follows the suffixes.
        def take_noun_with_suffix(morphemes, index)
          return 0 unless first_noun?(morphemes, index)
          count = 1

          # Advance over following nouns, stopping at the first suffix.
          until !following_noun?(morphemes, index + count) ||
                adjective_stem_suffix?(morphemes, index + count)
            count += 1
          end

          return 0 unless adjective_stem_suffix?(morphemes, index + count)
          count += 1 while adjective_stem_suffix?(morphemes, index + count)

          following_noun?(morphemes, index + count) ? 0 : count
        end

        # Size of a run of noun prefixes plus whatever #take accepts after
        # it; 0 when nothing acceptable follows the prefixes.
        def take_noun_with_prefix(morphemes, index)
          prefix_count = 0
          prefix_count += 1 while noun_prefix?(morphemes, index + prefix_count)
          return 0 unless prefix_count > 0

          rest_count = take(morphemes, index + prefix_count)
          return 0 unless rest_count > 0
          prefix_count + rest_count
        end
      end

      # Leading morphemes that are noun prefixes (cached).
      def prefix_morphemes
        return @prefix_morphemes if @prefix_morphemes

        count = 0
        count += 1 while self.class.noun_prefix?(morphemes, count)
        @prefix_morphemes = morphemes[0, count]
      end

      # Concatenated text of the body morphemes, or nil when there are
      # none. The result is cached, nil included.
      def body
        unless instance_variable_defined?(:@body)
          @body = (body_morphemes.map(&:text).join unless body_morphemes.empty?)
        end

        @body
      end
    end
  end
end
@@ -0,0 +1,100 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'langue/japanese/words/morpheme_filter'
3
+
4
module Langue
  module Japanese
    # Mixin for words that carry grammatical attributes (progressive,
    # passive, aggressive, negative, perfective, imperative).
    #
    # The including class is expected to behave like a collection of
    # morphemes (#index, #[], #empty?) and to provide #morphemes,
    # #key_morpheme and #body_morphemes.
    module Attribute
      # Installs the morpheme filter and the +has+ class macro on the
      # including class.
      def self.included(object)
        object.class_eval do
          include MorphemeFilter
          # Keep the morphemes up to and including the key morpheme; an
          # empty word is passed through untouched.
          filter { |word, morphemes| word.empty? ? morphemes : morphemes[0..morphemes.index(word.key_morpheme)] }

          # Defines a cached predicate "<attr>?" for each name, backed by
          # the private "include_<attr>?" method and coerced to true/false.
          def self.has(*attrs)
            attrs.each do |attr|
              define_method("#{attr}?") do
                @attrs ||= {}
                @attrs[attr] = !!__send__("include_#{attr}?") unless @attrs.key?(attr)
                @attrs[attr]
              end
            end
          end
        end
      end

      # Text of the body morphemes with the last one in its root form, or
      # nil when there are no body morphemes. The result is cached, nil
      # included.
      def body
        unless instance_variable_defined?(:@body)
          @body = if body_morphemes.empty?
            nil
          else
            leading = body_morphemes.dup
            last_morpheme = leading.pop
            leading.map(&:text).join + last_morpheme.root_form
          end
        end

        @body
      end

      # Ruby < 1.9 has no block form of #index; provide it there.
      if RUBY_VERSION.to_f < 1.9
        def index(value = nil)
          if value
            super
          else
            each_with_index { |morpheme, index| return index if yield morpheme }
            nil
          end
        end
      end

      private

      # NOTE(review): the checks below call morphemes.at(i) { ... } with a
      # block, so #at is assumed to be a Morphemes-specific method that
      # applies the block to the morpheme at that position - confirm.

      def include_progressive?
        if noncategorematic_verb_index(%w(てる でる とる どる))
          true
        elsif (position = noncategorematic_verb_index(['いる']))
          morphemes.at(position - 1) { |m| m.classified?('助詞', '接続助詞') && %w(て で).include?(m.root_form) }
        end
      end

      def include_passive?
        verb_suffix_index(%w(れる られる))
      end

      def include_aggressive?
        auxiliary_verb_index('特殊・タイ')
      end

      def include_negative?
        if auxiliary_verb_index('特殊・ナイ')
          true
        elsif (position = auxiliary_verb_index('特殊・ヌ'))
          morphemes.at(position - 1) { |m| m.inflection_type == '未然形' }
        end
      end

      def include_perfective?
        if auxiliary_verb_index('特殊・タ')
          true
        elsif (position = index { |m| m.classified?('助動詞') && m.root_form == 'ぬ' })
          morphemes.at(position - 1) { |m| m.inflection_type == '連用形' }
        end
      end

      # True when the last morpheme's inflection type starts with the
      # imperative marker. An empty word is never imperative (previously
      # this raised NoMethodError on nil).
      def include_imperative?
        last_morpheme = self[-1]
        !last_morpheme.nil? && last_morpheme.inflection_type =~ /\A命令/
      end

      # Position of the first non-independent verb whose root form is one
      # of +root_forms+, or nil.
      def noncategorematic_verb_index(root_forms)
        index { |m| m.classified?('動詞', '非自立') && root_forms.include?(m.root_form) }
      end

      # Position of the first auxiliary verb with the given inflection,
      # or nil.
      def auxiliary_verb_index(inflection)
        index { |m| m.classified?('助動詞') && m.inflected?(inflection) }
      end

      # Position of the first verb suffix whose root form is one of
      # +root_forms+, or nil.
      def verb_suffix_index(root_forms)
        index { |m| m.classified?('動詞', '接尾') && root_forms.include?(m.root_form) }
      end
    end
  end
end