japanese_deinflector 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +3 -0
- data/.travis.yml +0 -2
- data/README.md +26 -0
- data/japanese_deinflector.gemspec +1 -1
- data/lib/data/deinflect.json +1553 -1
- data/lib/deinflect_to_json.rb +10 -9
- data/lib/japanese_deinflector.rb +46 -22
- data/lib/japanese_deinflector/version.rb +1 -1
- data/spec/japanese_deinflector_spec.rb +144 -49
- metadata +4 -3
data/lib/deinflect_to_json.rb
CHANGED
@@ -13,20 +13,21 @@ def parse(fpath)
|
|
13
13
|
# Rules are tab-separated in the following format:
|
14
14
|
# <from>\t<to>\t<type>\t<reason_index>
|
15
15
|
else
|
16
|
-
|
17
|
-
|
18
|
-
rules_hash[
|
19
|
-
|
20
|
-
:
|
21
|
-
:
|
16
|
+
from_suffix = parts.first
|
17
|
+
reason_id = parts[3].to_i
|
18
|
+
rules_hash[from_suffix.size] ||= []
|
19
|
+
rules_hash[from_suffix.size] << {
|
20
|
+
:from_suffix => from_suffix,
|
21
|
+
:to_suffix => parts[1],
|
22
|
+
:reason => reasons[reason_id],
|
22
23
|
}
|
23
24
|
end
|
24
25
|
end
|
25
|
-
|
26
|
-
{:reasons => reasons, :rules => rules_hash}
|
26
|
+
rules_hash
|
27
27
|
end
|
28
28
|
|
29
29
|
root = File.expand_path(File.dirname(__FILE__))
|
30
30
|
File.open(File.join(root, 'data/deinflect.json'), 'w') do |f|
|
31
|
-
|
31
|
+
parsed_dat = parse(File.join(root, 'data/deinflect.dat'))
|
32
|
+
f.write(JSON.pretty_generate(parsed_dat))
|
32
33
|
end
|
data/lib/japanese_deinflector.rb
CHANGED
@@ -1,41 +1,65 @@
|
|
1
1
|
#encoding: utf-8
|
2
|
+
require 'rubygems'
|
2
3
|
require 'json'
|
3
4
|
require "japanese_deinflector/version"
|
4
5
|
|
5
6
|
# Deinflects (un-conjugates) Japanese words by rewriting word suffixes
# according to a rule table loaded from data/deinflect.json, which lives next
# to this file.
#
# The rule table is a hash of "rule groups": each key is a suffix length and
# each value is an array of rules with the keys :from_suffix, :to_suffix and
# :reason.
class JapaneseDeinflector
  # #deinflect feeds its own results back into itself only while its
  # +iteration+ argument is below this value.
  RECURSION_LIMIT = 2

  # Reads data/deinflect.json and stores its rule groups in @rule_groups,
  # keyed by integer suffix size.
  def initialize
    json_path = File.join(File.expand_path(File.dirname(__FILE__)), 'data/deinflect.json')
    rule_groups = File.open(json_path) do |f|
      JSON.parse(f.read, :symbolize_names => true)
    end

    # Convert hash keys to integers (from something like :"9" -> 9)
    @rule_groups = {}
    rule_groups.each do |suffix_size, rules|
      @rule_groups[suffix_size.to_s.to_i] = rules
    end
  end

  # Returns every candidate deinflection of +word+.
  #
  # word      - the String to deinflect.
  # iteration - recursion depth; callers normally leave this at the default.
  #
  # Each candidate is a Hash with :word, :weight and :reason (see
  # #result_hash). Candidates are themselves deinflected again while
  # +iteration+ is below RECURSION_LIMIT, so chained inflections can be
  # unwound. The weight of such a chained candidate is computed against the
  # intermediate word it was derived from, not against the original +word+.
  #
  # Returns an Array of unique candidates sorted by descending :weight
  # (empty when no rule matches).
  def deinflect(word, iteration = 0)
    # A plain Array with order-preserving de-duplication is used instead of
    # Set: this file never requires 'set', so Set is not guaranteed to be
    # loaded.
    results = []

    filter_rule_groups(@rule_groups, :max_suffix_size => word.size).each do |suffix_size, rules|
      from_suffix = word[-suffix_size..-1]
      filter_rules(rules, :from_suffix => from_suffix).each do |rule|
        results << result_hash(word, from_suffix, rule[:to_suffix], rule[:reason])
      end
    end
    results.uniq!

    if iteration < RECURSION_LIMIT
      additional_results = []
      results.each do |result|
        additional_results.concat(deinflect(result[:word], iteration + 1))
      end
      # Array#| keeps first occurrences in order and drops duplicates.
      results |= additional_results
    end

    # Sort results in descending order by weight
    results.sort { |x, y| y[:weight] <=> x[:weight] }
  end

  private

  # Builds one candidate for +word+ by swapping +from_suffix+ for +to_suffix+.
  #
  # Returns a Hash with:
  #   :word   - the rewritten word
  #   :weight - from_suffix length divided by word length, rounded to three
  #             decimals; a longer matched suffix gives a higher weight
  #   :reason - the +reason+ passed in (defaults to "")
  def result_hash(word, from_suffix, to_suffix, reason = "")
    # Append new suffix to the original word to get the deinflection
    deinflected_word = "#{word[0..-from_suffix.size-1]}#{to_suffix}"
    # Weight is between 0 and 1, 1 being a higher chance of correct deinflection
    weight = (Float(from_suffix.size) / word.size).round(3)
    {
      :word => deinflected_word,
      :weight => weight,
      :reason => reason,
    }
  end

  # Returns a copy of +groups+ without the groups whose suffix size is greater
  # than or equal to filters[:max_suffix_size]. With no :max_suffix_size the
  # copy contains every group. +groups+ itself is never modified.
  #
  # NOTE(review): ">=" also drops rules whose suffix is exactly as long as the
  # word, so a word that consists entirely of a rule's suffix is never
  # rewritten. Presumably intentional; confirm before relaxing it to ">".
  def filter_rule_groups(groups, filters = {})
    max_suffix_size = filters[:max_suffix_size]
    groups.reject do |suffix_size, _rules|
      max_suffix_size && suffix_size >= max_suffix_size
    end
  end

  # Returns a new Array holding only the rules whose :from_suffix equals
  # filters[:from_suffix]. With no :from_suffix every rule is kept. +rules+
  # itself is never modified.
  def filter_rules(rules, filters = {})
    wanted_suffix = filters[:from_suffix]
    rules.reject do |rule|
      wanted_suffix && rule[:from_suffix] != wanted_suffix
    end
  end
end
|
@@ -4,67 +4,162 @@ require 'spec_helper'
|
|
4
4
|
describe JapaneseDeinflector do
|
5
5
|
subject{ JapaneseDeinflector.new }
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
#subject.deinflect("見ている").first[:word].should == "見る"
|
10
|
-
#subject.deinflect("歌っている").first[:word].should == "歌う"
|
11
|
-
|
12
|
-
# past tense
|
13
|
-
subject.deinflect("見た").first[:word].should == "見る"
|
14
|
-
subject.deinflect("歌った").first[:word].should == "歌う"
|
15
|
-
end
|
7
|
+
describe "verbs" do
|
8
|
+
context "in present / attributive tense" do
|
16
9
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
10
|
+
it "does not deinflect plain positive (non-inflected) verbs" do
|
11
|
+
subject.deinflect("見る").should be_empty
|
12
|
+
subject.deinflect("歌う").should be_empty
|
13
|
+
end
|
21
14
|
|
22
|
-
|
23
|
-
|
24
|
-
|
15
|
+
it "deinflects polite positive verbs" do
|
16
|
+
subject.deinflect("見ます").first[:word].should == "見る"
|
17
|
+
subject.deinflect("歌います").first[:word].should == "歌う"
|
18
|
+
end
|
25
19
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
20
|
+
it "deinflects plain negative verbs" do
|
21
|
+
subject.deinflect("見ない").first[:word].should == "見る"
|
22
|
+
subject.deinflect("歌わない").first[:word].should == "歌う"
|
23
|
+
end
|
30
24
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
25
|
+
it "deinflects polite negative formal verbs" do
|
26
|
+
subject.deinflect("見ません").first[:word].should == "見る"
|
27
|
+
subject.deinflect("歌いません").first[:word].should == "歌う"
|
28
|
+
end
|
35
29
|
|
36
|
-
|
37
|
-
#subject.deinflect("見ていない").first[:word].should == "見る"
|
38
|
-
#subject.deinflect("歌っていない").first[:word].should == "歌う"
|
30
|
+
end
|
39
31
|
|
40
|
-
|
41
|
-
#subject.deinflect("見なかった").first[:word].should == "見る"
|
42
|
-
#subject.deinflect("歌わなかった").first[:word].should == "歌う"
|
43
|
-
end
|
32
|
+
context "in progressive (ongoing) tense" do
|
44
33
|
|
45
|
-
|
46
|
-
|
47
|
-
subject.deinflect("見ません").first[:word].should == "見る"
|
48
|
-
subject.deinflect("歌いません").first[:word].should == "歌う"
|
34
|
+
it "deinflects plain positive verbs" do
|
35
|
+
pending "unimplemented"
|
49
36
|
|
50
|
-
|
51
|
-
|
52
|
-
|
37
|
+
subject.deinflect("見ている").first[:word].should == "見る"
|
38
|
+
subject.deinflect("歌っている").first[:word].should == "歌う"
|
39
|
+
end
|
53
40
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
41
|
+
it "deinflects polite positive verbs" do
|
42
|
+
pending "unimplemented"
|
43
|
+
|
44
|
+
subject.deinflect("見ています").first[:word].should == "見る"
|
45
|
+
subject.deinflect("歌っています").first[:word].should == "歌う"
|
46
|
+
end
|
47
|
+
|
48
|
+
it "deinflects plain negative verbs" do
|
49
|
+
pending "unimplemented"
|
50
|
+
|
51
|
+
subject.deinflect("見ていない").first[:word].should == "見る"
|
52
|
+
subject.deinflect("歌っていない").first[:word].should == "歌う"
|
53
|
+
end
|
54
|
+
|
55
|
+
it "deinflects polite negative formal verbs" do
|
56
|
+
pending "unimplemented"
|
57
|
+
|
58
|
+
subject.deinflect("見ていません").first[:word].should == "見る"
|
59
|
+
subject.deinflect("歌っていません").first[:word].should == "歌う"
|
60
|
+
end
|
61
|
+
|
62
|
+
end
|
63
|
+
|
64
|
+
context "in past / present-perfect tense" do
|
65
|
+
|
66
|
+
it "deinflects plain positive verbs" do
|
67
|
+
subject.deinflect("見た").first[:word].should == "見る"
|
68
|
+
subject.deinflect("歌った").first[:word].should == "歌う"
|
69
|
+
end
|
70
|
+
|
71
|
+
it "deinflects polite positive verbs" do
|
72
|
+
subject.deinflect("見ました").first[:word].should == "見る"
|
73
|
+
subject.deinflect("歌いました").first[:word].should == "歌う"
|
74
|
+
end
|
75
|
+
|
76
|
+
it "deinflects plain negative verbs" do
|
77
|
+
subject.deinflect("見なかった").first[:word].should == "見る"
|
78
|
+
subject.deinflect("歌わなかった").first[:word].should == "歌う"
|
79
|
+
end
|
80
|
+
|
81
|
+
it "deinflects polite negative formal verbs" do
|
82
|
+
subject.deinflect("見ませんでした").first[:word].should == "見る"
|
83
|
+
subject.deinflect("歌いませんでした").first[:word].should == "歌う"
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
87
|
+
|
88
|
+
context "presumptive verbs" do
|
58
89
|
|
59
|
-
|
60
|
-
|
61
|
-
|
90
|
+
it "deinflects plain verbs" do
|
91
|
+
subject.deinflect("見よう").first[:word].should == "見る"
|
92
|
+
subject.deinflect("歌おう").first[:word].should == "歌う"
|
93
|
+
subject.deinflect("走ろう").first[:word].should == "走る"
|
94
|
+
end
|
62
95
|
|
63
|
-
|
64
|
-
|
96
|
+
it "deinflects polite verbs" do
|
97
|
+
subject.deinflect("見ましょう").first[:word].should == "見る"
|
98
|
+
subject.deinflect("歌いましょう").first[:word].should == "歌う"
|
99
|
+
subject.deinflect("走りましょう").first[:word].should == "走る"
|
100
|
+
end
|
101
|
+
|
102
|
+
end
|
103
|
+
|
104
|
+
context "imperative verbs" do
|
105
|
+
|
106
|
+
it "deinflects plain positive verbs" do
|
107
|
+
subject.deinflect("見ろ").first[:word].should == "見る"
|
108
|
+
subject.deinflect("歌え").first[:word].should == "歌う"
|
109
|
+
subject.deinflect("走れ").first[:word].should == "走る"
|
110
|
+
end
|
111
|
+
|
112
|
+
it "deinflects plain negative verbs" do
|
113
|
+
subject.deinflect("見るな").first[:word].should == "見る"
|
114
|
+
subject.deinflect("歌うな").first[:word].should == "歌う"
|
115
|
+
subject.deinflect("走るな").first[:word].should == "走る"
|
116
|
+
end
|
117
|
+
|
118
|
+
end
|
119
|
+
|
120
|
+
context "provisional verbs" do
|
121
|
+
|
122
|
+
it "deinflects positive verbs" do
|
123
|
+
subject.deinflect("見れば").first[:word].should == "見る"
|
124
|
+
subject.deinflect("歌えば").first[:word].should == "歌う"
|
125
|
+
end
|
126
|
+
|
127
|
+
it "deinflects negative verbs" do
|
128
|
+
subject.deinflect("見なければ").first[:word].should == "見る"
|
129
|
+
subject.deinflect("歌えなければ").first[:word].should == "歌う"
|
130
|
+
end
|
131
|
+
|
132
|
+
end
|
133
|
+
|
134
|
+
context "in conditional form" do
|
135
|
+
|
136
|
+
it "deinflects plain positive verbs" do
|
137
|
+
subject.deinflect("見たら").first[:word].should == "見る"
|
138
|
+
subject.deinflect("歌ったら").first[:word].should == "歌う"
|
139
|
+
end
|
140
|
+
|
141
|
+
it "deinflects plain negative verbs" do
|
142
|
+
pending "unimplemented"
|
143
|
+
|
144
|
+
subject.deinflect("見なかったら").first[:word].should == "見る"
|
145
|
+
subject.deinflect("歌わなかったら").first[:word].should == "歌う"
|
146
|
+
end
|
147
|
+
|
148
|
+
it "deinflects polite positive verbs" do
|
149
|
+
subject.deinflect("見ましたら").first[:word].should == "見る"
|
150
|
+
subject.deinflect("歌いましたら").first[:word].should == "歌う"
|
151
|
+
end
|
152
|
+
|
153
|
+
end
|
65
154
|
end
|
66
155
|
|
67
|
-
|
68
|
-
|
156
|
+
describe "adjectives" do
|
157
|
+
it "deinflects adjectives in present tense" do
|
158
|
+
subject.deinflect("嬉しくありません").first[:word].should == "嬉しい"
|
159
|
+
end
|
160
|
+
|
161
|
+
it "deinflects adjectives in past tense" do
|
162
|
+
subject.deinflect("嬉しくありませんでした").first[:word].should == "嬉しい"
|
163
|
+
end
|
69
164
|
end
|
70
165
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: japanese_deinflector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-12-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -70,6 +70,7 @@ files:
|
|
70
70
|
- .rspec
|
71
71
|
- .travis.yml
|
72
72
|
- Gemfile
|
73
|
+
- README.md
|
73
74
|
- Rakefile
|
74
75
|
- japanese_deinflector.gemspec
|
75
76
|
- lib/data/deinflect.dat
|
@@ -99,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
99
100
|
version: '0'
|
100
101
|
requirements: []
|
101
102
|
rubyforge_project:
|
102
|
-
rubygems_version: 1.8.
|
103
|
+
rubygems_version: 1.8.24
|
103
104
|
signing_key:
|
104
105
|
specification_version: 3
|
105
106
|
summary: Deinflect (unconjugate/undecline) Japanese words.
|