ve 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +22 -0
- data/Rakefile +9 -0
- data/Readme.md +60 -0
- data/js/test.html +32 -0
- data/js/ve.js +57 -0
- data/lib/language.rb +2 -0
- data/lib/languages/english.rb +6 -0
- data/lib/languages/japanese.rb +9 -0
- data/lib/misc.rb +10 -0
- data/lib/part_of_speech.rb +30 -0
- data/lib/provider.rb +29 -0
- data/lib/providers/fallbacks.rb +0 -0
- data/lib/providers/freeling_en.rb +229 -0
- data/lib/providers/japanese_transliterators.rb +293 -0
- data/lib/providers/mecab_ipadic.rb +362 -0
- data/lib/ve.rb +111 -0
- data/lib/word.rb +43 -0
- data/sinatra/server.rb +46 -0
- data/tests/freeling_en_test.rb +135 -0
- data/tests/japanese_transliterators_test.rb +79 -0
- data/tests/mecab_ipadic_test.rb +452 -0
- data/tests/test_helper.rb +26 -0
- data/tests/ve_test.rb +20 -0
- data/ve.gemspec +20 -0
- metadata +80 -0
data/lib/ve.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
$: << File.expand_path(File.dirname(__FILE__))
|
2
|
+
|
3
|
+
require 'misc'
|
4
|
+
require 'word'
|
5
|
+
require 'part_of_speech'
|
6
|
+
require 'languages/english'
|
7
|
+
require 'languages/japanese'
|
8
|
+
require 'pp'
|
9
|
+
|
10
|
+
# Facade of the library. Text-analysis requests (for example +words+) for a
# language are routed to whichever provider registered that ability.
class Ve

  # Registry that maps language => ability => provider instance.
  class Manager
    # Initialised eagerly so that a lookup made before any provider has
    # registered returns nil instead of raising NameError.
    @@provider_for = {}

    # Returns the provider registered for +function+ in +language+, or nil
    # when nothing has been registered for that combination.
    def self.provider_for(language, function)
      abilities = @@provider_for[language.to_sym]
      abilities && abilities[function.to_sym]
    end

    # TODO: Make a difference between what features are available locally
    # and what requires contacting external Ves
    #
    # Instantiates +klass+ and registers the instance for +language+ under
    # every ability of the parse class that shares the provider's name
    # (Ve::Parse::<ProviderName>).
    def self.register(klass, language)
      provider = klass.new
      # The abilities are the public instance methods the parse class adds on
      # top of Object. This won't work if people start monkey patching the
      # parse classes with public methods that aren't abilities.
      provider_name = provider.class.to_s.split('::').last
      # const_get resolves the constant without evaluating a string as code.
      parse_class = Ve::Parse.const_get(provider_name)
      abilities = parse_class.public_instance_methods - Object.public_instance_methods
      registry = (@@provider_for[language.to_sym] ||= {})
      abilities.each { |ability| registry[ability] = provider }
    end
  end

  # TODO: Put into separate files

  # Interface that answers ability calls using providers registered in this
  # process.
  class LocalInterface
    # +config+ is accepted so all interfaces share a constructor signature;
    # it is not used here.
    def initialize(language, config = {})
      @language = language
    end

    # Treats the called method name as an ability: parses the first argument
    # with the registered provider and returns that ability's result.
    # Falls back to the default NoMethodError when no provider offers it.
    def method_missing(function, *args)
      provider = Ve::Manager.provider_for(@language, function)
      return super if provider.nil?

      provider.parse(args[0]).send(function.to_sym)
    end

    # Keeps respond_to? consistent with method_missing.
    def respond_to_missing?(function, include_private = false)
      !Ve::Manager.provider_for(@language, function).nil? || super
    end
  end

  # Interface that forwards ability calls to a remote server over HTTP.
  class HTTPInterface
    require 'net/http'
    require 'uri'
    require 'json'

    # +config[:url]+ is the base URL requests are sent to.
    def initialize(language, config = {})
      @language = language
      @base_url = config[:url]
    end

    # POSTs the first argument as +text+ to <base_url>/<language>/<function>
    # and rebuilds Ve::Word objects from the JSON response. Entries whose
    # '_class' is not 'Word' are ignored.
    def method_missing(function, *args)
      uri = URI.parse("#{@base_url}/#{@language}/#{function}")
      response = Net::HTTP.post_form(uri, {:text => args[0]})
      data = JSON.parse(response.body)

      data.each_with_object([]) do |obj, result|
        # TODO: Support transliterations
        next unless obj['_class'] == 'Word'

        result << Ve::Word.new(obj['word'], obj['lemma'], obj['part_of_speech'],
                               obj['tokens'], obj['extra'], obj['info'])
      end
    end
  end

  @@interface = Ve::LocalInterface
  @@interface_for = {}
  @@config = {}

  # End-users only interact with this class, so it must provide a friendly
  # interface to all functionality in the providers and parse objects.

  # Basic local-only interface: parses +text+ with the provider registered
  # for +language+ / +function+ and returns the result of +function+.
  # Extra +args+ are handed to the provider's parse call, not to the lookup.
  # Raises NoMethodError when no provider offers the function.
  def self.get(text, language, function, *args)
    provider = Ve::Manager.provider_for(language, function)
    if provider.nil?
      raise NoMethodError, "no provider offers `#{function}' for language `#{language}'"
    end

    provider.parse(text, args).send(function.to_sym)
  end

  # Early version of the friendly interface: returns the (cached) interface
  # object for +language+, so callers can write Ve.in(:ja).words(text).
  def self.in(language)
    @@interface_for[language] ||= @@interface.new(language, @@config)
  end

  # Selects the interface class and its configuration. Cached interfaces are
  # dropped so the new settings also apply to languages already requested.
  def self.config(interface, config)
    @@interface_for = {}
    @@interface = interface
    @@config = config
  end

end
|
105
|
+
|
106
|
+
# TODO: Autoload the providers instead of requiring each one explicitly
|
107
|
+
require 'providers/fallbacks'
|
108
|
+
require 'providers/mecab_ipadic'
|
109
|
+
require 'providers/freeling_en'
|
110
|
+
require 'providers/japanese_transliterators'
|
111
|
+
|
data/lib/word.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
class Ve
  # A word as produced by a parse: its surface form, lemma, part of speech,
  # the tokens it was built from and two free-form hashes (+extra+, +info+).
  class Word

    attr_accessor :word, :lemma, :part_of_speech, :tokens, :extra, :info

    # TODO: More elegance
    #
    # +word+ and +lemma+ are copied so later changes to the Word do not
    # modify the caller's strings; nil is stored as nil.
    def initialize(word, lemma, part_of_speech, tokens, extra = {}, info = {})
      @word = word.nil? ? nil : word.dup
      @lemma = lemma.nil? ? nil : lemma.dup
      @part_of_speech = part_of_speech
      @tokens = tokens

      # TODO: I don't like this, it's too unstructured
      @extra = extra
      @info = info
    end

    # TODO: the main part of a word, for example 重要 in 重要な
    # Not implemented yet; currently returns nil.
    def main_part
    end

    # The lemma of the word.
    def base_form
      @lemma
    end

    # True when the surface form differs from the lemma.
    def inflected?
      @word != @lemma
    end

    # Hash representation used for JSON serialisation.
    def as_json
      {
        :_class => 'Word',
        :word => @word,
        :lemma => @lemma,
        :part_of_speech => part_of_speech_name,
        :tokens => @tokens,
        :extra => @extra,
        :info => @info
      }
    end

    private

    # A Word rebuilt from JSON carries its part of speech as the already
    # serialised value rather than an object responding to #name, so only
    # call #name when it is available.
    def part_of_speech_name
      @part_of_speech.respond_to?(:name) ? @part_of_speech.name : @part_of_speech
    end

  end
end
|
data/sinatra/server.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler/setup'
|
3
|
+
require 'sinatra'
|
4
|
+
require 'json'
|
5
|
+
require 'rack/cors'
|
6
|
+
|
7
|
+
require File.expand_path(File.dirname(__FILE__) + "/../lib/ve")
|
8
|
+
|
9
|
+
# Allow cross-origin requests from any origin. rack-cors only emits CORS
# headers for paths declared with `resource`, so without this line the
# `allow` block has no effect.
use Rack::Cors do
  allow do
    origins '*'
    resource '*', :headers => :any, :methods => [:get, :post, :options]
  end
end

# Both verbs are answered by the same handler.
get '/:language/:function' do
  run
end

post '/:language/:function' do
  run
end
|
22
|
+
|
23
|
+
private

# A JSONP callback must be a plain (optionally dotted) JavaScript identifier.
# Anything else would let the caller inject script into the response.
JSONP_CALLBACK_PATTERN = /\A[A-Za-z_$][\w$]*(?:\.[A-Za-z_$][\w$]*)*\z/

# Shared handler for GET and POST /:language/:function.
#
# Splits params[:text] into words for params[:language] and returns them as
# JSON, or as JSONP when params[:callback] is given. Only the `words`
# function is supported; other functions halt with 404, and a callback that
# is not a valid identifier halts with 400.
def run
  # Ve.source = Ve::Local # Default
  # Ve.source = Ve::Remote.new(:url => 'http://ve.kimtaro.com/', :access_token => 'XYZ')
  # result = Ve.get(params[:text], params[:language], params[:function].to_sym)
  halt 404, 'Unsupported function' unless params[:function].to_s == 'words'

  callback = params[:callback]
  if callback && callback !~ JSONP_CALLBACK_PATTERN
    halt 400, 'Invalid callback'
  end

  words = Ve.in(params[:language]).words(params[:text])
  json = JSON.generate(words.collect(&:as_json))

  if callback
    content_type 'application/javascript', :charset => 'utf-8'
    "#{callback}(#{json})"
  else
    content_type 'application/json', :charset => 'utf-8'
    json
  end
end
|
@@ -0,0 +1,135 @@
|
|
1
|
+
# Encoding: UTF-8
|
2
|
+
|
3
|
+
require_relative 'test_helper'
|
4
|
+
|
5
|
+
# Tests for the English provider Ve::Provider::FreelingEn and the
# Ve::Parse::FreelingEn objects it returns.
class FreelingEnTest < Test::Unit::TestCase

  def test_should_be_able_to_start
    freeling = Ve::Provider::FreelingEn.new
    assert freeling.works?
  end

  # Non-English input must still produce a parse object.
  def test_doesnt_die_on_japanese
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse('これは日本語です')
    assert_equal Ve::Parse::FreelingEn, parse.class
  end

  # TODO: UTF-8 handling
  def test_can_handle_utf8
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse('I’m')
    assert_equal ['I\'m'], parse.tokens.collect { |t| t[:literal] }
  end

  # An empty string must still produce a parse object.
  def test_can_parse
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse('')
    assert_equal Ve::Parse::FreelingEn, parse.class
  end

  # Joining every token literal must reproduce the input exactly.
  def test_all_literals_should_equal_the_input_text
    text = <<-EOS
There once was a man from X
Who took it upon himself to Y
Z

    EOS
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse(text)
    assert_equal text, parse.tokens.collect { |t| t[:literal] }.join
  end

  def test_creates_tokens_from_data_that_is_ignored_in_parsing
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse('A B ')
    assert_equal [:parsed, :unparsed, :parsed, :unparsed, :sentence_split], parse.tokens.collect { |t| t[:type] }
    assert_equal ['A', ' ', 'B', ' ', ''], parse.tokens.collect { |t| t[:literal] }
  end

  def test_can_give_sentences
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse('This is a sentence. And this was another one')
    assert_equal ['This is a sentence.', 'And this was another one'], parse.sentences
  end

  def test_can_give_words
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse('This was a sentence.')
    words = parse.words
    tokens = parse.tokens

    assert_equal ['This', 'was', 'a', 'sentence', '.'], words.collect(&:word)
    assert_equal ['this', 'be', 'a', 'sentence', '.'], words.collect(&:lemma)
    assert_equal [Ve::PartOfSpeech::Pronoun, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Determiner, Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Symbol], words.collect(&:part_of_speech)
    assert_equal [{:grammar => :personal}, {:grammar => :past}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}], words.collect(&:extra)

    assert_equal [[tokens[0]], [tokens[2]], [tokens[4]], [tokens[6]], [tokens[7]]], words.collect(&:tokens)
  end

  def test_possessive_endings_must_be_reattached
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse("This is Jane's sentence.")
    words = parse.words
    tokens = parse.tokens

    assert_equal ['This', 'is', "Jane's", 'sentence', '.'], words.collect(&:word)
    assert_equal ['this', 'be', "jane", 'sentence', '.'], words.collect(&:lemma)
    assert_equal [Ve::PartOfSpeech::Pronoun, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Symbol], words.collect(&:part_of_speech)
    # The key was misspelled as :gramamr, which could never equal the
    # :grammar key used by every other expectation in this file.
    assert_equal [{:grammar => :personal}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
    assert_equal [[tokens[0]], [tokens[2]], tokens[4..5], [tokens[7]], [tokens[8]]], words.collect(&:tokens)
  end

  def test_date_parsing
    # Should be turned off. At least for now
    freeling = Ve::Provider::FreelingEn.new

    assert_parses_into_words(freeling,
                             {:words => ['January'],
                              :lemmas => ['january'],
                              :pos => [Ve::PartOfSpeech::Noun],
                              :extra => [{:grammar => nil}],
                              :tokens => [0..0]},
                             'January')
  end

  def test_symbol_parsing
    freeling = Ve::Provider::FreelingEn.new

    assert_parses_into_words(freeling,
                             {:words => ['.', ',', '$'],
                              :lemmas => ['.', ',', '$'],
                              :pos => [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol],
                              :extra => [{:grammar => nil}, {:grammar => nil}, {:grammar => nil}],
                              :tokens => [0..0, 1..1, 2..2]},
                             '.,$')
  end

  def test_can_handle_underscores_properly
    # Should restore them
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse("In New York")
    words = parse.words
    tokens = parse.tokens

    assert_equal ['In', 'New York'], words.collect(&:word)
    assert_equal ['in', 'new york'], words.collect(&:lemma)
    assert_equal [Ve::PartOfSpeech::Preposition, Ve::PartOfSpeech::ProperNoun], words.collect(&:part_of_speech)
    assert_equal [{:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
    assert_equal [tokens[0..0], tokens[2..2]], words.collect(&:tokens)

    # Should keep them
    # TODO
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse("In New_York")
    words = parse.words
    tokens = parse.tokens

    assert_equal ['In', 'New_York'], words.collect(&:word)
    assert_equal ['in', 'new_york'], words.collect(&:lemma)
    assert_equal [Ve::PartOfSpeech::Preposition, Ve::PartOfSpeech::ProperNoun], words.collect(&:part_of_speech)
    # The key was misspelled as :grammarl.
    assert_equal [{:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
    # NOTE(review): three token groups are expected here although only two
    # words are asserted above; left unchanged because of the TODO - confirm
    # the intended grouping.
    assert_equal [tokens[0..1], tokens[2..2], tokens[3..11]], words.collect(&:tokens)
  end

end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# Encoding: UTF-8
|
2
|
+
|
3
|
+
require_relative 'test_helper'
|
4
|
+
|
5
|
+
# Tests for Ve::Provider::JapaneseTransliterators: conversions between
# hiragana, katakana, latin script and full-/half-width characters.
class JapaneseTransliteratorsTest < Test::Unit::TestCase

  # KATAKANA and HIRAGANA are used below as each other's expected conversion.
  KATAKANA = "ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶ"
  HIRAGANA = "ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ"
  HALFWIDTH = "!\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ "
  # Full-width counterparts of HALFWIDTH, in the same order. The literal had
  # been normalised to ASCII, which ended the string at "!" and turned the
  # rest of the line into a comment.
  FULLWIDTH = "！＂＃＄％＆＇（）＊＋，－．／０１２３４５６７８９：；＜＝＞？＠ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ［＼］＾＿｀ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ｛｜｝～　"

  def setup
    @trans = Ve::Provider::JapaneseTransliterators.new
  end

  def test_should_be_able_to_start
    assert @trans.works?
  end

  def test_transliterate_from_hira_to_latn
    assert_equal 'kosoado', @trans.parse('こそあど').transliterate_from_hira_to_latn
    assert_equal 'konna', @trans.parse('こんな').transliterate_from_hira_to_latn
    assert_equal 'konyaku', @trans.parse('こにゃく').transliterate_from_hira_to_latn
    assert_equal 'kon\'yaku', @trans.parse('こんやく').transliterate_from_hira_to_latn
    assert_equal 'shinbun', @trans.parse('しんぶん').transliterate_from_hira_to_latn
    assert_equal 'appa', @trans.parse('あっぱ').transliterate_from_hira_to_latn
  end

  def test_transliterate_from_latn_to_hrkt
    # Lower-case input is expected to come back as hiragana
    assert_equal('かなです', @trans.parse('kanadesu').transliterate_from_latn_to_hrkt)
    assert_equal('こそあど', @trans.parse('kosoado').transliterate_from_latn_to_hrkt)
    assert_equal('こんな', @trans.parse('konna').transliterate_from_latn_to_hrkt)
    assert_equal('しんぶん', @trans.parse('shimbun').transliterate_from_latn_to_hrkt)
    assert_equal('しんぱい', @trans.parse('simpai').transliterate_from_latn_to_hrkt)
    assert_equal('うぁ', @trans.parse('wha').transliterate_from_latn_to_hrkt)
    assert_equal('かっちゃった', @trans.parse('katchatta').transliterate_from_latn_to_hrkt)
    assert_equal('かっわいいぃ', @trans.parse('kawwaiixi').transliterate_from_latn_to_hrkt)
    assert_equal('おっとせい', @trans.parse('ottosei').transliterate_from_latn_to_hrkt)
    assert_equal('あっち', @trans.parse('acchi').transliterate_from_latn_to_hrkt)

    # Katakana
    assert_equal('カナデス', @trans.parse('KANADESU').transliterate_from_latn_to_hrkt)
    assert_equal('コソアド', @trans.parse('KOSOADO').transliterate_from_latn_to_hrkt)
    assert_equal('コンナ', @trans.parse('KONNA').transliterate_from_latn_to_hrkt)
    assert_equal('シンブン', @trans.parse('SHIMBUN').transliterate_from_latn_to_hrkt)
    assert_equal('シンパイ', @trans.parse('SIMPAI').transliterate_from_latn_to_hrkt)
    assert_equal('ウァ', @trans.parse('WHA').transliterate_from_latn_to_hrkt)
    assert_equal('カッチャッタ', @trans.parse('KATCHATTA').transliterate_from_latn_to_hrkt)
    assert_equal('カッワイイィ', @trans.parse('KAWWAIIXI').transliterate_from_latn_to_hrkt)
    assert_equal('オットセイ', @trans.parse('OTTOSEI').transliterate_from_latn_to_hrkt)
    assert_equal('アッチ', @trans.parse('ACCHI').transliterate_from_latn_to_hrkt)
    assert_equal('カタカナ です', @trans.parse('KATAKANA desu').transliterate_from_latn_to_hrkt)

    # Non-Japanese
    assert_equal('てぃs いs そめ えんgりsh', @trans.parse('this is some english').transliterate_from_latn_to_hrkt)
  end

  def test_transliterate_from_hira_to_kana
    assert_equal KATAKANA, @trans.parse(HIRAGANA).transliterate_from_hira_to_kana
  end

  # NOTE: the test name says "kata_to_hina" but the method under test is
  # transliterate_from_kana_to_hira; the name is kept unchanged.
  def test_transliterate_from_kata_to_hina
    assert_equal HIRAGANA, @trans.parse(KATAKANA).transliterate_from_kana_to_hira
  end

  def test_transliterate_from_hrkt_to_latn
    assert_equal 'hiraganakatakana', @trans.parse('ひらがなカタカナ').transliterate_from_hrkt_to_latn
  end

  def test_transliterate_from_fullwidth_to_halfwidth
    assert_equal HALFWIDTH, @trans.parse(FULLWIDTH).transliterate_from_fullwidth_to_halfwidth
  end

  def test_transliterate_from_halfwidth_to_fullwidth
    assert_equal FULLWIDTH, @trans.parse(HALFWIDTH).transliterate_from_halfwidth_to_fullwidth
  end

end
|
@@ -0,0 +1,452 @@
|
|
1
|
+
# Encoding: UTF-8
|
2
|
+
|
3
|
+
require_relative 'test_helper'
|
4
|
+
|
5
|
+
class MecabIpadicTest < Test::Unit::TestCase
|
6
|
+
|
7
|
+
# A freshly constructed provider must report that it works.
def test_should_be_able_to_start
  assert Ve::Provider::MecabIpadic.new.works?
end
|
11
|
+
|
12
|
+
# Parsing an empty string must still yield a Ve::Parse::MecabIpadic.
def test_can_parse
  result = Ve::Provider::MecabIpadic.new.parse('')
  assert_equal Ve::Parse::MecabIpadic, result.class
end
|
17
|
+
|
18
|
+
# Joining the literal of every token must reproduce the input exactly.
def test_all_literals_should_equal_the_input_text
  haiku = <<-EOS
古池や
蛙飛び込む
水の音

  EOS
  provider = Ve::Provider::MecabIpadic.new
  literals = provider.parse(haiku).tokens.map { |token| token[:literal] }
  assert_equal haiku, literals.join
end
|
29
|
+
|
30
|
+
# Parsed text, the whitespace between it, and the sentence end must each
# get a token.
def test_tokens_must_be_created_for_parsed_and_unparsed_text
  mecab = Ve::Provider::MecabIpadic.new
  # The asserted character ranges (2..4 and 6..7) describe whitespace runs of
  # three and two characters in an eight-character input, so the input and
  # the expected literals must contain those runs.
  parse = mecab.parse(' A   B  ')
  tokens = parse.tokens
  assert_equal [:unparsed, :parsed, :unparsed, :parsed, :unparsed, :sentence_split], tokens.collect { |t| t[:type] }
  assert_equal [' ', 'A', '   ', 'B', '  ', ''], tokens.collect { |t| t[:literal] }
  assert_equal [0..0, 1..1, 2..4, 5..5, 6..7, nil], tokens.collect { |t| t[:characters] }
end
|
37
|
+
|
38
|
+
# The first token of 悪化する must keep 悪化 as both its literal and its lemma.
def test_tokens_should_not_be_modified_when_attached_to_words
  first_token = Ve::Provider::MecabIpadic.new.parse('悪化する').tokens[0]
  assert_equal '悪化', first_token[:literal]
  assert_equal '悪化', first_token[:lemma]
end
|
45
|
+
|
46
|
+
# Mixed Japanese/English input must be split into its two sentences.
def test_sentences
  expected = ['これは文章である。', 'で、also containing some Englishですね']
  provider = Ve::Provider::MecabIpadic.new
  result = provider.parse('これは文章である。で、also containing some Englishですね')
  assert_equal expected, result.sentences
end
|
51
|
+
|
52
|
+
# Regression input: must parse without raising and give 11 tokens.
def test_this_shouldnt_crash
  token_list = Ve::Provider::MecabIpadic.new.parse('チューたろうは田中さんの犬です。').tokens
  assert_equal 11, token_list.size
end
|
57
|
+
|
58
|
+
# Regression input: must parse without raising and give 7 tokens.
def test_this_shouldnt_crash_either
  token_list = Ve::Provider::MecabIpadic.new.parse('三十年式歩兵銃').tokens
  assert_equal 7, token_list.size
end
|
63
|
+
|
64
|
+
def test_words
|
65
|
+
mecab = Ve::Provider::MecabIpadic.new
|
66
|
+
|
67
|
+
# Meishi
|
68
|
+
assert_parses_into_words(mecab, {:words => ['車'],
|
69
|
+
:lemmas => ['車'],
|
70
|
+
:pos => [Ve::PartOfSpeech::Noun],
|
71
|
+
:extra => [{:reading => 'クルマ', :transcription => 'クルマ', :grammar => nil}],
|
72
|
+
:tokens => [0..0]},
|
73
|
+
'車')
|
74
|
+
|
75
|
+
# Koyuumeishi
|
76
|
+
assert_parses_into_words(mecab, {:words => ['太郎'],
|
77
|
+
:lemmas => ['太郎'],
|
78
|
+
:pos => [Ve::PartOfSpeech::ProperNoun],
|
79
|
+
:extra => [{:reading => 'タロウ', :transcription => 'タロー', :grammar => nil}],
|
80
|
+
:tokens => [0..0]},
|
81
|
+
'太郎')
|
82
|
+
|
83
|
+
# Daimeishi
|
84
|
+
assert_parses_into_words(mecab, {:words => ['彼'],
|
85
|
+
:lemmas => ['彼'],
|
86
|
+
:pos => [Ve::PartOfSpeech::Pronoun],
|
87
|
+
:extra => [{:reading => 'カレ', :transcription => 'カレ', :grammar => nil}],
|
88
|
+
:tokens => [0..0]},
|
89
|
+
'彼')
|
90
|
+
|
91
|
+
# Fukushikanou
|
92
|
+
assert_parses_into_words(mecab, {:words => ['午後に'],
|
93
|
+
:lemmas => ['午後に'],
|
94
|
+
:pos => [Ve::PartOfSpeech::Adverb],
|
95
|
+
:extra => [{:reading => 'ゴゴニ', :transcription => 'ゴゴニ', :grammar => nil}],
|
96
|
+
:tokens => [0..1]},
|
97
|
+
'午後に')
|
98
|
+
|
99
|
+
# Kazu
|
100
|
+
assert_parses_into_words(mecab, {:words => ['一'],
|
101
|
+
:lemmas => ['一'],
|
102
|
+
:pos => [Ve::PartOfSpeech::Number],
|
103
|
+
:extra => [{:reading => 'イチ', :transcription => 'イチ', :grammar => nil}],
|
104
|
+
:tokens => [0..0]},
|
105
|
+
'一')
|
106
|
+
|
107
|
+
assert_parses_into_words(mecab, {:words => ['123'],
|
108
|
+
:lemmas => ['123'],
|
109
|
+
:pos => [Ve::PartOfSpeech::Number],
|
110
|
+
:extra => [{:reading => 'イチニサン', :transcription => 'イチニサン', :grammar => nil}],
|
111
|
+
:tokens => [0..2]},
|
112
|
+
'123')
|
113
|
+
|
114
|
+
# Sahensetsuzoku + tokumi ta
|
115
|
+
assert_parses_into_words(mecab, {:words => ['悪化した'],
|
116
|
+
:lemmas => ['悪化する'],
|
117
|
+
:pos => [Ve::PartOfSpeech::Verb],
|
118
|
+
:extra => [{:reading => 'アッカシタ', :transcription => 'アッカシタ', :grammar => nil}],
|
119
|
+
:tokens => [0..2]},
|
120
|
+
'悪化した')
|
121
|
+
|
122
|
+
# Keiyoudoushigokan
|
123
|
+
assert_parses_into_words(mecab, {:words => ['重要な'],
|
124
|
+
:lemmas => ['重要'],
|
125
|
+
:pos => [Ve::PartOfSpeech::Adjective],
|
126
|
+
:extra => [{:reading => 'ジュウヨウナ', :transcription => 'ジューヨーナ', :grammar => nil}],
|
127
|
+
:tokens => [0..1]},
|
128
|
+
'重要な')
|
129
|
+
|
130
|
+
# Naikeiyoushigokan
|
131
|
+
assert_parses_into_words(mecab, {:words => ['とんでもない'],
|
132
|
+
:lemmas => ['とんでもない'],
|
133
|
+
:pos => [Ve::PartOfSpeech::Adjective],
|
134
|
+
:extra => [{:reading => 'トンデモナイ', :transcription => 'トンデモナイ', :grammar => nil}],
|
135
|
+
:tokens => [0..1]},
|
136
|
+
'とんでもない')
|
137
|
+
|
138
|
+
# Meishi hijiritsu fukushikanou
|
139
|
+
assert_parses_into_words(mecab, {:words => ['の', 'うちに'],
|
140
|
+
:lemmas => ['の', 'うちに'],
|
141
|
+
:pos => [Ve::PartOfSpeech::Postposition, Ve::PartOfSpeech::Adverb],
|
142
|
+
:extra => [{:reading => 'ノ', :transcription => 'ノ', :grammar => nil},
|
143
|
+
{:reading => 'ウチニ', :transcription => 'ウチニ', :grammar => nil}],
|
144
|
+
:tokens => [0..0, 1..2]},
|
145
|
+
'のうちに')
|
146
|
+
|
147
|
+
# Meishi hijiritsu jodoushigokan
|
148
|
+
assert_parses_into_words(mecab, {:words => ['の', 'ような'],
|
149
|
+
:lemmas => ['の', 'ようだ'],
|
150
|
+
:pos => [Ve::PartOfSpeech::Postposition, Ve::PartOfSpeech::Verb],
|
151
|
+
:extra => [{:reading => 'ノ', :transcription => 'ノ', :grammar => nil},
|
152
|
+
{:reading => 'ヨウナ', :transcription => 'ヨーナ', :grammar => :auxillary}],
|
153
|
+
:tokens => [0..0, 1..2]},
|
154
|
+
'のような')
|
155
|
+
|
156
|
+
assert_parses_into_words(mecab, {:words => ['の', 'ように'],
|
157
|
+
:lemmas => ['の', 'ように'],
|
158
|
+
:pos => [Ve::PartOfSpeech::Postposition, Ve::PartOfSpeech::Adverb],
|
159
|
+
:extra => [{:reading => 'ノ', :transcription => 'ノ', :grammar => nil},
|
160
|
+
{:reading => 'ヨウニ', :transcription => 'ヨーニ', :grammar => nil}],
|
161
|
+
:tokens => [0..0, 1..2]},
|
162
|
+
'のように')
|
163
|
+
|
164
|
+
# Meishi hijiritsu keiyoudoushigokan
|
165
|
+
assert_parses_into_words(mecab, {:words => ['みたいな'],
|
166
|
+
:lemmas => ['みたいだ'],
|
167
|
+
:pos => [Ve::PartOfSpeech::Adjective],
|
168
|
+
:extra => [{:reading => 'ミタイナ', :transcription => 'ミタイナ', :grammar => nil}],
|
169
|
+
:tokens => [0..1]},
|
170
|
+
'みたいな')
|
171
|
+
|
172
|
+
assert_parses_into_words(mecab, {:words => ['みたいの'],
|
173
|
+
:lemmas => ['みたいの'],
|
174
|
+
:pos => [Ve::PartOfSpeech::Adjective],
|
175
|
+
:extra => [{:reading => 'ミタイノ', :transcription => 'ミタイノ', :grammar => nil}],
|
176
|
+
:tokens => [0..1]},
|
177
|
+
'みたいの')
|
178
|
+
|
179
|
+
assert_parses_into_words(mecab, {:words => ['みたい', 'だ'],
|
180
|
+
:lemmas => ['みたい', 'だ'],
|
181
|
+
:pos => [Ve::PartOfSpeech::Adjective, Ve::PartOfSpeech::Verb],
|
182
|
+
:extra => [{:reading => 'ミタイ', :transcription => 'ミタイ', :grammar => nil},
|
183
|
+
{:reading => 'ダ', :transcription => 'ダ', :grammar => nil}],
|
184
|
+
:tokens => [0..0, 1..1]},
|
185
|
+
'みたいだ')
|
186
|
+
|
187
|
+
# Meishi tokushu jodoushigokan
|
188
|
+
assert_parses_into_words(mecab, {:words => ['行く', 'そう', 'だ'],
|
189
|
+
:lemmas => ['行く', 'そう', 'だ'],
|
190
|
+
:pos => [Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Verb],
|
191
|
+
:extra => [{:reading => 'イク', :transcription => 'イク', :grammar => nil},
|
192
|
+
{:reading => 'ソウ', :transcription => 'ソー', :grammar => :auxillary},
|
193
|
+
{:reading => 'ダ', :transcription => 'ダ', :grammar => nil}],
|
194
|
+
:tokens => [0..0, 1..1, 2..2]},
|
195
|
+
'行くそうだ')
|
196
|
+
|
197
|
+
# Meishi setsubi
|
198
|
+
# TODO: This should maybe be parsed as one noun instead
|
199
|
+
assert_parses_into_words(mecab, {:words => ['楽し', 'さ'],
|
200
|
+
:lemmas => ['楽しい', 'さ'],
|
201
|
+
:pos => [Ve::PartOfSpeech::Adjective, Ve::PartOfSpeech::Suffix],
|
202
|
+
:extra => [{:reading => 'タノシ', :transcription => 'タノシ', :grammar => nil},
|
203
|
+
{:reading => 'サ', :transcription => 'サ', :grammar => nil}],
|
204
|
+
:tokens => [0..0, 1..1]},
|
205
|
+
'楽しさ')
|
206
|
+
|
207
|
+
# Meishi setsuzokushiteki
|
208
|
+
assert_parses_into_words(mecab, {:words => ['日本', '対', 'アメリカ'],
|
209
|
+
:lemmas => ['日本', '対', 'アメリカ'],
|
210
|
+
:pos => [Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Conjunction, Ve::PartOfSpeech::ProperNoun],
|
211
|
+
:extra => [{:reading => 'ニッポン', :transcription => 'ニッポン', :grammar => nil},
|
212
|
+
{:reading => 'タイ', :transcription => 'タイ', :grammar => nil},
|
213
|
+
{:reading => 'アメリカ', :transcription => 'アメリカ', :grammar => nil}],
|
214
|
+
:tokens => [0..0, 1..1, 2..2]},
|
215
|
+
'日本対アメリカ')
|
216
|
+
|
217
|
+
# Meishi doushihijiritsuteki
|
218
|
+
assert_parses_into_words(mecab, {:words => ['見て', 'ごらん'],
|
219
|
+
:lemmas => ['見る', 'ごらん'],
|
220
|
+
:pos => [Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Verb],
|
221
|
+
:extra => [{:reading => 'ミテ', :transcription => 'ミテ', :grammar => nil},
|
222
|
+
{:reading => 'ゴラン', :transcription => 'ゴラン', :grammar => :nominal}],
|
223
|
+
:tokens => [0..1, 2..2]},
|
224
|
+
'見てごらん')
|
225
|
+
|
226
|
+
# Settoushi
|
227
|
+
assert_parses_into_words(mecab, {:words => ['お', '座り'],
|
228
|
+
:lemmas => ['お', '座り'],
|
229
|
+
:pos => [Ve::PartOfSpeech::Prefix, Ve::PartOfSpeech::Noun],
|
230
|
+
:extra => [{:reading => 'オ', :transcription => 'オ', :grammar => nil},
|
231
|
+
{:reading => 'スワリ', :transcription => 'スワリ', :grammar => nil}],
|
232
|
+
:tokens => [0..0, 1..1]},
|
233
|
+
'お座り')
|
234
|
+
|
235
|
+
# Kigou
|
236
|
+
assert_parses_into_words(mecab, {:words => ['。'],
|
237
|
+
:lemmas => ['。'],
|
238
|
+
:pos => [Ve::PartOfSpeech::Symbol],
|
239
|
+
:extra => [{:reading => '。', :transcription => '。', :grammar => nil}],
|
240
|
+
:tokens => [0..0]},
|
241
|
+
'。')
|
242
|
+
|
243
|
+
# Firaa
|
244
|
+
assert_parses_into_words(mecab, {:words => ['えと'],
|
245
|
+
:lemmas => ['えと'],
|
246
|
+
:pos => [Ve::PartOfSpeech::Interjection],
|
247
|
+
:extra => [{:reading => 'エト', :transcription => 'エト', :grammar => nil}],
|
248
|
+
:tokens => [0..0]},
|
249
|
+
'えと')
|
250
|
+
|
251
|
+
# Sonota
|
252
|
+
assert_parses_into_words(mecab, {:words => ['だ', 'ァ'],
|
253
|
+
:lemmas => ['だ', 'ァ'],
|
254
|
+
:pos => [Ve::PartOfSpeech::Postposition, Ve::PartOfSpeech::Other],
|
255
|
+
:extra => [{:reading => 'ダ', :transcription => 'ダ', :grammar => nil},
|
256
|
+
{:reading => 'ァ', :transcription => 'ア', :grammar => nil}],
|
257
|
+
:tokens => [0..0, 1..1]},
|
258
|
+
'だァ')
|
259
|
+
|
260
|
+
# Kandoushi
|
261
|
+
assert_parses_into_words(mecab, {:words => ['おはよう'],
|
262
|
+
:lemmas => ['おはよう'],
|
263
|
+
:pos => [Ve::PartOfSpeech::Interjection],
|
264
|
+
:extra => [{:reading => 'オハヨウ', :transcription => 'オハヨー', :grammar => nil}],
|
265
|
+
:tokens => [0..0]},
|
266
|
+
'おはよう')
|
267
|
+
|
268
|
+
# Rentaishi
|
269
|
+
assert_parses_into_words(mecab, {:words => ['この'],
|
270
|
+
:lemmas => ['この'],
|
271
|
+
:pos => [Ve::PartOfSpeech::Determiner],
|
272
|
+
:extra => [{:reading => 'コノ', :transcription => 'コノ', :grammar => nil}],
|
273
|
+
:tokens => [0..0]},
|
274
|
+
'この')
|
275
|
+
|
276
|
+
# Setsuzokushi
|
277
|
+
assert_parses_into_words(mecab, {:words => ['そして'],
|
278
|
+
:lemmas => ['そして'],
|
279
|
+
:pos => [Ve::PartOfSpeech::Conjunction],
|
280
|
+
:extra => [{:reading => 'ソシテ', :transcription => 'ソシテ', :grammar => nil}],
|
281
|
+
:tokens => [0..0]},
|
282
|
+
'そして')
|
283
|
+
|
284
|
+
# Fukushi
|
285
|
+
assert_parses_into_words(mecab, {:words => ['多分'],
|
286
|
+
:lemmas => ['多分'],
|
287
|
+
:pos => [Ve::PartOfSpeech::Adverb],
|
288
|
+
:extra => [{:reading => 'タブン', :transcription => 'タブン', :grammar => nil}],
|
289
|
+
:tokens => [0..0]},
|
290
|
+
'多分')
|
291
|
+
|
292
|
+
# Doushi
|
293
|
+
assert_parses_into_words(mecab, {:words => ['行く'],
|
294
|
+
:lemmas => ['行く'],
|
295
|
+
:pos => [Ve::PartOfSpeech::Verb],
|
296
|
+
:extra => [{:reading => 'イク', :transcription => 'イク', :grammar => nil}],
|
297
|
+
:tokens => [0..0]},
|
298
|
+
'行く')
|
299
|
+
|
300
|
+
assert_parses_into_words(mecab, {:words => ['行かない'],
|
301
|
+
:lemmas => ['行く'],
|
302
|
+
:pos => [Ve::PartOfSpeech::Verb],
|
303
|
+
:extra => [{:reading => 'イカナイ', :transcription => 'イカナイ', :grammar => nil}],
|
304
|
+
:tokens => [0..1]},
|
305
|
+
'行かない')
|
306
|
+
|
307
|
+
assert_parses_into_words(mecab, {:words => ['行って', 'きて'],
|
308
|
+
:lemmas => ['行く', 'くる'],
|
309
|
+
:pos => [Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Verb],
|
310
|
+
:extra => [{:reading => 'イッテ', :transcription => 'イッテ', :grammar => nil},
|
311
|
+
{:reading => 'キテ', :transcription => 'キテ', :grammar => :auxillary}],
|
312
|
+
:tokens => [0..1, 2..3]},
|
313
|
+
'行ってきて')
|
314
|
+
|
315
|
+
# Doushi setsubi
|
316
|
+
assert_parses_into_words(mecab, {:words => ['行かれる'],
|
317
|
+
:lemmas => ['行く'],
|
318
|
+
:pos => [Ve::PartOfSpeech::Verb],
|
319
|
+
:extra => [{:reading => 'イカレル', :transcription => 'イカレル', :grammar => nil}],
|
320
|
+
:tokens => [0..1]},
|
321
|
+
'行かれる')
|
322
|
+
|
323
|
+
assert_parses_into_words(mecab, {:words => ['食べさせられた'],
|
324
|
+
:lemmas => ['食べる'],
|
325
|
+
:pos => [Ve::PartOfSpeech::Verb],
|
326
|
+
:extra => [{:reading => 'タベサセラレタ', :transcription => 'タベサセラレタ', :grammar => nil}],
|
327
|
+
:tokens => [0..3]},
|
328
|
+
'食べさせられた')
|
329
|
+
|
330
|
+
# Doushi + jodoushi
|
331
|
+
assert_parses_into_words(mecab, {:words => ['食べました'],
|
332
|
+
:lemmas => ['食べる'],
|
333
|
+
:pos => [Ve::PartOfSpeech::Verb],
|
334
|
+
:extra => [{:reading => 'タベマシタ', :transcription => 'タベマシタ', :grammar => nil}],
|
335
|
+
:tokens => [0..2]},
|
336
|
+
'食べました')
|
337
|
+
|
338
|
+
# Keiyoushi
|
339
|
+
assert_parses_into_words(mecab, {:words => ['寒い'],
|
340
|
+
:lemmas => ['寒い'],
|
341
|
+
:pos => [Ve::PartOfSpeech::Adjective],
|
342
|
+
:extra => [{:reading => 'サムイ', :transcription => 'サムイ', :grammar => nil}],
|
343
|
+
:tokens => [0..0]},
|
344
|
+
'寒い')
|
345
|
+
|
346
|
+
assert_parses_into_words(mecab, {:words => ['寒くて'],
|
347
|
+
:lemmas => ['寒い'],
|
348
|
+
:pos => [Ve::PartOfSpeech::Adjective],
|
349
|
+
:extra => [{:reading => 'サムクテ', :transcription => 'サムクテ', :grammar => nil}],
|
350
|
+
:tokens => [0..1]},
|
351
|
+
'寒くて')
|
352
|
+
|
353
|
+
assert_parses_into_words(mecab, {:words => ['寒かった'],
|
354
|
+
:lemmas => ['寒い'],
|
355
|
+
:pos => [Ve::PartOfSpeech::Adjective],
|
356
|
+
:extra => [{:reading => 'サムカッタ', :transcription => 'サムカッタ', :grammar => nil}],
|
357
|
+
:tokens => [0..1]},
|
358
|
+
'寒かった')
|
359
|
+
|
360
|
+
assert_parses_into_words(mecab, {:words => ['寒ければ'],
|
361
|
+
:lemmas => ['寒い'],
|
362
|
+
:pos => [Ve::PartOfSpeech::Adjective],
|
363
|
+
:extra => [{:reading => 'サムケレバ', :transcription => 'サムケレバ', :grammar => nil}],
|
364
|
+
:tokens => [0..1]},
|
365
|
+
'寒ければ')
|
366
|
+
|
367
|
+
assert_parses_into_words(mecab, {:words => ['寒けりゃ'],
|
368
|
+
:lemmas => ['寒い'],
|
369
|
+
:pos => [Ve::PartOfSpeech::Adjective],
|
370
|
+
:extra => [{:reading => 'サムケリャ', :transcription => 'サムケリャ', :grammar => nil}],
|
371
|
+
:tokens => [0..0]},
|
372
|
+
'寒けりゃ')
|
373
|
+
|
374
|
+
assert_parses_into_words(mecab, {:words => ['食べたい'],
|
375
|
+
:lemmas => ['食べる'],
|
376
|
+
:pos => [Ve::PartOfSpeech::Verb],
|
377
|
+
:extra => [{:reading => 'タベタイ', :transcription => 'タベタイ', :grammar => nil}],
|
378
|
+
:tokens => [0..1]},
|
379
|
+
'食べたい')
|
380
|
+
|
381
|
+
# Joshi
|
382
|
+
assert_parses_into_words(mecab, {:words => ['日本', 'から'],
|
383
|
+
:lemmas => ['日本', 'から'],
|
384
|
+
:pos => [Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Postposition],
|
385
|
+
:extra => [{:reading => 'ニッポン', :transcription => 'ニッポン', :grammar => nil},
|
386
|
+
{:reading => 'カラ', :transcription => 'カラ', :grammar => nil}],
|
387
|
+
:tokens => [0..0, 1..1]},
|
388
|
+
'日本から')
|
389
|
+
|
390
|
+
# The copula
|
391
|
+
assert_parses_into_words(mecab, {:words => ['日本', 'です'],
|
392
|
+
:lemmas => ['日本', 'です'],
|
393
|
+
:pos => [Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Verb],
|
394
|
+
:extra => [{:reading => 'ニッポン', :transcription => 'ニッポン', :grammar => nil},
|
395
|
+
{:reading => 'デス', :transcription => 'デス', :grammar => nil}],
|
396
|
+
:tokens => [0..0, 1..1]},
|
397
|
+
'日本です')
|
398
|
+
|
399
|
+
assert_parses_into_words(mecab, {:words => ['日本', 'だった'],
|
400
|
+
:lemmas => ['日本', 'だ'],
|
401
|
+
:pos => [Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Verb],
|
402
|
+
:extra => [{:reading => 'ニッポン', :transcription => 'ニッポン', :grammar => nil},
|
403
|
+
{:reading => 'ダッタ', :transcription => 'ダッタ', :grammar => nil}],
|
404
|
+
:tokens => [0..0, 1..2]},
|
405
|
+
'日本だった')
|
406
|
+
|
407
|
+
# いるから
|
408
|
+
assert_parses_into_words(mecab, {:words => ['いる', 'から'],
|
409
|
+
:lemmas => ['いる', 'から'],
|
410
|
+
:pos => [Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Postposition],
|
411
|
+
:extra => [{:reading => 'イル', :transcription => 'イル', :grammar => nil},
|
412
|
+
{:reading => 'カラ', :transcription => 'カラ', :grammar => nil}],
|
413
|
+
:tokens => [0..0, 1..1]},
|
414
|
+
'いるから')
|
415
|
+
|
416
|
+
# しているから
|
417
|
+
assert_parses_into_words(mecab, {:words => ['して', 'いる', 'から'],
|
418
|
+
:lemmas => ['する', 'いる', 'から'],
|
419
|
+
:pos => [Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Postposition],
|
420
|
+
:extra => [{:reading => 'シテ', :transcription => 'シテ', :grammar => nil},
|
421
|
+
{:reading => 'イル', :transcription => 'イル', :grammar => :auxillary},
|
422
|
+
{:reading => 'カラ', :transcription => 'カラ', :grammar => nil}],
|
423
|
+
:tokens => [0..0, 1..1, 2..2]},
|
424
|
+
'しているから')
|
425
|
+
|
426
|
+
# 基準があるが、
|
427
|
+
assert_parses_into_words(mecab, {:words => ['して', 'いる', 'から'],
|
428
|
+
:lemmas => ['する', 'いる', 'から'],
|
429
|
+
:pos => [Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Postposition],
|
430
|
+
:extra => [{:reading => 'シテ', :transcription => 'シテ', :grammar => nil},
|
431
|
+
{:reading => 'イル', :transcription => 'イル', :grammar => :auxillary},
|
432
|
+
{:reading => 'カラ', :transcription => 'カラ', :grammar => nil}],
|
433
|
+
:tokens => [0..0, 1..1, 2..2]},
|
434
|
+
'基準があるが、')
|
435
|
+
|
436
|
+
# TODO: xした should parse as adjective?
|
437
|
+
assert_parses_into_words(mecab, {:words => [],
|
438
|
+
:lemmas => [],
|
439
|
+
:pos => [],
|
440
|
+
:extra => [],
|
441
|
+
:tokens => []},
|
442
|
+
'')
|
443
|
+
end
|
444
|
+
|
445
|
+
# Parses '日本' with the MecabIpadic provider, passing the
# :transliterate_words => :latn option, and expects the first resulting
# word to report 'nihon' from transliteration(:latn).
#
# The name starts with `todo_` rather than `test_`, so presumably the test
# runner does not pick it up yet -- confirm before relying on it.
# NOTE(review): other assertions in this file expect the reading 'ニッポン'
# for '日本'; if the transliteration is derived from that reading the
# expected value here may need to be 'nippon' -- verify when enabling.
def todo_test_word_transliteration
  provider = Ve::Provider::MecabIpadic.new
  result = provider.parse('日本', :transliterate_words => :latn)
  first_word = result.words.first

  assert_equal('nihon', first_word.transliteration(:latn))
end
|
451
|
+
|
452
|
+
end
|