ve 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/ve.rb ADDED
@@ -0,0 +1,111 @@
1
+ $: << File.expand_path(File.dirname(__FILE__))
2
+
3
+ require 'misc'
4
+ require 'word'
5
+ require 'part_of_speech'
6
+ require 'languages/english'
7
+ require 'languages/japanese'
8
+ require 'pp'
9
+
10
# Public entry point of the library. Parsing is delegated to providers that
# register themselves with Ve::Manager; callers reach them through Ve.get or
# through the per-language interface object returned by Ve.in.
class Ve

  # Registry mapping a language and an ability (the name of a public method on
  # a parse object) to the provider instance that produces that parse.
  class Manager
    # Initialised in the class body so that a lookup made before any provider
    # has registered returns nil instead of raising NameError. `||=` keeps the
    # registry intact if this file is loaded more than once.
    @@provider_for ||= {}

    # Returns the provider registered for +function+ in +language+, or nil when
    # either of them is unknown. Both arguments only need to respond to to_sym.
    def self.provider_for(language, function)
      (@@provider_for[language.to_sym] || {})[function.to_sym]
    end

    # TODO: Make a difference between what features are available locally
    # and what requires contacting external Ves

    # Instantiates +klass+ and registers the instance for every ability of its
    # parse class. The parse class is Ve::Parse::<Name>, where <Name> is the
    # last segment of the provider's class name; its abilities are its public
    # instance methods minus those every Object has.
    def self.register(klass, language)
      provider = klass.new
      # This won't work if people start monkey patching the providers with
      # public methods that aren't abilities.
      provider_name = provider.class.to_s.split('::').last
      # Plain constant lookup (restricted to Ve::Parse itself) instead of
      # evaluating a string of code.
      parse_class = Ve::Parse.const_get(provider_name, false)
      abilities = parse_class.public_instance_methods - Object.public_instance_methods
      providers = (@@provider_for[language.to_sym] ||= {})
      abilities.each { |ability| providers[ability] = provider }
    end
  end

  # TODO: Put into separate files

  # Interface that answers any registered ability by parsing in-process.
  class LocalInterface
    # +config+ is accepted so that every interface can be constructed the same
    # way; it is not used here.
    def initialize(language, config = {})
      @language = language
    end

    # Treats the called method name as an ability: parses args[0] with the
    # provider registered for it and returns that ability's result. Names with
    # no registered provider fall through to the normal NoMethodError.
    def method_missing(function, *args)
      provider = Ve::Manager.provider_for(@language, function)
      return super unless provider

      provider.parse(args[0]).send(function.to_sym)
    end

    # Keeps respond_to? consistent with method_missing.
    def respond_to_missing?(function, include_private = false)
      Ve::Manager.provider_for(@language, function) ? true : super
    end
  end

  # Interface that forwards every call to a remote server over HTTP.
  class HTTPInterface
    require 'net/http'
    require 'uri'
    require 'json'

    # +config+ must carry the server's base address under :url.
    def initialize(language, config = {})
      @language = language
      @base_url = config[:url]
    end

    # POSTs args[0] as the `text` form field to <url>/<language>/<function>
    # and rebuilds a Ve::Word for every entry of the JSON response whose
    # `_class` is 'Word'. Entries of any other class are dropped.
    #
    # NOTE: every method name that is not defined on this object ends up
    # here, so any such call performs a network request.
    def method_missing(function, *args)
      uri = URI.parse("#{@base_url}/#{@language}/#{function}")
      response = Net::HTTP.post_form(uri, {:text => args[0]})
      data = JSON.parse(response.body)

      # TODO: Support transliterations
      words = data.select { |obj| obj['_class'] == 'Word' }
      words.map do |obj|
        Ve::Word.new(obj['word'], obj['lemma'], obj['part_of_speech'], obj['tokens'], obj['extra'], obj['info'])
      end
    end
  end

  @@interface = Ve::LocalInterface
  @@interface_for = {}
  @@config = {}

  # End-users only interact with this class, so it must provide a pleasant
  # interface to all functionality in the providers and parse objects.

  # Basic, local-only interface: parses +text+ with the provider registered
  # for +function+ in +language+ and returns the result of calling +function+
  # on the parse. Extra arguments are handed to the provider's parse method.
  #
  # Raises ArgumentError when no provider is registered for the combination.
  def self.get(text, language, function, *args)
    provider = Ve::Manager.provider_for(language, function)
    unless provider
      raise ArgumentError, "No provider for #{function.inspect} in #{language.inspect}"
    end

    # Splat the extras: handing the args Array itself to the provider would
    # put an Array where it expects its second argument.
    parse = provider.parse(text, *args)
    parse.public_send(function.to_sym)
  end

  # Early version of the fluent interface: Ve.in(:ja).words(text).
  # Returns the interface object for +language+, building it on first use from
  # the interface class and config set through Ve.config.
  def self.in(language)
    @@interface_for[language] ||= @@interface.new(language, @@config)
  end

  # Selects the interface class (and its config) used by Ve.in. Interfaces
  # built under the previous configuration are discarded so the change takes
  # effect for languages that were already requested.
  def self.config(interface, config)
    @@interface = interface
    @@config = config
    @@interface_for = {}
  end

end
105
+
106
+ # TODO: Autoload the providers instead of requiring each one explicitly
107
+ require 'providers/fallbacks'
108
+ require 'providers/mecab_ipadic'
109
+ require 'providers/freeling_en'
110
+ require 'providers/japanese_transliterators'
111
+
data/lib/word.rb ADDED
@@ -0,0 +1,43 @@
1
class Ve
  # One word of a parsed text: its surface form, its lemma, its part of speech
  # and the tokens it was assembled from, plus two free-form hashes.
  class Word

    attr_accessor :word, :lemma, :part_of_speech, :tokens, :extra, :info

    # TODO: More elegance
    #
    # +word+ and +lemma+ are copied, so later mutation of the caller's strings
    # does not leak into the Word.
    def initialize(word, lemma, part_of_speech, tokens, extra = {}, info = {})
      @word = word.dup
      @lemma = lemma.dup
      @part_of_speech = part_of_speech
      @tokens = tokens

      # TODO: I don't like this, it's too unstructured
      # An explicit nil (e.g. a missing key in decoded JSON) would override
      # the defaults above, so normalise it to an empty hash.
      @extra = extra || {}
      @info = info || {}
    end

    # TODO: the main part of a word, for example 重要 in 重要な
    # Not implemented yet; returns nil.
    def main_part
    end

    # The dictionary form of the word.
    def base_form
      @lemma
    end

    # True when the surface form differs from the lemma.
    def inflected?
      @word != @lemma
    end

    # Hash representation used for JSON serialisation. Arguments are accepted
    # (and ignored) so callers following the as_json(options) convention work.
    def as_json(*_options)
      {
        :_class => 'Word',
        :word => @word,
        :lemma => @lemma,
        :part_of_speech => part_of_speech_name,
        :tokens => @tokens,
        :extra => @extra,
        :info => @info
      }
    end

    private

    # The part of speech may be an object exposing #name or, for words rebuilt
    # from JSON, already the plain name; serialise both the same way.
    def part_of_speech_name
      @part_of_speech.respond_to?(:name) ? @part_of_speech.name : @part_of_speech
    end

  end
end
data/sinatra/server.rb ADDED
@@ -0,0 +1,46 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+ require 'sinatra'
4
+ require 'json'
5
+ require 'rack/cors'
6
+
7
+ require File.expand_path(File.dirname(__FILE__) + "/../lib/ve")
8
+
9
# Allow cross-origin use of the API from any site. An `allow` block needs at
# least one `resource` declaration; with only `origins`, no path is covered
# and no CORS headers are sent.
use Rack::Cors do
  allow do
    origins '*'
    resource '*', :headers => :any, :methods => [:get, :post, :options]
  end
end

# Both verbs are served by the same handler; parameters may arrive in the
# query string or in the form body.
get '/:language/:function' do
  run
end

post '/:language/:function' do
  run
end
22
+
23
private

# A JSONP callback must be a plain (optionally dotted) JavaScript identifier.
# Anything else would be echoed verbatim into an executable response.
VE_JSONP_CALLBACK = /\A[a-zA-Z_$][\w$]*(?:\.[a-zA-Z_$][\w$]*)*\z/

# Handles a request for /:language/:function.
#
# Only the `words` function can be serialised at the moment; any other
# function name is answered with 404 instead of an unserialised object.
# The words of params[:text] are returned as a JSON array, wrapped in
# params[:callback] (JSONP) when one is given. An invalid callback name is
# answered with 400.
def run
  # Ve.source = Ve::Local # Default
  # Ve.source = Ve::Remote.new(:url => 'http://ve.kimtaro.com/', :access_token => 'XYZ')
  # result = Ve.get(params[:text], params[:language], params[:function].to_sym)

  # Compare as a string: the function name is user input and should not be
  # turned into a symbol.
  unless params[:function].to_s == 'words'
    content_type 'application/json', :charset => 'utf-8'
    halt 404, JSON.generate({:error => 'Unsupported function'})
  end

  words = Ve.in(params[:language]).words(params[:text])
  json = JSON.generate(words.collect(&:as_json))

  callback = params[:callback]
  return plain_json(json) unless callback

  unless callback =~ VE_JSONP_CALLBACK
    content_type 'application/json', :charset => 'utf-8'
    halt 400, JSON.generate({:error => 'Invalid callback'})
  end

  content_type 'application/javascript', :charset => 'utf-8'
  "#{callback}(#{json})"
end

# Sets the JSON content type and returns +json+ as the response body.
def plain_json(json)
  content_type 'application/json', :charset => 'utf-8'
  json
end
@@ -0,0 +1,135 @@
1
+ # Encoding: UTF-8
2
+
3
+ require_relative 'test_helper'
4
+
5
# Tests for the English FreeLing provider: tokenisation, sentence splitting
# and grouping of tokens into words.
class FreelingEnTest < Test::Unit::TestCase

  def test_should_be_able_to_start
    freeling = Ve::Provider::FreelingEn.new
    assert freeling.works?
  end

  def test_doesnt_die_on_japanese
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse('これは日本語です')
    assert_equal Ve::Parse::FreelingEn, parse.class
  end

  # TODO: UTF-8 handling
  def test_can_handle_utf8
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse('I’m')
    assert_equal ['I\'m'], parse.tokens.collect { |t| t[:literal] }
  end

  def test_can_parse
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse('')
    assert_equal Ve::Parse::FreelingEn, parse.class
  end

  # Joining the literal of every token must reproduce the input exactly,
  # including line breaks and the trailing blank line.
  def test_all_literals_should_equal_the_input_text
    text = <<-EOS
There once was a man from X
Who took it upon himself to Y
Z

    EOS
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse(text)
    assert_equal text, parse.tokens.collect { |t| t[:literal] }.join
  end

  def test_creates_tokens_from_data_that_is_ignored_in_parsing
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse('A B ')
    assert_equal [:parsed, :unparsed, :parsed, :unparsed, :sentence_split], parse.tokens.collect { |t| t[:type] }
    assert_equal ['A', ' ', 'B', ' ', ''], parse.tokens.collect { |t| t[:literal] }
  end

  def test_can_give_sentences
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse('This is a sentence. And this was another one')
    assert_equal ['This is a sentence.', 'And this was another one'], parse.sentences
  end

  def test_can_give_words
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse('This was a sentence.')
    words = parse.words
    tokens = parse.tokens

    assert_equal ['This', 'was', 'a', 'sentence', '.'], words.collect(&:word)
    assert_equal ['this', 'be', 'a', 'sentence', '.'], words.collect(&:lemma)
    assert_equal [Ve::PartOfSpeech::Pronoun, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Determiner, Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Symbol], words.collect(&:part_of_speech)
    assert_equal [{:grammar => :personal}, {:grammar => :past}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}], words.collect(&:extra)

    assert_equal [[tokens[0]], [tokens[2]], [tokens[4]], [tokens[6]], [tokens[7]]], words.collect(&:tokens)
  end

  def test_possessive_endings_must_be_reattached
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse("This is Jane's sentence.")
    words = parse.words
    tokens = parse.tokens

    assert_equal ['This', 'is', "Jane's", 'sentence', '.'], words.collect(&:word)
    assert_equal ['this', 'be', "jane", 'sentence', '.'], words.collect(&:lemma)
    assert_equal [Ve::PartOfSpeech::Pronoun, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Symbol], words.collect(&:part_of_speech)
    # The third entry used the misspelt key :gramamr, which can never equal the
    # :grammar key used by every other expectation in this file.
    assert_equal [{:grammar => :personal}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
    assert_equal [[tokens[0]], [tokens[2]], tokens[4..5], [tokens[7]], [tokens[8]]], words.collect(&:tokens)
  end

  def test_date_parsing
    # Should be turned off. At least for now
    freeling = Ve::Provider::FreelingEn.new

    assert_parses_into_words(freeling,
                             {:words => ['January'],
                              :lemmas => ['january'],
                              :pos => [Ve::PartOfSpeech::Noun],
                              :extra => [{:grammar => nil}],
                              :tokens => [0..0]},
                             'January')
  end

  def test_symbol_parsing
    freeling = Ve::Provider::FreelingEn.new

    assert_parses_into_words(freeling,
                             {:words => ['.', ',', '$'],
                              :lemmas => ['.', ',', '$'],
                              :pos => [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol],
                              :extra => [{:grammar => nil}, {:grammar => nil}, {:grammar => nil}],
                              :tokens => [0..0, 1..1, 2..2]},
                             '.,$')
  end

  def test_can_handle_underscores_properly
    # Should restore them
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse("In New York")
    words = parse.words
    tokens = parse.tokens

    assert_equal ['In', 'New York'], words.collect(&:word)
    assert_equal ['in', 'new york'], words.collect(&:lemma)
    assert_equal [Ve::PartOfSpeech::Preposition, Ve::PartOfSpeech::ProperNoun], words.collect(&:part_of_speech)
    assert_equal [{:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
    assert_equal [tokens[0..0], tokens[2..2]], words.collect(&:tokens)

    # Should keep them
    # TODO
    freeling = Ve::Provider::FreelingEn.new
    parse = freeling.parse("In New_York")
    words = parse.words
    tokens = parse.tokens

    assert_equal ['In', 'New_York'], words.collect(&:word)
    assert_equal ['in', 'new_york'], words.collect(&:lemma)
    assert_equal [Ve::PartOfSpeech::Preposition, Ve::PartOfSpeech::ProperNoun], words.collect(&:part_of_speech)
    # Key was misspelt :grammarl.
    assert_equal [{:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
    # NOTE(review): this expects three token groups although two words are
    # asserted above, so it cannot pass as written. Left unchanged because the
    # intended grouping is not visible here (see the TODO) - confirm and fix.
    assert_equal [tokens[0..1], tokens[2..2], tokens[3..11]], words.collect(&:tokens)
  end

end
@@ -0,0 +1,79 @@
1
+ # Encoding: UTF-8
2
+
3
+ require_relative 'test_helper'
4
+
5
# Tests for the script conversions offered by the Japanese transliterators
# provider (hiragana, katakana, Latin, halfwidth and fullwidth forms).
class JapaneseTransliteratorsTest < Test::Unit::TestCase

  KATAKANA = "ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶ"
  HIRAGANA = "ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ"
  HALFWIDTH = "!\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ "
  # Fullwidth counterpart of HALFWIDTH, character for character: U+FF01..U+FF5E
  # followed by the ideographic space U+3000. These must be the real fullwidth
  # code points; if they are normalised to ASCII the two constants become
  # identical and the width tests below no longer test anything.
  FULLWIDTH = "！＂＃＄％＆＇（）＊＋，－．／０１２３４５６７８９：；＜＝＞？＠ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ［＼］＾＿｀ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ｛｜｝～　"

  def setup
    @trans = Ve::Provider::JapaneseTransliterators.new
  end

  def test_should_be_able_to_start
    assert @trans.works?
  end

  def test_transliterate_from_hira_to_latn
    assert_equal 'kosoado', @trans.parse('こそあど').transliterate_from_hira_to_latn
    assert_equal 'konna', @trans.parse('こんな').transliterate_from_hira_to_latn
    assert_equal 'konyaku', @trans.parse('こにゃく').transliterate_from_hira_to_latn
    assert_equal 'kon\'yaku', @trans.parse('こんやく').transliterate_from_hira_to_latn
    assert_equal 'shinbun', @trans.parse('しんぶん').transliterate_from_hira_to_latn
    assert_equal 'appa', @trans.parse('あっぱ').transliterate_from_hira_to_latn
  end

  # Lower-case Latin input is expected as hiragana, upper-case as katakana.
  def test_transliterate_from_latn_to_hrkt
    assert_equal('かなです', @trans.parse('kanadesu').transliterate_from_latn_to_hrkt)
    assert_equal('こそあど', @trans.parse('kosoado').transliterate_from_latn_to_hrkt)
    assert_equal('こんな', @trans.parse('konna').transliterate_from_latn_to_hrkt)
    assert_equal('しんぶん', @trans.parse('shimbun').transliterate_from_latn_to_hrkt)
    assert_equal('しんぱい', @trans.parse('simpai').transliterate_from_latn_to_hrkt)
    assert_equal('うぁ', @trans.parse('wha').transliterate_from_latn_to_hrkt)
    assert_equal('かっちゃった', @trans.parse('katchatta').transliterate_from_latn_to_hrkt)
    assert_equal('かっわいいぃ', @trans.parse('kawwaiixi').transliterate_from_latn_to_hrkt)
    assert_equal('おっとせい', @trans.parse('ottosei').transliterate_from_latn_to_hrkt)
    assert_equal('あっち', @trans.parse('acchi').transliterate_from_latn_to_hrkt)

    # Katakana
    assert_equal('カナデス', @trans.parse('KANADESU').transliterate_from_latn_to_hrkt)
    assert_equal('コソアド', @trans.parse('KOSOADO').transliterate_from_latn_to_hrkt)
    assert_equal('コンナ', @trans.parse('KONNA').transliterate_from_latn_to_hrkt)
    assert_equal('シンブン', @trans.parse('SHIMBUN').transliterate_from_latn_to_hrkt)
    assert_equal('シンパイ', @trans.parse('SIMPAI').transliterate_from_latn_to_hrkt)
    assert_equal('ウァ', @trans.parse('WHA').transliterate_from_latn_to_hrkt)
    assert_equal('カッチャッタ', @trans.parse('KATCHATTA').transliterate_from_latn_to_hrkt)
    assert_equal('カッワイイィ', @trans.parse('KAWWAIIXI').transliterate_from_latn_to_hrkt)
    assert_equal('オットセイ', @trans.parse('OTTOSEI').transliterate_from_latn_to_hrkt)
    assert_equal('アッチ', @trans.parse('ACCHI').transliterate_from_latn_to_hrkt)
    assert_equal('カタカナ です', @trans.parse('KATAKANA desu').transliterate_from_latn_to_hrkt)

    # Non-Japanese
    assert_equal('てぃs いs そめ えんgりsh', @trans.parse('this is some english').transliterate_from_latn_to_hrkt)
  end

  def test_transliterate_from_hira_to_kana
    assert_equal KATAKANA, @trans.parse(HIRAGANA).transliterate_from_hira_to_kana
  end

  # Was named ..._from_kata_to_hina; renamed to match the method under test.
  def test_transliterate_from_kana_to_hira
    assert_equal HIRAGANA, @trans.parse(KATAKANA).transliterate_from_kana_to_hira
  end

  def test_transliterate_from_hrkt_to_latn
    assert_equal 'hiraganakatakana', @trans.parse('ひらがなカタカナ').transliterate_from_hrkt_to_latn
  end

  def test_transliterate_from_fullwidth_to_halfwidth
    assert_equal HALFWIDTH, @trans.parse(FULLWIDTH).transliterate_from_fullwidth_to_halfwidth
  end

  def test_transliterate_from_halfwidth_to_fullwidth
    assert_equal FULLWIDTH, @trans.parse(HALFWIDTH).transliterate_from_halfwidth_to_fullwidth
  end

end
@@ -0,0 +1,452 @@
1
+ # Encoding: UTF-8
2
+
3
+ require_relative 'test_helper'
4
+
5
# Tests for the Japanese MeCab/IPADIC provider: tokenisation, sentence
# splitting and the grouping of MeCab tokens into words.
class MecabIpadicTest < Test::Unit::TestCase

  def test_should_be_able_to_start
    mecab = Ve::Provider::MecabIpadic.new
    assert mecab.works?
  end

  def test_can_parse
    mecab = Ve::Provider::MecabIpadic.new
    parse = mecab.parse('')
    assert_equal Ve::Parse::MecabIpadic, parse.class
  end

  # Joining the literal of every token must reproduce the input exactly,
  # including line breaks and the trailing blank line.
  def test_all_literals_should_equal_the_input_text
    text = <<-EOS
古池や
蛙飛び込む
水の音

    EOS
    mecab = Ve::Provider::MecabIpadic.new
    parse = mecab.parse(text)
    assert_equal text, parse.tokens.collect { |t| t[:literal] }.join
  end

  # The input is one space, 'A', three spaces, 'B', two spaces. The widths of
  # the whitespace runs matter: they are what the expected character ranges
  # (2..4 and 6..7) describe.
  def test_tokens_must_be_created_for_parsed_and_unparsed_text
    mecab = Ve::Provider::MecabIpadic.new
    parse = mecab.parse(' A   B  ')
    assert_equal [:unparsed, :parsed, :unparsed, :parsed, :unparsed, :sentence_split], parse.tokens.collect { |t| t[:type] }
    assert_equal [' ', 'A', '   ', 'B', '  ', ''], parse.tokens.collect { |t| t[:literal] }
    assert_equal [0..0, 1..1, 2..4, 5..5, 6..7, nil], parse.tokens.collect { |t| t[:characters] }
  end

  def test_tokens_should_not_be_modified_when_attached_to_words
    mecab = Ve::Provider::MecabIpadic.new
    parse = mecab.parse('悪化する')
    tokens = parse.tokens
    assert_equal '悪化', tokens[0][:literal]
    assert_equal '悪化', tokens[0][:lemma]
  end

  def test_sentences
    mecab = Ve::Provider::MecabIpadic.new
    parse = mecab.parse('これは文章である。で、also containing some Englishですね')
    assert_equal ['これは文章である。', 'で、also containing some Englishですね'], parse.sentences
  end

  def test_this_shouldnt_crash
    mecab = Ve::Provider::MecabIpadic.new
    parse = mecab.parse('チューたろうは田中さんの犬です。')
    assert_equal 11, parse.tokens.size
  end

  def test_this_shouldnt_crash_either
    mecab = Ve::Provider::MecabIpadic.new
    parse = mecab.parse('三十年式歩兵銃')
    assert_equal 7, parse.tokens.size
  end

  # One case per IPADIC part-of-speech category. Each case lists the expected
  # words, lemmas, parts of speech, extra data and token ranges for the text
  # given as the last argument.
  def test_words
    mecab = Ve::Provider::MecabIpadic.new

    # Meishi
    assert_parses_into_words(mecab, {:words => ['車'],
                                     :lemmas => ['車'],
                                     :pos => [Ve::PartOfSpeech::Noun],
                                     :extra => [{:reading => 'クルマ', :transcription => 'クルマ', :grammar => nil}],
                                     :tokens => [0..0]},
                             '車')

    # Koyuumeishi
    assert_parses_into_words(mecab, {:words => ['太郎'],
                                     :lemmas => ['太郎'],
                                     :pos => [Ve::PartOfSpeech::ProperNoun],
                                     :extra => [{:reading => 'タロウ', :transcription => 'タロー', :grammar => nil}],
                                     :tokens => [0..0]},
                             '太郎')

    # Daimeishi
    assert_parses_into_words(mecab, {:words => ['彼'],
                                     :lemmas => ['彼'],
                                     :pos => [Ve::PartOfSpeech::Pronoun],
                                     :extra => [{:reading => 'カレ', :transcription => 'カレ', :grammar => nil}],
                                     :tokens => [0..0]},
                             '彼')

    # Fukushikanou
    assert_parses_into_words(mecab, {:words => ['午後に'],
                                     :lemmas => ['午後に'],
                                     :pos => [Ve::PartOfSpeech::Adverb],
                                     :extra => [{:reading => 'ゴゴニ', :transcription => 'ゴゴニ', :grammar => nil}],
                                     :tokens => [0..1]},
                             '午後に')

    # Kazu
    assert_parses_into_words(mecab, {:words => ['一'],
                                     :lemmas => ['一'],
                                     :pos => [Ve::PartOfSpeech::Number],
                                     :extra => [{:reading => 'イチ', :transcription => 'イチ', :grammar => nil}],
                                     :tokens => [0..0]},
                             '一')

    assert_parses_into_words(mecab, {:words => ['123'],
                                     :lemmas => ['123'],
                                     :pos => [Ve::PartOfSpeech::Number],
                                     :extra => [{:reading => 'イチニサン', :transcription => 'イチニサン', :grammar => nil}],
                                     :tokens => [0..2]},
                             '123')

    # Sahensetsuzoku + tokumi ta
    assert_parses_into_words(mecab, {:words => ['悪化した'],
                                     :lemmas => ['悪化する'],
                                     :pos => [Ve::PartOfSpeech::Verb],
                                     :extra => [{:reading => 'アッカシタ', :transcription => 'アッカシタ', :grammar => nil}],
                                     :tokens => [0..2]},
                             '悪化した')

    # Keiyoudoushigokan
    assert_parses_into_words(mecab, {:words => ['重要な'],
                                     :lemmas => ['重要'],
                                     :pos => [Ve::PartOfSpeech::Adjective],
                                     :extra => [{:reading => 'ジュウヨウナ', :transcription => 'ジューヨーナ', :grammar => nil}],
                                     :tokens => [0..1]},
                             '重要な')

    # Naikeiyoushigokan
    assert_parses_into_words(mecab, {:words => ['とんでもない'],
                                     :lemmas => ['とんでもない'],
                                     :pos => [Ve::PartOfSpeech::Adjective],
                                     :extra => [{:reading => 'トンデモナイ', :transcription => 'トンデモナイ', :grammar => nil}],
                                     :tokens => [0..1]},
                             'とんでもない')

    # Meishi hijiritsu fukushikanou
    assert_parses_into_words(mecab, {:words => ['の', 'うちに'],
                                     :lemmas => ['の', 'うちに'],
                                     :pos => [Ve::PartOfSpeech::Postposition, Ve::PartOfSpeech::Adverb],
                                     :extra => [{:reading => 'ノ', :transcription => 'ノ', :grammar => nil},
                                                {:reading => 'ウチニ', :transcription => 'ウチニ', :grammar => nil}],
                                     :tokens => [0..0, 1..2]},
                             'のうちに')

    # Meishi hijiritsu jodoushigokan
    assert_parses_into_words(mecab, {:words => ['の', 'ような'],
                                     :lemmas => ['の', 'ようだ'],
                                     :pos => [Ve::PartOfSpeech::Postposition, Ve::PartOfSpeech::Verb],
                                     :extra => [{:reading => 'ノ', :transcription => 'ノ', :grammar => nil},
                                                {:reading => 'ヨウナ', :transcription => 'ヨーナ', :grammar => :auxillary}],
                                     :tokens => [0..0, 1..2]},
                             'のような')

    assert_parses_into_words(mecab, {:words => ['の', 'ように'],
                                     :lemmas => ['の', 'ように'],
                                     :pos => [Ve::PartOfSpeech::Postposition, Ve::PartOfSpeech::Adverb],
                                     :extra => [{:reading => 'ノ', :transcription => 'ノ', :grammar => nil},
                                                {:reading => 'ヨウニ', :transcription => 'ヨーニ', :grammar => nil}],
                                     :tokens => [0..0, 1..2]},
                             'のように')

    # Meishi hijiritsu keiyoudoushigokan
    assert_parses_into_words(mecab, {:words => ['みたいな'],
                                     :lemmas => ['みたいだ'],
                                     :pos => [Ve::PartOfSpeech::Adjective],
                                     :extra => [{:reading => 'ミタイナ', :transcription => 'ミタイナ', :grammar => nil}],
                                     :tokens => [0..1]},
                             'みたいな')

    assert_parses_into_words(mecab, {:words => ['みたいの'],
                                     :lemmas => ['みたいの'],
                                     :pos => [Ve::PartOfSpeech::Adjective],
                                     :extra => [{:reading => 'ミタイノ', :transcription => 'ミタイノ', :grammar => nil}],
                                     :tokens => [0..1]},
                             'みたいの')

    assert_parses_into_words(mecab, {:words => ['みたい', 'だ'],
                                     :lemmas => ['みたい', 'だ'],
                                     :pos => [Ve::PartOfSpeech::Adjective, Ve::PartOfSpeech::Verb],
                                     :extra => [{:reading => 'ミタイ', :transcription => 'ミタイ', :grammar => nil},
                                                {:reading => 'ダ', :transcription => 'ダ', :grammar => nil}],
                                     :tokens => [0..0, 1..1]},
                             'みたいだ')

    # Meishi tokushu jodoushigokan
    assert_parses_into_words(mecab, {:words => ['行く', 'そう', 'だ'],
                                     :lemmas => ['行く', 'そう', 'だ'],
                                     :pos => [Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Verb],
                                     :extra => [{:reading => 'イク', :transcription => 'イク', :grammar => nil},
                                                {:reading => 'ソウ', :transcription => 'ソー', :grammar => :auxillary},
                                                {:reading => 'ダ', :transcription => 'ダ', :grammar => nil}],
                                     :tokens => [0..0, 1..1, 2..2]},
                             '行くそうだ')

    # Meishi setsubi
    # TODO: This should maybe be parsed as one noun instead
    assert_parses_into_words(mecab, {:words => ['楽し', 'さ'],
                                     :lemmas => ['楽しい', 'さ'],
                                     :pos => [Ve::PartOfSpeech::Adjective, Ve::PartOfSpeech::Suffix],
                                     :extra => [{:reading => 'タノシ', :transcription => 'タノシ', :grammar => nil},
                                                {:reading => 'サ', :transcription => 'サ', :grammar => nil}],
                                     :tokens => [0..0, 1..1]},
                             '楽しさ')

    # Meishi setsuzokushiteki
    assert_parses_into_words(mecab, {:words => ['日本', '対', 'アメリカ'],
                                     :lemmas => ['日本', '対', 'アメリカ'],
                                     :pos => [Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Conjunction, Ve::PartOfSpeech::ProperNoun],
                                     :extra => [{:reading => 'ニッポン', :transcription => 'ニッポン', :grammar => nil},
                                                {:reading => 'タイ', :transcription => 'タイ', :grammar => nil},
                                                {:reading => 'アメリカ', :transcription => 'アメリカ', :grammar => nil}],
                                     :tokens => [0..0, 1..1, 2..2]},
                             '日本対アメリカ')

    # Meishi doushihijiritsuteki
    assert_parses_into_words(mecab, {:words => ['見て', 'ごらん'],
                                     :lemmas => ['見る', 'ごらん'],
                                     :pos => [Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Verb],
                                     :extra => [{:reading => 'ミテ', :transcription => 'ミテ', :grammar => nil},
                                                {:reading => 'ゴラン', :transcription => 'ゴラン', :grammar => :nominal}],
                                     :tokens => [0..1, 2..2]},
                             '見てごらん')

    # Settoushi
    assert_parses_into_words(mecab, {:words => ['お', '座り'],
                                     :lemmas => ['お', '座り'],
                                     :pos => [Ve::PartOfSpeech::Prefix, Ve::PartOfSpeech::Noun],
                                     :extra => [{:reading => 'オ', :transcription => 'オ', :grammar => nil},
                                                {:reading => 'スワリ', :transcription => 'スワリ', :grammar => nil}],
                                     :tokens => [0..0, 1..1]},
                             'お座り')

    # Kigou
    assert_parses_into_words(mecab, {:words => ['。'],
                                     :lemmas => ['。'],
                                     :pos => [Ve::PartOfSpeech::Symbol],
                                     :extra => [{:reading => '。', :transcription => '。', :grammar => nil}],
                                     :tokens => [0..0]},
                             '。')

    # Firaa
    assert_parses_into_words(mecab, {:words => ['えと'],
                                     :lemmas => ['えと'],
                                     :pos => [Ve::PartOfSpeech::Interjection],
                                     :extra => [{:reading => 'エト', :transcription => 'エト', :grammar => nil}],
                                     :tokens => [0..0]},
                             'えと')

    # Sonota
    assert_parses_into_words(mecab, {:words => ['だ', 'ァ'],
                                     :lemmas => ['だ', 'ァ'],
                                     :pos => [Ve::PartOfSpeech::Postposition, Ve::PartOfSpeech::Other],
                                     :extra => [{:reading => 'ダ', :transcription => 'ダ', :grammar => nil},
                                                {:reading => 'ァ', :transcription => 'ア', :grammar => nil}],
                                     :tokens => [0..0, 1..1]},
                             'だァ')

    # Kandoushi
    assert_parses_into_words(mecab, {:words => ['おはよう'],
                                     :lemmas => ['おはよう'],
                                     :pos => [Ve::PartOfSpeech::Interjection],
                                     :extra => [{:reading => 'オハヨウ', :transcription => 'オハヨー', :grammar => nil}],
                                     :tokens => [0..0]},
                             'おはよう')

    # Rentaishi
    assert_parses_into_words(mecab, {:words => ['この'],
                                     :lemmas => ['この'],
                                     :pos => [Ve::PartOfSpeech::Determiner],
                                     :extra => [{:reading => 'コノ', :transcription => 'コノ', :grammar => nil}],
                                     :tokens => [0..0]},
                             'この')

    # Setsuzokushi
    assert_parses_into_words(mecab, {:words => ['そして'],
                                     :lemmas => ['そして'],
                                     :pos => [Ve::PartOfSpeech::Conjunction],
                                     :extra => [{:reading => 'ソシテ', :transcription => 'ソシテ', :grammar => nil}],
                                     :tokens => [0..0]},
                             'そして')

    # Fukushi
    assert_parses_into_words(mecab, {:words => ['多分'],
                                     :lemmas => ['多分'],
                                     :pos => [Ve::PartOfSpeech::Adverb],
                                     :extra => [{:reading => 'タブン', :transcription => 'タブン', :grammar => nil}],
                                     :tokens => [0..0]},
                             '多分')

    # Doushi
    assert_parses_into_words(mecab, {:words => ['行く'],
                                     :lemmas => ['行く'],
                                     :pos => [Ve::PartOfSpeech::Verb],
                                     :extra => [{:reading => 'イク', :transcription => 'イク', :grammar => nil}],
                                     :tokens => [0..0]},
                             '行く')

    assert_parses_into_words(mecab, {:words => ['行かない'],
                                     :lemmas => ['行く'],
                                     :pos => [Ve::PartOfSpeech::Verb],
                                     :extra => [{:reading => 'イカナイ', :transcription => 'イカナイ', :grammar => nil}],
                                     :tokens => [0..1]},
                             '行かない')

    assert_parses_into_words(mecab, {:words => ['行って', 'きて'],
                                     :lemmas => ['行く', 'くる'],
                                     :pos => [Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Verb],
                                     :extra => [{:reading => 'イッテ', :transcription => 'イッテ', :grammar => nil},
                                                {:reading => 'キテ', :transcription => 'キテ', :grammar => :auxillary}],
                                     :tokens => [0..1, 2..3]},
                             '行ってきて')

    # Doushi setsubi
    assert_parses_into_words(mecab, {:words => ['行かれる'],
                                     :lemmas => ['行く'],
                                     :pos => [Ve::PartOfSpeech::Verb],
                                     :extra => [{:reading => 'イカレル', :transcription => 'イカレル', :grammar => nil}],
                                     :tokens => [0..1]},
                             '行かれる')

    assert_parses_into_words(mecab, {:words => ['食べさせられた'],
                                     :lemmas => ['食べる'],
                                     :pos => [Ve::PartOfSpeech::Verb],
                                     :extra => [{:reading => 'タベサセラレタ', :transcription => 'タベサセラレタ', :grammar => nil}],
                                     :tokens => [0..3]},
                             '食べさせられた')

    # Doushi + jodoushi
    assert_parses_into_words(mecab, {:words => ['食べました'],
                                     :lemmas => ['食べる'],
                                     :pos => [Ve::PartOfSpeech::Verb],
                                     :extra => [{:reading => 'タベマシタ', :transcription => 'タベマシタ', :grammar => nil}],
                                     :tokens => [0..2]},
                             '食べました')

    # Keiyoushi
    assert_parses_into_words(mecab, {:words => ['寒い'],
                                     :lemmas => ['寒い'],
                                     :pos => [Ve::PartOfSpeech::Adjective],
                                     :extra => [{:reading => 'サムイ', :transcription => 'サムイ', :grammar => nil}],
                                     :tokens => [0..0]},
                             '寒い')

    assert_parses_into_words(mecab, {:words => ['寒くて'],
                                     :lemmas => ['寒い'],
                                     :pos => [Ve::PartOfSpeech::Adjective],
                                     :extra => [{:reading => 'サムクテ', :transcription => 'サムクテ', :grammar => nil}],
                                     :tokens => [0..1]},
                             '寒くて')

    assert_parses_into_words(mecab, {:words => ['寒かった'],
                                     :lemmas => ['寒い'],
                                     :pos => [Ve::PartOfSpeech::Adjective],
                                     :extra => [{:reading => 'サムカッタ', :transcription => 'サムカッタ', :grammar => nil}],
                                     :tokens => [0..1]},
                             '寒かった')

    assert_parses_into_words(mecab, {:words => ['寒ければ'],
                                     :lemmas => ['寒い'],
                                     :pos => [Ve::PartOfSpeech::Adjective],
                                     :extra => [{:reading => 'サムケレバ', :transcription => 'サムケレバ', :grammar => nil}],
                                     :tokens => [0..1]},
                             '寒ければ')

    assert_parses_into_words(mecab, {:words => ['寒けりゃ'],
                                     :lemmas => ['寒い'],
                                     :pos => [Ve::PartOfSpeech::Adjective],
                                     :extra => [{:reading => 'サムケリャ', :transcription => 'サムケリャ', :grammar => nil}],
                                     :tokens => [0..0]},
                             '寒けりゃ')

    assert_parses_into_words(mecab, {:words => ['食べたい'],
                                     :lemmas => ['食べる'],
                                     :pos => [Ve::PartOfSpeech::Verb],
                                     :extra => [{:reading => 'タベタイ', :transcription => 'タベタイ', :grammar => nil}],
                                     :tokens => [0..1]},
                             '食べたい')

    # Joshi
    assert_parses_into_words(mecab, {:words => ['日本', 'から'],
                                     :lemmas => ['日本', 'から'],
                                     :pos => [Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Postposition],
                                     :extra => [{:reading => 'ニッポン', :transcription => 'ニッポン', :grammar => nil},
                                                {:reading => 'カラ', :transcription => 'カラ', :grammar => nil}],
                                     :tokens => [0..0, 1..1]},
                             '日本から')

    # The copula
    assert_parses_into_words(mecab, {:words => ['日本', 'です'],
                                     :lemmas => ['日本', 'です'],
                                     :pos => [Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Verb],
                                     :extra => [{:reading => 'ニッポン', :transcription => 'ニッポン', :grammar => nil},
                                                {:reading => 'デス', :transcription => 'デス', :grammar => nil}],
                                     :tokens => [0..0, 1..1]},
                             '日本です')

    assert_parses_into_words(mecab, {:words => ['日本', 'だった'],
                                     :lemmas => ['日本', 'だ'],
                                     :pos => [Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Verb],
                                     :extra => [{:reading => 'ニッポン', :transcription => 'ニッポン', :grammar => nil},
                                                {:reading => 'ダッタ', :transcription => 'ダッタ', :grammar => nil}],
                                     :tokens => [0..0, 1..2]},
                             '日本だった')

    # いるから
    assert_parses_into_words(mecab, {:words => ['いる', 'から'],
                                     :lemmas => ['いる', 'から'],
                                     :pos => [Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Postposition],
                                     :extra => [{:reading => 'イル', :transcription => 'イル', :grammar => nil},
                                                {:reading => 'カラ', :transcription => 'カラ', :grammar => nil}],
                                     :tokens => [0..0, 1..1]},
                             'いるから')

    # しているから
    assert_parses_into_words(mecab, {:words => ['して', 'いる', 'から'],
                                     :lemmas => ['する', 'いる', 'から'],
                                     :pos => [Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Postposition],
                                     :extra => [{:reading => 'シテ', :transcription => 'シテ', :grammar => nil},
                                                {:reading => 'イル', :transcription => 'イル', :grammar => :auxillary},
                                                {:reading => 'カラ', :transcription => 'カラ', :grammar => nil}],
                                     :tokens => [0..0, 1..1, 2..2]},
                             'しているから')

    # 基準があるが、
    # The expectations for this case were a verbatim copy of the しているから
    # case above and could never match this text.
    # NOTE(review): the values below are reconstructed from the sentence
    # itself (noun, particle, verb, particle, comma) - confirm them against
    # actual MeCab/IPADIC output before relying on this case.
    assert_parses_into_words(mecab, {:words => ['基準', 'が', 'ある', 'が', '、'],
                                     :lemmas => ['基準', 'が', 'ある', 'が', '、'],
                                     :pos => [Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Postposition, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::Postposition, Ve::PartOfSpeech::Symbol],
                                     :extra => [{:reading => 'キジュン', :transcription => 'キジュン', :grammar => nil},
                                                {:reading => 'ガ', :transcription => 'ガ', :grammar => nil},
                                                {:reading => 'アル', :transcription => 'アル', :grammar => nil},
                                                {:reading => 'ガ', :transcription => 'ガ', :grammar => nil},
                                                {:reading => '、', :transcription => '、', :grammar => nil}],
                                     :tokens => [0..0, 1..1, 2..2, 3..3, 4..4]},
                             '基準があるが、')

    # TODO: xした should parse as adjective?
    assert_parses_into_words(mecab, {:words => [],
                                     :lemmas => [],
                                     :pos => [],
                                     :extra => [],
                                     :tokens => []},
                             '')
  end

  # Not run: the name does not start with test_. Kept as a sketch of the
  # planned word transliteration API.
  def todo_test_word_transliteration
    mecab = Ve::Provider::MecabIpadic.new
    parse = mecab.parse('日本', :transliterate_words => :latn)

    assert_equal 'nihon', parse.words.first.transliteration(:latn)
  end

end