ve 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.travis.yml +9 -0
- data/Gemfile +5 -0
- data/Gemfile.lock +6 -0
- data/lib/providers/freeling_en.rb +4 -2
- data/lib/providers/mecab_ipadic.rb +52 -32
- data/lib/ve.rb +15 -4
- data/tests/{freeling_en_test.rb → freeling_en_parse_test.rb} +37 -48
- data/tests/freeling_en_provider_test.rb +38 -0
- data/tests/japanese_transliterators_test.rb +1 -1
- data/tests/mecab_ipadic_parse_test.rb +772 -0
- data/tests/mecab_ipadic_provider_test.rb +21 -0
- data/tests/test_helper.rb +5 -4
- data/tests/ve_test.rb +5 -1
- data/ve.gemspec +1 -1
- metadata +27 -35
- data/tests/mecab_ipadic_test.rb +0 -452
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 7667f10a89f699b284d7a412bab815d46e9bf26d
|
4
|
+
data.tar.gz: 7700e9a46ee0321b746ce23c806df91fb54b7252
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fa87fa761966cc70ec3edf7dad6b4ef36404a8adc14314f88aa7bd91b34784a98de198d699f6fe8650ace92fb45b9be7c5ac6e23d35b5f1fcf9a138d75d647b0
|
7
|
+
data.tar.gz: 78e90e9c7af44b26bebd04bd4f44e002b392438bfe7422950f473d01e99840e5e26f6ca0887a17f76f2d7211f49d44fee2ce7e86ed832824f0d01d5109a6cea4
|
data/.travis.yml
ADDED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -2,11 +2,15 @@ GEM
|
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
4
|
json (1.6.1)
|
5
|
+
metaclass (0.0.1)
|
6
|
+
mocha (0.11.4)
|
7
|
+
metaclass (~> 0.0.1)
|
5
8
|
rack (1.3.5)
|
6
9
|
rack-cors (0.2.4)
|
7
10
|
rack
|
8
11
|
rack-protection (1.1.4)
|
9
12
|
rack
|
13
|
+
rake (0.8.7)
|
10
14
|
sinatra (1.3.1)
|
11
15
|
rack (~> 1.3, >= 1.3.4)
|
12
16
|
rack-protection (~> 1.1, >= 1.1.2)
|
@@ -18,5 +22,7 @@ PLATFORMS
|
|
18
22
|
|
19
23
|
DEPENDENCIES
|
20
24
|
json
|
25
|
+
mocha
|
21
26
|
rack-cors
|
27
|
+
rake
|
22
28
|
sinatra
|
@@ -8,7 +8,7 @@ require 'open3'
|
|
8
8
|
class Ve
|
9
9
|
class Provider
|
10
10
|
class FreelingEn < Ve::Provider
|
11
|
-
|
11
|
+
# FIX: This class isn't tested
|
12
12
|
BIT_STOP = 'VeEnd'
|
13
13
|
|
14
14
|
# TODO: Automatically set FREELINGSHARE if it's not set?
|
@@ -27,7 +27,8 @@ class Ve
|
|
27
27
|
# Interface methods
|
28
28
|
|
29
29
|
def works?
|
30
|
-
|
30
|
+
p = parse('Wrote')
|
31
|
+
["Wrote write VBD 1", ""] == p.tokens.collect { |t| t[:raw] }
|
31
32
|
end
|
32
33
|
|
33
34
|
# Talks to the app and returns a parse object
|
@@ -41,6 +42,7 @@ class Ve
|
|
41
42
|
output = []
|
42
43
|
|
43
44
|
while line = @stdout.readline
|
45
|
+
puts line
|
44
46
|
if line =~ /#{BIT_STOP}/x
|
45
47
|
@stdout.readline
|
46
48
|
break
|
@@ -7,31 +7,31 @@ class Ve
|
|
7
7
|
class MecabIpadic < Ve::Provider
|
8
8
|
|
9
9
|
BIT_STOP = 'VeEnd'
|
10
|
-
|
10
|
+
|
11
11
|
def initialize(config = {})
|
12
12
|
# TODO: Make config handling better
|
13
13
|
@config = {:app => 'mecab',
|
14
14
|
:path => '',
|
15
15
|
:flags => ''}.merge(config)
|
16
|
-
|
17
|
-
@config[:app] = `which #{@config[:app]}
|
18
|
-
|
16
|
+
|
17
|
+
@config[:app] = `which #{@config[:app]}`.chomp
|
18
|
+
|
19
19
|
start!
|
20
20
|
end
|
21
|
-
|
21
|
+
|
22
22
|
def works?
|
23
23
|
(["だっ\t助動詞,*,*,*,特殊・ダ,連用タ接続,だ,ダッ,ダッ",
|
24
24
|
"た\t助動詞,*,*,*,特殊・タ,基本形,た,タ,タ",
|
25
25
|
"EOS"] == parse('だった').tokens.collect { |t| t[:raw] } )
|
26
26
|
end
|
27
|
-
|
27
|
+
|
28
28
|
# Talks to the app and returns a parse object
|
29
29
|
def parse(text, options = {})
|
30
30
|
start! if @stdin.nil? # Restart if the provider crashed
|
31
|
-
|
31
|
+
|
32
32
|
@stdin.puts "#{text} #{BIT_STOP}"
|
33
33
|
output = []
|
34
|
-
|
34
|
+
|
35
35
|
while line = @stdout.readline.force_encoding('UTF-8')
|
36
36
|
if line =~ /#{BIT_STOP}/x
|
37
37
|
output << @stdout.readline # Catch the EOS
|
@@ -39,25 +39,25 @@ class Ve
|
|
39
39
|
end
|
40
40
|
output << line
|
41
41
|
end
|
42
|
-
|
42
|
+
|
43
43
|
Ve::Parse::MecabIpadic.new(text, output)
|
44
|
-
rescue
|
44
|
+
rescue => e
|
45
45
|
# TODO: No good to catch all errors like this
|
46
46
|
# I need a backtrace when something unexpected fails
|
47
47
|
Ve::Parse::MecabIpadic.new(text, [])
|
48
48
|
end
|
49
49
|
|
50
50
|
private
|
51
|
-
|
51
|
+
|
52
52
|
# TODO: Use Process.spawn/kill for process control?
|
53
53
|
def start!
|
54
|
-
@stdin, @stdout, @stderr = Open3.popen3(@config[:app])
|
54
|
+
@stdin, @stdout, @stderr = Open3.popen3("#{@config[:app]} #{@config[:flags]}")
|
55
55
|
@stdin.set_encoding('UTF-8')
|
56
56
|
@stdout.set_encoding('UTF-8')
|
57
|
-
rescue Errno::ENOENT
|
57
|
+
rescue Errno::ENOENT => e
|
58
58
|
# The parser couldn't be started. Probably not installed on this system
|
59
59
|
end
|
60
|
-
|
60
|
+
|
61
61
|
end
|
62
62
|
end
|
63
63
|
end
|
@@ -65,15 +65,15 @@ end
|
|
65
65
|
class Ve
|
66
66
|
class Parse
|
67
67
|
class MecabIpadic < Ve::Parse
|
68
|
-
|
68
|
+
|
69
69
|
PARSER = %r{^ (.+?) \t (.+) }x
|
70
70
|
attr_reader :tokens, :text
|
71
|
-
|
71
|
+
|
72
72
|
def initialize(text, output)
|
73
73
|
@tokens = []
|
74
74
|
@text = text
|
75
75
|
position = 0
|
76
|
-
|
76
|
+
|
77
77
|
output.each_with_index do |line, index|
|
78
78
|
line.rstrip!
|
79
79
|
token = {:raw => line}
|
@@ -87,7 +87,7 @@ class Ve
|
|
87
87
|
@tokens << unparsed_token
|
88
88
|
end
|
89
89
|
end
|
90
|
-
|
90
|
+
|
91
91
|
if line =~ %r{^ EOS $}x
|
92
92
|
token[:type] = :sentence_split
|
93
93
|
token[:literal] = ''
|
@@ -99,7 +99,7 @@ class Ve
|
|
99
99
|
[:pos, :pos2, :pos3, :pos4, :inflection_type, :inflection_form, :lemma, :reading, :hatsuon].each_with_index do |attr, i|
|
100
100
|
token[attr] = info[i]
|
101
101
|
end
|
102
|
-
|
102
|
+
|
103
103
|
# Anything unparsed preceding this token
|
104
104
|
unparsed_md = %r{(.*?) #{Regexp.quote(token[:literal])}}mx.match(text, position)
|
105
105
|
if unparsed_md[1].length > 0
|
@@ -108,7 +108,7 @@ class Ve
|
|
108
108
|
@tokens << unparsed_token
|
109
109
|
position += unparsed_token[:literal].length
|
110
110
|
end
|
111
|
-
|
111
|
+
|
112
112
|
token[:characters] = (position..(position+token[:literal].length-1))
|
113
113
|
position += token[:literal].length
|
114
114
|
else
|
@@ -118,7 +118,7 @@ class Ve
|
|
118
118
|
@tokens << token
|
119
119
|
end
|
120
120
|
end
|
121
|
-
|
121
|
+
|
122
122
|
# PoS
|
123
123
|
MEISHI = '名詞'
|
124
124
|
KOYUUMEISHI = '固有名詞'
|
@@ -159,6 +159,11 @@ class Ve
|
|
159
159
|
TOKUSHU_DESU = '特殊・デス'
|
160
160
|
TOKUSHU_DA = '特殊・ダ'
|
161
161
|
TOKUSHU_MASU = '特殊・マス'
|
162
|
+
TOKUSHU_NU = '特殊・ヌ'
|
163
|
+
FUHENKAGATA = '不変化型'
|
164
|
+
JINMEI = '人名'
|
165
|
+
MEIREI_I = '命令i'
|
166
|
+
KAKARIJOSHI = '係助詞'
|
162
167
|
|
163
168
|
# Etc
|
164
169
|
NA = 'な'
|
@@ -166,11 +171,14 @@ class Ve
|
|
166
171
|
TE = 'て'
|
167
172
|
DE = 'で'
|
168
173
|
BA = 'ば'
|
174
|
+
NN = 'ん'
|
175
|
+
SA = 'さ'
|
169
176
|
|
170
177
|
def words
|
171
178
|
words = []
|
172
179
|
tokens = @tokens.find_all { |t| t[:type] == :parsed }
|
173
180
|
tokens = tokens.to_enum
|
181
|
+
previous = nil
|
174
182
|
|
175
183
|
# This is becoming very big
|
176
184
|
begin
|
@@ -181,6 +189,7 @@ class Ve
|
|
181
189
|
eat_lemma = true
|
182
190
|
attach_to_previous = false
|
183
191
|
also_attach_to_lemma = false
|
192
|
+
update_pos = false
|
184
193
|
|
185
194
|
case token[:pos]
|
186
195
|
when MEISHI
|
@@ -208,7 +217,7 @@ class Ve
|
|
208
217
|
eat_next = true
|
209
218
|
elsif following[:pos] == JOSHI && following[:literal] == NI
|
210
219
|
pos = Ve::PartOfSpeech::Adverb
|
211
|
-
eat_next =
|
220
|
+
eat_next = false
|
212
221
|
end
|
213
222
|
end
|
214
223
|
when HIJIRITSU, TOKUSHU
|
@@ -246,8 +255,13 @@ class Ve
|
|
246
255
|
also_attach_to_lemma = true
|
247
256
|
end
|
248
257
|
when SETSUBI
|
249
|
-
|
250
|
-
|
258
|
+
if token[:pos3] == TOKUSHU && token[:lemma] == SA
|
259
|
+
attach_to_previous = true
|
260
|
+
update_pos = true
|
261
|
+
pos = Ve::PartOfSpeech::Noun
|
262
|
+
else
|
263
|
+
pos = Ve::PartOfSpeech::Suffix
|
264
|
+
end
|
251
265
|
when SETSUZOKUSHITEKI
|
252
266
|
pos = Ve::PartOfSpeech::Conjunction
|
253
267
|
when DOUSHIHIJIRITSUTEKI
|
@@ -260,7 +274,10 @@ class Ve
|
|
260
274
|
when JODOUSHI
|
261
275
|
pos = Ve::PartOfSpeech::Postposition
|
262
276
|
|
263
|
-
if
|
277
|
+
if (previous.nil? || (!previous.nil? && previous[:pos2] != KAKARIJOSHI)) &&
|
278
|
+
[TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU, TOKUSHU_NU].include?(token[:inflection_type])
|
279
|
+
attach_to_previous = true
|
280
|
+
elsif token[:inflection_type] == FUHENKAGATA && token[:lemma] == NN
|
264
281
|
attach_to_previous = true
|
265
282
|
elsif (token[:inflection_type] == TOKUSHU_DA || token[:inflection_type] == TOKUSHU_DESU) && token[:literal] != NA
|
266
283
|
pos = Ve::PartOfSpeech::Verb
|
@@ -269,8 +286,8 @@ class Ve
|
|
269
286
|
pos = Ve::PartOfSpeech::Verb
|
270
287
|
if token[:pos2] == SETSUBI
|
271
288
|
attach_to_previous = true
|
272
|
-
elsif token[:pos2] == HIJIRITSU
|
273
|
-
|
289
|
+
elsif token[:pos2] == HIJIRITSU && token[:inflection_form] != MEIREI_I
|
290
|
+
attach_to_previous = true
|
274
291
|
end
|
275
292
|
when KEIYOUSHI
|
276
293
|
pos = Ve::PartOfSpeech::Adjective
|
@@ -301,6 +318,7 @@ class Ve
|
|
301
318
|
words[-1].extra[:reading] << (token[:reading] || '')
|
302
319
|
words[-1].extra[:transcription] << (token[:hatsuon] || '')
|
303
320
|
words[-1].lemma << token[:lemma] if also_attach_to_lemma
|
321
|
+
words[-1].part_of_speech = pos if update_pos
|
304
322
|
else
|
305
323
|
pos = Ve::PartOfSpeech::TBD if pos.nil?
|
306
324
|
word = Ve::Word.new(token[:literal], token[:lemma], pos, [token], {
|
@@ -323,18 +341,20 @@ class Ve
|
|
323
341
|
|
324
342
|
words << word
|
325
343
|
end
|
344
|
+
|
345
|
+
previous = token
|
326
346
|
end
|
327
347
|
rescue StopIteration
|
328
348
|
end
|
329
349
|
|
330
350
|
return words
|
331
351
|
end
|
332
|
-
|
352
|
+
|
333
353
|
def sentences
|
334
354
|
# TODO: Sentence objects that keep track of the sentence's tokens
|
335
355
|
sentences = []
|
336
356
|
current = ''
|
337
|
-
|
357
|
+
|
338
358
|
@tokens.each do |token|
|
339
359
|
if token[:type] == :sentence_split
|
340
360
|
sentences << current
|
@@ -347,13 +367,13 @@ class Ve
|
|
347
367
|
current << token[:literal]
|
348
368
|
end
|
349
369
|
end
|
350
|
-
|
370
|
+
|
351
371
|
# In case there is no :sentence_split at the end
|
352
372
|
sentences << current if current.length > 0
|
353
|
-
|
373
|
+
|
354
374
|
sentences
|
355
375
|
end
|
356
|
-
|
376
|
+
|
357
377
|
end
|
358
378
|
end
|
359
379
|
end
|
data/lib/ve.rb
CHANGED
@@ -10,23 +10,34 @@ require 'pp'
|
|
10
10
|
class Ve
|
11
11
|
|
12
12
|
class Manager
|
13
|
+
@@config_for = {}
|
14
|
+
|
15
|
+
def self.set_default_config_for(klass, config = {})
|
16
|
+
@@config_for[klass] = config
|
17
|
+
end
|
18
|
+
|
13
19
|
def self.provider_for(language, function)
|
14
|
-
@@provider_for[language.to_sym][function.to_sym]
|
20
|
+
provider = @@provider_for[language.to_sym][function.to_sym]
|
21
|
+
if provider.is_a?(Class)
|
22
|
+
config = @@config_for[provider] || {}
|
23
|
+
provider = @@provider_for[language.to_sym][function.to_sym].new(config)
|
24
|
+
@@provider_for[language.to_sym][function.to_sym] = provider
|
25
|
+
end
|
26
|
+
provider
|
15
27
|
end
|
16
28
|
|
17
29
|
# TODO: Make a difference between what features are available locally
|
18
30
|
# and what requires contacting external Ves
|
19
31
|
def self.register(klass, language)
|
20
32
|
@@provider_for ||= {}
|
21
|
-
provider = klass.new
|
22
33
|
# This won't work if people start monkey patching the providers with public methods that arent abilities
|
23
34
|
# It's also not pretty, but kinda nifty
|
24
|
-
provider_name =
|
35
|
+
provider_name = klass.to_s.split('::').last
|
25
36
|
parse_class = Kernel.class_eval("Ve::Parse::#{provider_name}")
|
26
37
|
abilities = parse_class.public_instance_methods - Object.public_instance_methods
|
27
38
|
abilities.each do |a|
|
28
39
|
@@provider_for[language.to_sym] ||= {}
|
29
|
-
@@provider_for[language.to_sym][a] =
|
40
|
+
@@provider_for[language.to_sym][a] = klass
|
30
41
|
end
|
31
42
|
end
|
32
43
|
end
|
@@ -2,31 +2,7 @@
|
|
2
2
|
|
3
3
|
require_relative 'test_helper'
|
4
4
|
|
5
|
-
class
|
6
|
-
|
7
|
-
def test_should_be_able_to_start
|
8
|
-
freeling = Ve::Provider::FreelingEn.new
|
9
|
-
assert freeling.works?
|
10
|
-
end
|
11
|
-
|
12
|
-
def test_doesnt_die_on_japanese
|
13
|
-
freeling = Ve::Provider::FreelingEn.new
|
14
|
-
parse = freeling.parse('これは日本語です')
|
15
|
-
assert_equal Ve::Parse::FreelingEn, parse.class
|
16
|
-
end
|
17
|
-
|
18
|
-
# TODO: UTF-8 handling
|
19
|
-
def test_can_handle_utf8
|
20
|
-
freeling = Ve::Provider::FreelingEn.new
|
21
|
-
parse = freeling.parse('I’m')
|
22
|
-
assert_equal ['I\'m'], parse.tokens.collect { |t| t[:literal] }
|
23
|
-
end
|
24
|
-
|
25
|
-
def test_can_parse
|
26
|
-
freeling = Ve::Provider::FreelingEn.new
|
27
|
-
parse = freeling.parse('')
|
28
|
-
assert_equal Ve::Parse::FreelingEn, parse.class
|
29
|
-
end
|
5
|
+
class FreelingEnParseTest < MiniTest::Unit::TestCase
|
30
6
|
|
31
7
|
def test_all_literals_should_equal_the_input_text
|
32
8
|
text = <<-EOS
|
@@ -35,27 +11,30 @@ class FreelingEnTest < Test::Unit::TestCase
|
|
35
11
|
Z
|
36
12
|
|
37
13
|
EOS
|
38
|
-
|
39
|
-
parse =
|
14
|
+
raw = ["There there EX 0.857656", "once once RB 0.809237", "was be VBD 1", "a a DT 0.333333", "man man NN 0.980535", "from from IN 1", "X x NNP 1", "", "Who who WP 1", "took take VBD 1", "it it PRP 1", "upon upon IN 0.915152", "himself himself PRP 1", "to to TO 0.999909", "Y y NNP 1", "", "Z z NNP 1", ""]
|
15
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
40
16
|
assert_equal text, parse.tokens.collect { |t| t[:literal] }.join
|
41
17
|
end
|
42
18
|
|
43
19
|
def test_creates_tokens_from_data_that_is_ignored_in_parsing
|
44
|
-
|
45
|
-
|
20
|
+
text = 'A B '
|
21
|
+
raw = ['A a DT 0.333333', 'B b NNP 1', '']
|
22
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
46
23
|
assert_equal [:parsed, :unparsed, :parsed, :unparsed, :sentence_split], parse.tokens.collect { |t| t[:type] }
|
47
24
|
assert_equal ['A', ' ', 'B', ' ', ''], parse.tokens.collect { |t| t[:literal] }
|
48
25
|
end
|
49
26
|
|
50
27
|
def test_can_give_sentences
|
51
|
-
|
52
|
-
|
28
|
+
text = 'This is a sentence. And this was another one'
|
29
|
+
raw = ['This this PRP 0.0001755', 'is be VBZ 1', 'a a DT 0.333333', 'sentence sentence NN 0.966667', '. . Fp 1', '', 'And and CC 1', 'this this PRP 0.0001755', 'was be VBD 1', 'another another DT 0.999067', 'one one NN 0.25', '']
|
30
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
53
31
|
assert_equal ['This is a sentence.', 'And this was another one'], parse.sentences
|
54
32
|
end
|
55
33
|
|
56
34
|
def test_can_give_words
|
57
|
-
|
58
|
-
|
35
|
+
text = 'This was a sentence.'
|
36
|
+
raw = ['This this PRP 0.0001755', 'was be VBD 1', 'a a DT 0.333333', 'sentence sentence NN 0.966667', '. . Fp 1', '']
|
37
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
59
38
|
words = parse.words
|
60
39
|
tokens = parse.tokens
|
61
40
|
|
@@ -67,48 +46,55 @@ class FreelingEnTest < Test::Unit::TestCase
|
|
67
46
|
assert_equal [[tokens[0]], [tokens[2]], [tokens[4]], [tokens[6]], [tokens[7]]], words.collect(&:tokens)
|
68
47
|
end
|
69
48
|
|
49
|
+
def test_words_can_handle_contractions
|
50
|
+
# TODO
|
51
|
+
skip
|
52
|
+
text = "I'm eating."
|
53
|
+
raw = ['I i PRP 1', "'m 'm VBP 0.997563", 'eating eat VBG 1', '. . Fp 1', '']
|
54
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
55
|
+
assert_equal ["I'm", "eating", "."], parse.tokens.collect { |t| t[:literal] }
|
56
|
+
end
|
57
|
+
|
70
58
|
def test_possessive_endings_must_be_reattached
|
71
|
-
|
72
|
-
|
59
|
+
text = "This is Jane's sentence."
|
60
|
+
raw = ["This this PRP 0.0001755", "is be VBZ 1", "Jane jane NNP 1", "'s 's POS 0.751711", "sentence sentence NN 0.966667", ". . Fp 1", ""]
|
61
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
73
62
|
words = parse.words
|
74
63
|
tokens = parse.tokens
|
75
64
|
|
76
65
|
assert_equal ['This', 'is', "Jane's", 'sentence', '.'], words.collect(&:word)
|
77
66
|
assert_equal ['this', 'be', "jane", 'sentence', '.'], words.collect(&:lemma)
|
78
67
|
assert_equal [Ve::PartOfSpeech::Pronoun, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Symbol], words.collect(&:part_of_speech)
|
79
|
-
assert_equal [{:grammar => :personal}, {:grammar => nil}, {:
|
68
|
+
assert_equal [{:grammar => :personal}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
|
80
69
|
assert_equal [[tokens[0]], [tokens[2]], tokens[4..5], [tokens[7]], [tokens[8]]], words.collect(&:tokens)
|
81
70
|
end
|
82
71
|
|
83
72
|
def test_date_parsing
|
84
73
|
# Should be turned off. At least for now
|
85
|
-
|
86
|
-
|
87
|
-
assert_parses_into_words(freeling,
|
74
|
+
assert_parses_into_words(Ve::Parse::FreelingEn,
|
88
75
|
{:words => ['January'],
|
89
76
|
:lemmas => ['january'],
|
90
77
|
:pos => [Ve::PartOfSpeech::Noun],
|
91
78
|
:extra => [{:grammar => nil}],
|
92
79
|
:tokens => [0..0]},
|
93
|
-
'January')
|
80
|
+
'January', ['January january NN 1'])
|
94
81
|
end
|
95
82
|
|
96
83
|
def test_symbol_parsing
|
97
|
-
|
98
|
-
|
99
|
-
assert_parses_into_words(freeling,
|
84
|
+
assert_parses_into_words(Ve::Parse::FreelingEn,
|
100
85
|
{:words => ['.', ',', '$'],
|
101
86
|
:lemmas => ['.', ',', '$'],
|
102
87
|
:pos => [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol],
|
103
88
|
:extra => [{:grammar => nil}, {:grammar => nil}, {:grammar => nil}],
|
104
89
|
:tokens => [0..0, 1..1, 2..2]},
|
105
|
-
'.,$')
|
90
|
+
'.,$', ['. . Fp 1', ', , Fc 1', '$ $ Fp', ''])
|
106
91
|
end
|
107
92
|
|
108
93
|
def test_can_handle_underscores_properly
|
109
94
|
# Should restore them
|
110
|
-
|
111
|
-
|
95
|
+
text = 'In New York'
|
96
|
+
raw = ['In in IN 0.986184', 'New_York new_york NNP 1', '']
|
97
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
112
98
|
words = parse.words
|
113
99
|
tokens = parse.tokens
|
114
100
|
|
@@ -120,8 +106,10 @@ class FreelingEnTest < Test::Unit::TestCase
|
|
120
106
|
|
121
107
|
# Should keep them
|
122
108
|
# TODO
|
123
|
-
|
124
|
-
|
109
|
+
skip
|
110
|
+
text = 'In New_York'
|
111
|
+
raw = ['In in IN 0.986184', 'New_York new_york NNP 1', '']
|
112
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
125
113
|
words = parse.words
|
126
114
|
tokens = parse.tokens
|
127
115
|
|
@@ -133,3 +121,4 @@ class FreelingEnTest < Test::Unit::TestCase
|
|
133
121
|
end
|
134
122
|
|
135
123
|
end
|
124
|
+
|