ve 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.travis.yml +9 -0
- data/Gemfile +5 -0
- data/Gemfile.lock +6 -0
- data/lib/providers/freeling_en.rb +4 -2
- data/lib/providers/mecab_ipadic.rb +52 -32
- data/lib/ve.rb +15 -4
- data/tests/{freeling_en_test.rb → freeling_en_parse_test.rb} +37 -48
- data/tests/freeling_en_provider_test.rb +38 -0
- data/tests/japanese_transliterators_test.rb +1 -1
- data/tests/mecab_ipadic_parse_test.rb +772 -0
- data/tests/mecab_ipadic_provider_test.rb +21 -0
- data/tests/test_helper.rb +5 -4
- data/tests/ve_test.rb +5 -1
- data/ve.gemspec +1 -1
- metadata +27 -35
- data/tests/mecab_ipadic_test.rb +0 -452
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 7667f10a89f699b284d7a412bab815d46e9bf26d
|
4
|
+
data.tar.gz: 7700e9a46ee0321b746ce23c806df91fb54b7252
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fa87fa761966cc70ec3edf7dad6b4ef36404a8adc14314f88aa7bd91b34784a98de198d699f6fe8650ace92fb45b9be7c5ac6e23d35b5f1fcf9a138d75d647b0
|
7
|
+
data.tar.gz: 78e90e9c7af44b26bebd04bd4f44e002b392438bfe7422950f473d01e99840e5e26f6ca0887a17f76f2d7211f49d44fee2ce7e86ed832824f0d01d5109a6cea4
|
data/.travis.yml
ADDED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -2,11 +2,15 @@ GEM
|
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
4
|
json (1.6.1)
|
5
|
+
metaclass (0.0.1)
|
6
|
+
mocha (0.11.4)
|
7
|
+
metaclass (~> 0.0.1)
|
5
8
|
rack (1.3.5)
|
6
9
|
rack-cors (0.2.4)
|
7
10
|
rack
|
8
11
|
rack-protection (1.1.4)
|
9
12
|
rack
|
13
|
+
rake (0.8.7)
|
10
14
|
sinatra (1.3.1)
|
11
15
|
rack (~> 1.3, >= 1.3.4)
|
12
16
|
rack-protection (~> 1.1, >= 1.1.2)
|
@@ -18,5 +22,7 @@ PLATFORMS
|
|
18
22
|
|
19
23
|
DEPENDENCIES
|
20
24
|
json
|
25
|
+
mocha
|
21
26
|
rack-cors
|
27
|
+
rake
|
22
28
|
sinatra
|
@@ -8,7 +8,7 @@ require 'open3'
|
|
8
8
|
class Ve
|
9
9
|
class Provider
|
10
10
|
class FreelingEn < Ve::Provider
|
11
|
-
|
11
|
+
# FIX: This class isn't tested
|
12
12
|
BIT_STOP = 'VeEnd'
|
13
13
|
|
14
14
|
# TODO: Automatically set FREELINGSHARE if it's not set?
|
@@ -27,7 +27,8 @@ class Ve
|
|
27
27
|
# Interface methods
|
28
28
|
|
29
29
|
def works?
|
30
|
-
|
30
|
+
p = parse('Wrote')
|
31
|
+
["Wrote write VBD 1", ""] == p.tokens.collect { |t| t[:raw] }
|
31
32
|
end
|
32
33
|
|
33
34
|
# Talks to the app and returns a parse object
|
@@ -41,6 +42,7 @@ class Ve
|
|
41
42
|
output = []
|
42
43
|
|
43
44
|
while line = @stdout.readline
|
45
|
+
puts line
|
44
46
|
if line =~ /#{BIT_STOP}/x
|
45
47
|
@stdout.readline
|
46
48
|
break
|
@@ -7,31 +7,31 @@ class Ve
|
|
7
7
|
class MecabIpadic < Ve::Provider
|
8
8
|
|
9
9
|
BIT_STOP = 'VeEnd'
|
10
|
-
|
10
|
+
|
11
11
|
def initialize(config = {})
|
12
12
|
# TODO: Make config handling better
|
13
13
|
@config = {:app => 'mecab',
|
14
14
|
:path => '',
|
15
15
|
:flags => ''}.merge(config)
|
16
|
-
|
17
|
-
@config[:app] = `which #{@config[:app]}
|
18
|
-
|
16
|
+
|
17
|
+
@config[:app] = `which #{@config[:app]}`.chomp
|
18
|
+
|
19
19
|
start!
|
20
20
|
end
|
21
|
-
|
21
|
+
|
22
22
|
def works?
|
23
23
|
(["だっ\t助動詞,*,*,*,特殊・ダ,連用タ接続,だ,ダッ,ダッ",
|
24
24
|
"た\t助動詞,*,*,*,特殊・タ,基本形,た,タ,タ",
|
25
25
|
"EOS"] == parse('だった').tokens.collect { |t| t[:raw] } )
|
26
26
|
end
|
27
|
-
|
27
|
+
|
28
28
|
# Talks to the app and returns a parse object
|
29
29
|
def parse(text, options = {})
|
30
30
|
start! if @stdin.nil? # Restart if the provider crashed
|
31
|
-
|
31
|
+
|
32
32
|
@stdin.puts "#{text} #{BIT_STOP}"
|
33
33
|
output = []
|
34
|
-
|
34
|
+
|
35
35
|
while line = @stdout.readline.force_encoding('UTF-8')
|
36
36
|
if line =~ /#{BIT_STOP}/x
|
37
37
|
output << @stdout.readline # Catch the EOS
|
@@ -39,25 +39,25 @@ class Ve
|
|
39
39
|
end
|
40
40
|
output << line
|
41
41
|
end
|
42
|
-
|
42
|
+
|
43
43
|
Ve::Parse::MecabIpadic.new(text, output)
|
44
|
-
rescue
|
44
|
+
rescue => e
|
45
45
|
# TODO: No good to catch all errors like this
|
46
46
|
# I need a backtrace when something unexpected fails
|
47
47
|
Ve::Parse::MecabIpadic.new(text, [])
|
48
48
|
end
|
49
49
|
|
50
50
|
private
|
51
|
-
|
51
|
+
|
52
52
|
# TODO: Use Process.spawn/kill for process control?
|
53
53
|
def start!
|
54
|
-
@stdin, @stdout, @stderr = Open3.popen3(@config[:app])
|
54
|
+
@stdin, @stdout, @stderr = Open3.popen3("#{@config[:app]} #{@config[:flags]}")
|
55
55
|
@stdin.set_encoding('UTF-8')
|
56
56
|
@stdout.set_encoding('UTF-8')
|
57
|
-
rescue Errno::ENOENT
|
57
|
+
rescue Errno::ENOENT => e
|
58
58
|
# The parser couldn't be started. Probably not installed on this system
|
59
59
|
end
|
60
|
-
|
60
|
+
|
61
61
|
end
|
62
62
|
end
|
63
63
|
end
|
@@ -65,15 +65,15 @@ end
|
|
65
65
|
class Ve
|
66
66
|
class Parse
|
67
67
|
class MecabIpadic < Ve::Parse
|
68
|
-
|
68
|
+
|
69
69
|
PARSER = %r{^ (.+?) \t (.+) }x
|
70
70
|
attr_reader :tokens, :text
|
71
|
-
|
71
|
+
|
72
72
|
def initialize(text, output)
|
73
73
|
@tokens = []
|
74
74
|
@text = text
|
75
75
|
position = 0
|
76
|
-
|
76
|
+
|
77
77
|
output.each_with_index do |line, index|
|
78
78
|
line.rstrip!
|
79
79
|
token = {:raw => line}
|
@@ -87,7 +87,7 @@ class Ve
|
|
87
87
|
@tokens << unparsed_token
|
88
88
|
end
|
89
89
|
end
|
90
|
-
|
90
|
+
|
91
91
|
if line =~ %r{^ EOS $}x
|
92
92
|
token[:type] = :sentence_split
|
93
93
|
token[:literal] = ''
|
@@ -99,7 +99,7 @@ class Ve
|
|
99
99
|
[:pos, :pos2, :pos3, :pos4, :inflection_type, :inflection_form, :lemma, :reading, :hatsuon].each_with_index do |attr, i|
|
100
100
|
token[attr] = info[i]
|
101
101
|
end
|
102
|
-
|
102
|
+
|
103
103
|
# Anything unparsed preceding this token
|
104
104
|
unparsed_md = %r{(.*?) #{Regexp.quote(token[:literal])}}mx.match(text, position)
|
105
105
|
if unparsed_md[1].length > 0
|
@@ -108,7 +108,7 @@ class Ve
|
|
108
108
|
@tokens << unparsed_token
|
109
109
|
position += unparsed_token[:literal].length
|
110
110
|
end
|
111
|
-
|
111
|
+
|
112
112
|
token[:characters] = (position..(position+token[:literal].length-1))
|
113
113
|
position += token[:literal].length
|
114
114
|
else
|
@@ -118,7 +118,7 @@ class Ve
|
|
118
118
|
@tokens << token
|
119
119
|
end
|
120
120
|
end
|
121
|
-
|
121
|
+
|
122
122
|
# PoS
|
123
123
|
MEISHI = '名詞'
|
124
124
|
KOYUUMEISHI = '固有名詞'
|
@@ -159,6 +159,11 @@ class Ve
|
|
159
159
|
TOKUSHU_DESU = '特殊・デス'
|
160
160
|
TOKUSHU_DA = '特殊・ダ'
|
161
161
|
TOKUSHU_MASU = '特殊・マス'
|
162
|
+
TOKUSHU_NU = '特殊・ヌ'
|
163
|
+
FUHENKAGATA = '不変化型'
|
164
|
+
JINMEI = '人名'
|
165
|
+
MEIREI_I = '命令i'
|
166
|
+
KAKARIJOSHI = '係助詞'
|
162
167
|
|
163
168
|
# Etc
|
164
169
|
NA = 'な'
|
@@ -166,11 +171,14 @@ class Ve
|
|
166
171
|
TE = 'て'
|
167
172
|
DE = 'で'
|
168
173
|
BA = 'ば'
|
174
|
+
NN = 'ん'
|
175
|
+
SA = 'さ'
|
169
176
|
|
170
177
|
def words
|
171
178
|
words = []
|
172
179
|
tokens = @tokens.find_all { |t| t[:type] == :parsed }
|
173
180
|
tokens = tokens.to_enum
|
181
|
+
previous = nil
|
174
182
|
|
175
183
|
# This is becoming very big
|
176
184
|
begin
|
@@ -181,6 +189,7 @@ class Ve
|
|
181
189
|
eat_lemma = true
|
182
190
|
attach_to_previous = false
|
183
191
|
also_attach_to_lemma = false
|
192
|
+
update_pos = false
|
184
193
|
|
185
194
|
case token[:pos]
|
186
195
|
when MEISHI
|
@@ -208,7 +217,7 @@ class Ve
|
|
208
217
|
eat_next = true
|
209
218
|
elsif following[:pos] == JOSHI && following[:literal] == NI
|
210
219
|
pos = Ve::PartOfSpeech::Adverb
|
211
|
-
eat_next =
|
220
|
+
eat_next = false
|
212
221
|
end
|
213
222
|
end
|
214
223
|
when HIJIRITSU, TOKUSHU
|
@@ -246,8 +255,13 @@ class Ve
|
|
246
255
|
also_attach_to_lemma = true
|
247
256
|
end
|
248
257
|
when SETSUBI
|
249
|
-
|
250
|
-
|
258
|
+
if token[:pos3] == TOKUSHU && token[:lemma] == SA
|
259
|
+
attach_to_previous = true
|
260
|
+
update_pos = true
|
261
|
+
pos = Ve::PartOfSpeech::Noun
|
262
|
+
else
|
263
|
+
pos = Ve::PartOfSpeech::Suffix
|
264
|
+
end
|
251
265
|
when SETSUZOKUSHITEKI
|
252
266
|
pos = Ve::PartOfSpeech::Conjunction
|
253
267
|
when DOUSHIHIJIRITSUTEKI
|
@@ -260,7 +274,10 @@ class Ve
|
|
260
274
|
when JODOUSHI
|
261
275
|
pos = Ve::PartOfSpeech::Postposition
|
262
276
|
|
263
|
-
if
|
277
|
+
if (previous.nil? || (!previous.nil? && previous[:pos2] != KAKARIJOSHI)) &&
|
278
|
+
[TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU, TOKUSHU_NU].include?(token[:inflection_type])
|
279
|
+
attach_to_previous = true
|
280
|
+
elsif token[:inflection_type] == FUHENKAGATA && token[:lemma] == NN
|
264
281
|
attach_to_previous = true
|
265
282
|
elsif (token[:inflection_type] == TOKUSHU_DA || token[:inflection_type] == TOKUSHU_DESU) && token[:literal] != NA
|
266
283
|
pos = Ve::PartOfSpeech::Verb
|
@@ -269,8 +286,8 @@ class Ve
|
|
269
286
|
pos = Ve::PartOfSpeech::Verb
|
270
287
|
if token[:pos2] == SETSUBI
|
271
288
|
attach_to_previous = true
|
272
|
-
elsif token[:pos2] == HIJIRITSU
|
273
|
-
|
289
|
+
elsif token[:pos2] == HIJIRITSU && token[:inflection_form] != MEIREI_I
|
290
|
+
attach_to_previous = true
|
274
291
|
end
|
275
292
|
when KEIYOUSHI
|
276
293
|
pos = Ve::PartOfSpeech::Adjective
|
@@ -301,6 +318,7 @@ class Ve
|
|
301
318
|
words[-1].extra[:reading] << (token[:reading] || '')
|
302
319
|
words[-1].extra[:transcription] << (token[:hatsuon] || '')
|
303
320
|
words[-1].lemma << token[:lemma] if also_attach_to_lemma
|
321
|
+
words[-1].part_of_speech = pos if update_pos
|
304
322
|
else
|
305
323
|
pos = Ve::PartOfSpeech::TBD if pos.nil?
|
306
324
|
word = Ve::Word.new(token[:literal], token[:lemma], pos, [token], {
|
@@ -323,18 +341,20 @@ class Ve
|
|
323
341
|
|
324
342
|
words << word
|
325
343
|
end
|
344
|
+
|
345
|
+
previous = token
|
326
346
|
end
|
327
347
|
rescue StopIteration
|
328
348
|
end
|
329
349
|
|
330
350
|
return words
|
331
351
|
end
|
332
|
-
|
352
|
+
|
333
353
|
def sentences
|
334
354
|
# TODO: Sentence objects that keep track of the sentence's tokens
|
335
355
|
sentences = []
|
336
356
|
current = ''
|
337
|
-
|
357
|
+
|
338
358
|
@tokens.each do |token|
|
339
359
|
if token[:type] == :sentence_split
|
340
360
|
sentences << current
|
@@ -347,13 +367,13 @@ class Ve
|
|
347
367
|
current << token[:literal]
|
348
368
|
end
|
349
369
|
end
|
350
|
-
|
370
|
+
|
351
371
|
# In case there is no :sentence_split at the end
|
352
372
|
sentences << current if current.length > 0
|
353
|
-
|
373
|
+
|
354
374
|
sentences
|
355
375
|
end
|
356
|
-
|
376
|
+
|
357
377
|
end
|
358
378
|
end
|
359
379
|
end
|
data/lib/ve.rb
CHANGED
@@ -10,23 +10,34 @@ require 'pp'
|
|
10
10
|
class Ve
|
11
11
|
|
12
12
|
class Manager
|
13
|
+
@@config_for = {}
|
14
|
+
|
15
|
+
def self.set_default_config_for(klass, config = {})
|
16
|
+
@@config_for[klass] = config
|
17
|
+
end
|
18
|
+
|
13
19
|
def self.provider_for(language, function)
|
14
|
-
@@provider_for[language.to_sym][function.to_sym]
|
20
|
+
provider = @@provider_for[language.to_sym][function.to_sym]
|
21
|
+
if provider.is_a?(Class)
|
22
|
+
config = @@config_for[provider] || {}
|
23
|
+
provider = @@provider_for[language.to_sym][function.to_sym].new(config)
|
24
|
+
@@provider_for[language.to_sym][function.to_sym] = provider
|
25
|
+
end
|
26
|
+
provider
|
15
27
|
end
|
16
28
|
|
17
29
|
# TODO: Make a difference between what features are available locally
|
18
30
|
# and what requires contacting external Ves
|
19
31
|
def self.register(klass, language)
|
20
32
|
@@provider_for ||= {}
|
21
|
-
provider = klass.new
|
22
33
|
# This won't work if people start monkey patching the providers with public methods that arent abilities
|
23
34
|
# It's also not pretty, but kinda nifty
|
24
|
-
provider_name =
|
35
|
+
provider_name = klass.to_s.split('::').last
|
25
36
|
parse_class = Kernel.class_eval("Ve::Parse::#{provider_name}")
|
26
37
|
abilities = parse_class.public_instance_methods - Object.public_instance_methods
|
27
38
|
abilities.each do |a|
|
28
39
|
@@provider_for[language.to_sym] ||= {}
|
29
|
-
@@provider_for[language.to_sym][a] =
|
40
|
+
@@provider_for[language.to_sym][a] = klass
|
30
41
|
end
|
31
42
|
end
|
32
43
|
end
|
@@ -2,31 +2,7 @@
|
|
2
2
|
|
3
3
|
require_relative 'test_helper'
|
4
4
|
|
5
|
-
class
|
6
|
-
|
7
|
-
def test_should_be_able_to_start
|
8
|
-
freeling = Ve::Provider::FreelingEn.new
|
9
|
-
assert freeling.works?
|
10
|
-
end
|
11
|
-
|
12
|
-
def test_doesnt_die_on_japanese
|
13
|
-
freeling = Ve::Provider::FreelingEn.new
|
14
|
-
parse = freeling.parse('これは日本語です')
|
15
|
-
assert_equal Ve::Parse::FreelingEn, parse.class
|
16
|
-
end
|
17
|
-
|
18
|
-
# TODO: UTF-8 handling
|
19
|
-
def test_can_handle_utf8
|
20
|
-
freeling = Ve::Provider::FreelingEn.new
|
21
|
-
parse = freeling.parse('I’m')
|
22
|
-
assert_equal ['I\'m'], parse.tokens.collect { |t| t[:literal] }
|
23
|
-
end
|
24
|
-
|
25
|
-
def test_can_parse
|
26
|
-
freeling = Ve::Provider::FreelingEn.new
|
27
|
-
parse = freeling.parse('')
|
28
|
-
assert_equal Ve::Parse::FreelingEn, parse.class
|
29
|
-
end
|
5
|
+
class FreelingEnParseTest < MiniTest::Unit::TestCase
|
30
6
|
|
31
7
|
def test_all_literals_should_equal_the_input_text
|
32
8
|
text = <<-EOS
|
@@ -35,27 +11,30 @@ class FreelingEnTest < Test::Unit::TestCase
|
|
35
11
|
Z
|
36
12
|
|
37
13
|
EOS
|
38
|
-
|
39
|
-
parse =
|
14
|
+
raw = ["There there EX 0.857656", "once once RB 0.809237", "was be VBD 1", "a a DT 0.333333", "man man NN 0.980535", "from from IN 1", "X x NNP 1", "", "Who who WP 1", "took take VBD 1", "it it PRP 1", "upon upon IN 0.915152", "himself himself PRP 1", "to to TO 0.999909", "Y y NNP 1", "", "Z z NNP 1", ""]
|
15
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
40
16
|
assert_equal text, parse.tokens.collect { |t| t[:literal] }.join
|
41
17
|
end
|
42
18
|
|
43
19
|
def test_creates_tokens_from_data_that_is_ignored_in_parsing
|
44
|
-
|
45
|
-
|
20
|
+
text = 'A B '
|
21
|
+
raw = ['A a DT 0.333333', 'B b NNP 1', '']
|
22
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
46
23
|
assert_equal [:parsed, :unparsed, :parsed, :unparsed, :sentence_split], parse.tokens.collect { |t| t[:type] }
|
47
24
|
assert_equal ['A', ' ', 'B', ' ', ''], parse.tokens.collect { |t| t[:literal] }
|
48
25
|
end
|
49
26
|
|
50
27
|
def test_can_give_sentences
|
51
|
-
|
52
|
-
|
28
|
+
text = 'This is a sentence. And this was another one'
|
29
|
+
raw = ['This this PRP 0.0001755', 'is be VBZ 1', 'a a DT 0.333333', 'sentence sentence NN 0.966667', '. . Fp 1', '', 'And and CC 1', 'this this PRP 0.0001755', 'was be VBD 1', 'another another DT 0.999067', 'one one NN 0.25', '']
|
30
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
53
31
|
assert_equal ['This is a sentence.', 'And this was another one'], parse.sentences
|
54
32
|
end
|
55
33
|
|
56
34
|
def test_can_give_words
|
57
|
-
|
58
|
-
|
35
|
+
text = 'This was a sentence.'
|
36
|
+
raw = ['This this PRP 0.0001755', 'was be VBD 1', 'a a DT 0.333333', 'sentence sentence NN 0.966667', '. . Fp 1', '']
|
37
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
59
38
|
words = parse.words
|
60
39
|
tokens = parse.tokens
|
61
40
|
|
@@ -67,48 +46,55 @@ class FreelingEnTest < Test::Unit::TestCase
|
|
67
46
|
assert_equal [[tokens[0]], [tokens[2]], [tokens[4]], [tokens[6]], [tokens[7]]], words.collect(&:tokens)
|
68
47
|
end
|
69
48
|
|
49
|
+
def test_words_can_handle_contractions
|
50
|
+
# TODO
|
51
|
+
skip
|
52
|
+
text = "I'm eating."
|
53
|
+
raw = ['I i PRP 1', "'m 'm VBP 0.997563", 'eating eat VBG 1', '. . Fp 1', '']
|
54
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
55
|
+
assert_equal ["I'm", "eating", "."], parse.tokens.collect { |t| t[:literal] }
|
56
|
+
end
|
57
|
+
|
70
58
|
def test_possessive_endings_must_be_reattached
|
71
|
-
|
72
|
-
|
59
|
+
text = "This is Jane's sentence."
|
60
|
+
raw = ["This this PRP 0.0001755", "is be VBZ 1", "Jane jane NNP 1", "'s 's POS 0.751711", "sentence sentence NN 0.966667", ". . Fp 1", ""]
|
61
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
73
62
|
words = parse.words
|
74
63
|
tokens = parse.tokens
|
75
64
|
|
76
65
|
assert_equal ['This', 'is', "Jane's", 'sentence', '.'], words.collect(&:word)
|
77
66
|
assert_equal ['this', 'be', "jane", 'sentence', '.'], words.collect(&:lemma)
|
78
67
|
assert_equal [Ve::PartOfSpeech::Pronoun, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Symbol], words.collect(&:part_of_speech)
|
79
|
-
assert_equal [{:grammar => :personal}, {:grammar => nil}, {:
|
68
|
+
assert_equal [{:grammar => :personal}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
|
80
69
|
assert_equal [[tokens[0]], [tokens[2]], tokens[4..5], [tokens[7]], [tokens[8]]], words.collect(&:tokens)
|
81
70
|
end
|
82
71
|
|
83
72
|
def test_date_parsing
|
84
73
|
# Should be turned off. At least for now
|
85
|
-
|
86
|
-
|
87
|
-
assert_parses_into_words(freeling,
|
74
|
+
assert_parses_into_words(Ve::Parse::FreelingEn,
|
88
75
|
{:words => ['January'],
|
89
76
|
:lemmas => ['january'],
|
90
77
|
:pos => [Ve::PartOfSpeech::Noun],
|
91
78
|
:extra => [{:grammar => nil}],
|
92
79
|
:tokens => [0..0]},
|
93
|
-
'January')
|
80
|
+
'January', ['January january NN 1'])
|
94
81
|
end
|
95
82
|
|
96
83
|
def test_symbol_parsing
|
97
|
-
|
98
|
-
|
99
|
-
assert_parses_into_words(freeling,
|
84
|
+
assert_parses_into_words(Ve::Parse::FreelingEn,
|
100
85
|
{:words => ['.', ',', '$'],
|
101
86
|
:lemmas => ['.', ',', '$'],
|
102
87
|
:pos => [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol],
|
103
88
|
:extra => [{:grammar => nil}, {:grammar => nil}, {:grammar => nil}],
|
104
89
|
:tokens => [0..0, 1..1, 2..2]},
|
105
|
-
'.,$')
|
90
|
+
'.,$', ['. . Fp 1', ', , Fc 1', '$ $ Fp', ''])
|
106
91
|
end
|
107
92
|
|
108
93
|
def test_can_handle_underscores_properly
|
109
94
|
# Should restore them
|
110
|
-
|
111
|
-
|
95
|
+
text = 'In New York'
|
96
|
+
raw = ['In in IN 0.986184', 'New_York new_york NNP 1', '']
|
97
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
112
98
|
words = parse.words
|
113
99
|
tokens = parse.tokens
|
114
100
|
|
@@ -120,8 +106,10 @@ class FreelingEnTest < Test::Unit::TestCase
|
|
120
106
|
|
121
107
|
# Should keep them
|
122
108
|
# TODO
|
123
|
-
|
124
|
-
|
109
|
+
skip
|
110
|
+
text = 'In New_York'
|
111
|
+
raw = ['In in IN 0.986184', 'New_York new_york NNP 1', '']
|
112
|
+
parse = Ve::Parse::FreelingEn.new(text, raw)
|
125
113
|
words = parse.words
|
126
114
|
tokens = parse.tokens
|
127
115
|
|
@@ -133,3 +121,4 @@ class FreelingEnTest < Test::Unit::TestCase
|
|
133
121
|
end
|
134
122
|
|
135
123
|
end
|
124
|
+
|