ve 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7667f10a89f699b284d7a412bab815d46e9bf26d
4
+ data.tar.gz: 7700e9a46ee0321b746ce23c806df91fb54b7252
5
+ SHA512:
6
+ metadata.gz: fa87fa761966cc70ec3edf7dad6b4ef36404a8adc14314f88aa7bd91b34784a98de198d699f6fe8650ace92fb45b9be7c5ac6e23d35b5f1fcf9a138d75d647b0
7
+ data.tar.gz: 78e90e9c7af44b26bebd04bd4f44e002b392438bfe7422950f473d01e99840e5e26f6ca0887a17f76f2d7211f49d44fee2ce7e86ed832824f0d01d5109a6cea4
data/.gitignore CHANGED
@@ -1,4 +1,5 @@
1
1
  .DS_Store
2
2
  .*.swp
3
3
  *.gem
4
+ .rvmrc
4
5
 
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.2
4
+ - 1.9.3
5
+ - jruby-19mode
6
+ - rbx-19mode
7
+ - ruby-head
8
+ - jruby-head
9
+ - ree
data/Gemfile CHANGED
@@ -6,3 +6,8 @@ group :server do
6
6
  gem "sinatra"
7
7
  gem "rack-cors"
8
8
  end
9
+
10
+ group :test do
11
+ gem "rake"
12
+ gem "mocha", :require => false
13
+ end
data/Gemfile.lock CHANGED
@@ -2,11 +2,15 @@ GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
4
  json (1.6.1)
5
+ metaclass (0.0.1)
6
+ mocha (0.11.4)
7
+ metaclass (~> 0.0.1)
5
8
  rack (1.3.5)
6
9
  rack-cors (0.2.4)
7
10
  rack
8
11
  rack-protection (1.1.4)
9
12
  rack
13
+ rake (0.8.7)
10
14
  sinatra (1.3.1)
11
15
  rack (~> 1.3, >= 1.3.4)
12
16
  rack-protection (~> 1.1, >= 1.1.2)
@@ -18,5 +22,7 @@ PLATFORMS
18
22
 
19
23
  DEPENDENCIES
20
24
  json
25
+ mocha
21
26
  rack-cors
27
+ rake
22
28
  sinatra
@@ -8,7 +8,7 @@ require 'open3'
8
8
  class Ve
9
9
  class Provider
10
10
  class FreelingEn < Ve::Provider
11
-
11
+ # FIX: This class isn't tested
12
12
  BIT_STOP = 'VeEnd'
13
13
 
14
14
  # TODO: Automatically set FREELINGSHARE if it's not set?
@@ -27,7 +27,8 @@ class Ve
27
27
  # Interface methods
28
28
 
29
29
  def works?
30
- (["Wrote write VBD 1", ""] == parse('Wrote').tokens.collect { |t| t[:raw] })
30
+ p = parse('Wrote')
31
+ ["Wrote write VBD 1", ""] == p.tokens.collect { |t| t[:raw] }
31
32
  end
32
33
 
33
34
  # Talks to the app and returns a parse object
@@ -41,6 +42,7 @@ class Ve
41
42
  output = []
42
43
 
43
44
  while line = @stdout.readline
45
+ puts line
44
46
  if line =~ /#{BIT_STOP}/x
45
47
  @stdout.readline
46
48
  break
@@ -7,31 +7,31 @@ class Ve
7
7
  class MecabIpadic < Ve::Provider
8
8
 
9
9
  BIT_STOP = 'VeEnd'
10
-
10
+
11
11
  def initialize(config = {})
12
12
  # TODO: Make config handling better
13
13
  @config = {:app => 'mecab',
14
14
  :path => '',
15
15
  :flags => ''}.merge(config)
16
-
17
- @config[:app] = `which #{@config[:app]}`
18
-
16
+
17
+ @config[:app] = `which #{@config[:app]}`.chomp
18
+
19
19
  start!
20
20
  end
21
-
21
+
22
22
  def works?
23
23
  (["だっ\t助動詞,*,*,*,特殊・ダ,連用タ接続,だ,ダッ,ダッ",
24
24
  "た\t助動詞,*,*,*,特殊・タ,基本形,た,タ,タ",
25
25
  "EOS"] == parse('だった').tokens.collect { |t| t[:raw] } )
26
26
  end
27
-
27
+
28
28
  # Talks to the app and returns a parse object
29
29
  def parse(text, options = {})
30
30
  start! if @stdin.nil? # Restart if the provider crashed
31
-
31
+
32
32
  @stdin.puts "#{text} #{BIT_STOP}"
33
33
  output = []
34
-
34
+
35
35
  while line = @stdout.readline.force_encoding('UTF-8')
36
36
  if line =~ /#{BIT_STOP}/x
37
37
  output << @stdout.readline # Catch the EOS
@@ -39,25 +39,25 @@ class Ve
39
39
  end
40
40
  output << line
41
41
  end
42
-
42
+
43
43
  Ve::Parse::MecabIpadic.new(text, output)
44
- rescue
44
+ rescue => e
45
45
  # TODO: No good to catch all errors like this
46
46
  # I need a backtrace when something unexpected fails
47
47
  Ve::Parse::MecabIpadic.new(text, [])
48
48
  end
49
49
 
50
50
  private
51
-
51
+
52
52
  # TODO: Use Process.spawn/kill for process control?
53
53
  def start!
54
- @stdin, @stdout, @stderr = Open3.popen3(@config[:app])
54
+ @stdin, @stdout, @stderr = Open3.popen3("#{@config[:app]} #{@config[:flags]}")
55
55
  @stdin.set_encoding('UTF-8')
56
56
  @stdout.set_encoding('UTF-8')
57
- rescue Errno::ENOENT
57
+ rescue Errno::ENOENT => e
58
58
  # The parser couldn't be started. Probably not installed on this system
59
59
  end
60
-
60
+
61
61
  end
62
62
  end
63
63
  end
@@ -65,15 +65,15 @@ end
65
65
  class Ve
66
66
  class Parse
67
67
  class MecabIpadic < Ve::Parse
68
-
68
+
69
69
  PARSER = %r{^ (.+?) \t (.+) }x
70
70
  attr_reader :tokens, :text
71
-
71
+
72
72
  def initialize(text, output)
73
73
  @tokens = []
74
74
  @text = text
75
75
  position = 0
76
-
76
+
77
77
  output.each_with_index do |line, index|
78
78
  line.rstrip!
79
79
  token = {:raw => line}
@@ -87,7 +87,7 @@ class Ve
87
87
  @tokens << unparsed_token
88
88
  end
89
89
  end
90
-
90
+
91
91
  if line =~ %r{^ EOS $}x
92
92
  token[:type] = :sentence_split
93
93
  token[:literal] = ''
@@ -99,7 +99,7 @@ class Ve
99
99
  [:pos, :pos2, :pos3, :pos4, :inflection_type, :inflection_form, :lemma, :reading, :hatsuon].each_with_index do |attr, i|
100
100
  token[attr] = info[i]
101
101
  end
102
-
102
+
103
103
  # Anything unparsed preceding this token
104
104
  unparsed_md = %r{(.*?) #{Regexp.quote(token[:literal])}}mx.match(text, position)
105
105
  if unparsed_md[1].length > 0
@@ -108,7 +108,7 @@ class Ve
108
108
  @tokens << unparsed_token
109
109
  position += unparsed_token[:literal].length
110
110
  end
111
-
111
+
112
112
  token[:characters] = (position..(position+token[:literal].length-1))
113
113
  position += token[:literal].length
114
114
  else
@@ -118,7 +118,7 @@ class Ve
118
118
  @tokens << token
119
119
  end
120
120
  end
121
-
121
+
122
122
  # PoS
123
123
  MEISHI = '名詞'
124
124
  KOYUUMEISHI = '固有名詞'
@@ -159,6 +159,11 @@ class Ve
159
159
  TOKUSHU_DESU = '特殊・デス'
160
160
  TOKUSHU_DA = '特殊・ダ'
161
161
  TOKUSHU_MASU = '特殊・マス'
162
+ TOKUSHU_NU = '特殊・ヌ'
163
+ FUHENKAGATA = '不変化型'
164
+ JINMEI = '人名'
165
+ MEIREI_I = '命令i'
166
+ KAKARIJOSHI = '係助詞'
162
167
 
163
168
  # Etc
164
169
  NA = 'な'
@@ -166,11 +171,14 @@ class Ve
166
171
  TE = 'て'
167
172
  DE = 'で'
168
173
  BA = 'ば'
174
+ NN = 'ん'
175
+ SA = 'さ'
169
176
 
170
177
  def words
171
178
  words = []
172
179
  tokens = @tokens.find_all { |t| t[:type] == :parsed }
173
180
  tokens = tokens.to_enum
181
+ previous = nil
174
182
 
175
183
  # This is becoming very big
176
184
  begin
@@ -181,6 +189,7 @@ class Ve
181
189
  eat_lemma = true
182
190
  attach_to_previous = false
183
191
  also_attach_to_lemma = false
192
+ update_pos = false
184
193
 
185
194
  case token[:pos]
186
195
  when MEISHI
@@ -208,7 +217,7 @@ class Ve
208
217
  eat_next = true
209
218
  elsif following[:pos] == JOSHI && following[:literal] == NI
210
219
  pos = Ve::PartOfSpeech::Adverb
211
- eat_next = true
220
+ eat_next = false
212
221
  end
213
222
  end
214
223
  when HIJIRITSU, TOKUSHU
@@ -246,8 +255,13 @@ class Ve
246
255
  also_attach_to_lemma = true
247
256
  end
248
257
  when SETSUBI
249
- # TODO: elaborate a bit?
250
- pos = Ve::PartOfSpeech::Suffix
258
+ if token[:pos3] == TOKUSHU && token[:lemma] == SA
259
+ attach_to_previous = true
260
+ update_pos = true
261
+ pos = Ve::PartOfSpeech::Noun
262
+ else
263
+ pos = Ve::PartOfSpeech::Suffix
264
+ end
251
265
  when SETSUZOKUSHITEKI
252
266
  pos = Ve::PartOfSpeech::Conjunction
253
267
  when DOUSHIHIJIRITSUTEKI
@@ -260,7 +274,10 @@ class Ve
260
274
  when JODOUSHI
261
275
  pos = Ve::PartOfSpeech::Postposition
262
276
 
263
- if [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU].include?(token[:inflection_type])
277
+ if (previous.nil? || (!previous.nil? && previous[:pos2] != KAKARIJOSHI)) &&
278
+ [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU, TOKUSHU_NU].include?(token[:inflection_type])
279
+ attach_to_previous = true
280
+ elsif token[:inflection_type] == FUHENKAGATA && token[:lemma] == NN
264
281
  attach_to_previous = true
265
282
  elsif (token[:inflection_type] == TOKUSHU_DA || token[:inflection_type] == TOKUSHU_DESU) && token[:literal] != NA
266
283
  pos = Ve::PartOfSpeech::Verb
@@ -269,8 +286,8 @@ class Ve
269
286
  pos = Ve::PartOfSpeech::Verb
270
287
  if token[:pos2] == SETSUBI
271
288
  attach_to_previous = true
272
- elsif token[:pos2] == HIJIRITSU
273
- grammar = :auxillary
289
+ elsif token[:pos2] == HIJIRITSU && token[:inflection_form] != MEIREI_I
290
+ attach_to_previous = true
274
291
  end
275
292
  when KEIYOUSHI
276
293
  pos = Ve::PartOfSpeech::Adjective
@@ -301,6 +318,7 @@ class Ve
301
318
  words[-1].extra[:reading] << (token[:reading] || '')
302
319
  words[-1].extra[:transcription] << (token[:hatsuon] || '')
303
320
  words[-1].lemma << token[:lemma] if also_attach_to_lemma
321
+ words[-1].part_of_speech = pos if update_pos
304
322
  else
305
323
  pos = Ve::PartOfSpeech::TBD if pos.nil?
306
324
  word = Ve::Word.new(token[:literal], token[:lemma], pos, [token], {
@@ -323,18 +341,20 @@ class Ve
323
341
 
324
342
  words << word
325
343
  end
344
+
345
+ previous = token
326
346
  end
327
347
  rescue StopIteration
328
348
  end
329
349
 
330
350
  return words
331
351
  end
332
-
352
+
333
353
  def sentences
334
354
  # TODO: Sentence objects that keep track of the sentence's tokens
335
355
  sentences = []
336
356
  current = ''
337
-
357
+
338
358
  @tokens.each do |token|
339
359
  if token[:type] == :sentence_split
340
360
  sentences << current
@@ -347,13 +367,13 @@ class Ve
347
367
  current << token[:literal]
348
368
  end
349
369
  end
350
-
370
+
351
371
  # In case there is no :sentence_split at the end
352
372
  sentences << current if current.length > 0
353
-
373
+
354
374
  sentences
355
375
  end
356
-
376
+
357
377
  end
358
378
  end
359
379
  end
data/lib/ve.rb CHANGED
@@ -10,23 +10,34 @@ require 'pp'
10
10
  class Ve
11
11
 
12
12
  class Manager
13
+ @@config_for = {}
14
+
15
+ def self.set_default_config_for(klass, config = {})
16
+ @@config_for[klass] = config
17
+ end
18
+
13
19
  def self.provider_for(language, function)
14
- @@provider_for[language.to_sym][function.to_sym]
20
+ provider = @@provider_for[language.to_sym][function.to_sym]
21
+ if provider.is_a?(Class)
22
+ config = @@config_for[provider] || {}
23
+ provider = @@provider_for[language.to_sym][function.to_sym].new(config)
24
+ @@provider_for[language.to_sym][function.to_sym] = provider
25
+ end
26
+ provider
15
27
  end
16
28
 
17
29
  # TODO: Make a difference between what features are available locally
18
30
  # and what requires contacting external Ves
19
31
  def self.register(klass, language)
20
32
  @@provider_for ||= {}
21
- provider = klass.new
22
33
  # This won't work if people start monkey patching the providers with public methods that arent abilities
23
34
  # It's also not pretty, but kinda nifty
24
- provider_name = provider.class.to_s.split('::').last
35
+ provider_name = klass.to_s.split('::').last
25
36
  parse_class = Kernel.class_eval("Ve::Parse::#{provider_name}")
26
37
  abilities = parse_class.public_instance_methods - Object.public_instance_methods
27
38
  abilities.each do |a|
28
39
  @@provider_for[language.to_sym] ||= {}
29
- @@provider_for[language.to_sym][a] = provider
40
+ @@provider_for[language.to_sym][a] = klass
30
41
  end
31
42
  end
32
43
  end
@@ -2,31 +2,7 @@
2
2
 
3
3
  require_relative 'test_helper'
4
4
 
5
- class FreelingEnTest < Test::Unit::TestCase
6
-
7
- def test_should_be_able_to_start
8
- freeling = Ve::Provider::FreelingEn.new
9
- assert freeling.works?
10
- end
11
-
12
- def test_doesnt_die_on_japanese
13
- freeling = Ve::Provider::FreelingEn.new
14
- parse = freeling.parse('これは日本語です')
15
- assert_equal Ve::Parse::FreelingEn, parse.class
16
- end
17
-
18
- # TODO: UTF-8 handling
19
- def test_can_handle_utf8
20
- freeling = Ve::Provider::FreelingEn.new
21
- parse = freeling.parse('I’m')
22
- assert_equal ['I\'m'], parse.tokens.collect { |t| t[:literal] }
23
- end
24
-
25
- def test_can_parse
26
- freeling = Ve::Provider::FreelingEn.new
27
- parse = freeling.parse('')
28
- assert_equal Ve::Parse::FreelingEn, parse.class
29
- end
5
+ class FreelingEnParseTest < MiniTest::Unit::TestCase
30
6
 
31
7
  def test_all_literals_should_equal_the_input_text
32
8
  text = <<-EOS
@@ -35,27 +11,30 @@ class FreelingEnTest < Test::Unit::TestCase
35
11
  Z
36
12
 
37
13
  EOS
38
- freeling = Ve::Provider::FreelingEn.new
39
- parse = freeling.parse(text)
14
+ raw = ["There there EX 0.857656", "once once RB 0.809237", "was be VBD 1", "a a DT 0.333333", "man man NN 0.980535", "from from IN 1", "X x NNP 1", "", "Who who WP 1", "took take VBD 1", "it it PRP 1", "upon upon IN 0.915152", "himself himself PRP 1", "to to TO 0.999909", "Y y NNP 1", "", "Z z NNP 1", ""]
15
+ parse = Ve::Parse::FreelingEn.new(text, raw)
40
16
  assert_equal text, parse.tokens.collect { |t| t[:literal] }.join
41
17
  end
42
18
 
43
19
  def test_creates_tokens_from_data_that_is_ignored_in_parsing
44
- freeling = Ve::Provider::FreelingEn.new
45
- parse = freeling.parse('A B ')
20
+ text = 'A B '
21
+ raw = ['A a DT 0.333333', 'B b NNP 1', '']
22
+ parse = Ve::Parse::FreelingEn.new(text, raw)
46
23
  assert_equal [:parsed, :unparsed, :parsed, :unparsed, :sentence_split], parse.tokens.collect { |t| t[:type] }
47
24
  assert_equal ['A', ' ', 'B', ' ', ''], parse.tokens.collect { |t| t[:literal] }
48
25
  end
49
26
 
50
27
  def test_can_give_sentences
51
- freeling = Ve::Provider::FreelingEn.new
52
- parse = freeling.parse('This is a sentence. And this was another one')
28
+ text = 'This is a sentence. And this was another one'
29
+ raw = ['This this PRP 0.0001755', 'is be VBZ 1', 'a a DT 0.333333', 'sentence sentence NN 0.966667', '. . Fp 1', '', 'And and CC 1', 'this this PRP 0.0001755', 'was be VBD 1', 'another another DT 0.999067', 'one one NN 0.25', '']
30
+ parse = Ve::Parse::FreelingEn.new(text, raw)
53
31
  assert_equal ['This is a sentence.', 'And this was another one'], parse.sentences
54
32
  end
55
33
 
56
34
  def test_can_give_words
57
- freeling = Ve::Provider::FreelingEn.new
58
- parse = freeling.parse('This was a sentence.')
35
+ text = 'This was a sentence.'
36
+ raw = ['This this PRP 0.0001755', 'was be VBD 1', 'a a DT 0.333333', 'sentence sentence NN 0.966667', '. . Fp 1', '']
37
+ parse = Ve::Parse::FreelingEn.new(text, raw)
59
38
  words = parse.words
60
39
  tokens = parse.tokens
61
40
 
@@ -67,48 +46,55 @@ class FreelingEnTest < Test::Unit::TestCase
67
46
  assert_equal [[tokens[0]], [tokens[2]], [tokens[4]], [tokens[6]], [tokens[7]]], words.collect(&:tokens)
68
47
  end
69
48
 
49
+ def test_words_can_handle_contractions
50
+ # TODO
51
+ skip
52
+ text = "I'm eating."
53
+ raw = ['I i PRP 1', "'m 'm VBP 0.997563", 'eating eat VBG 1', '. . Fp 1', '']
54
+ parse = Ve::Parse::FreelingEn.new(text, raw)
55
+ assert_equal ["I'm", "eating", "."], parse.tokens.collect { |t| t[:literal] }
56
+ end
57
+
70
58
  def test_possessive_endings_must_be_reattached
71
- freeling = Ve::Provider::FreelingEn.new
72
- parse = freeling.parse("This is Jane's sentence.")
59
+ text = "This is Jane's sentence."
60
+ raw = ["This this PRP 0.0001755", "is be VBZ 1", "Jane jane NNP 1", "'s 's POS 0.751711", "sentence sentence NN 0.966667", ". . Fp 1", ""]
61
+ parse = Ve::Parse::FreelingEn.new(text, raw)
73
62
  words = parse.words
74
63
  tokens = parse.tokens
75
64
 
76
65
  assert_equal ['This', 'is', "Jane's", 'sentence', '.'], words.collect(&:word)
77
66
  assert_equal ['this', 'be', "jane", 'sentence', '.'], words.collect(&:lemma)
78
67
  assert_equal [Ve::PartOfSpeech::Pronoun, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Symbol], words.collect(&:part_of_speech)
79
- assert_equal [{:grammar => :personal}, {:grammar => nil}, {:gramamr => nil}, {:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
68
+ assert_equal [{:grammar => :personal}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
80
69
  assert_equal [[tokens[0]], [tokens[2]], tokens[4..5], [tokens[7]], [tokens[8]]], words.collect(&:tokens)
81
70
  end
82
71
 
83
72
  def test_date_parsing
84
73
  # Should be turned off. At least for now
85
- freeling = Ve::Provider::FreelingEn.new
86
-
87
- assert_parses_into_words(freeling,
74
+ assert_parses_into_words(Ve::Parse::FreelingEn,
88
75
  {:words => ['January'],
89
76
  :lemmas => ['january'],
90
77
  :pos => [Ve::PartOfSpeech::Noun],
91
78
  :extra => [{:grammar => nil}],
92
79
  :tokens => [0..0]},
93
- 'January')
80
+ 'January', ['January january NN 1'])
94
81
  end
95
82
 
96
83
  def test_symbol_parsing
97
- freeling = Ve::Provider::FreelingEn.new
98
-
99
- assert_parses_into_words(freeling,
84
+ assert_parses_into_words(Ve::Parse::FreelingEn,
100
85
  {:words => ['.', ',', '$'],
101
86
  :lemmas => ['.', ',', '$'],
102
87
  :pos => [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol],
103
88
  :extra => [{:grammar => nil}, {:grammar => nil}, {:grammar => nil}],
104
89
  :tokens => [0..0, 1..1, 2..2]},
105
- '.,$')
90
+ '.,$', ['. . Fp 1', ', , Fc 1', '$ $ Fp', ''])
106
91
  end
107
92
 
108
93
  def test_can_handle_underscores_properly
109
94
  # Should restore them
110
- freeling = Ve::Provider::FreelingEn.new
111
- parse = freeling.parse("In New York")
95
+ text = 'In New York'
96
+ raw = ['In in IN 0.986184', 'New_York new_york NNP 1', '']
97
+ parse = Ve::Parse::FreelingEn.new(text, raw)
112
98
  words = parse.words
113
99
  tokens = parse.tokens
114
100
 
@@ -120,8 +106,10 @@ class FreelingEnTest < Test::Unit::TestCase
120
106
 
121
107
  # Should keep them
122
108
  # TODO
123
- freeling = Ve::Provider::FreelingEn.new
124
- parse = freeling.parse("In New_York")
109
+ skip
110
+ text = 'In New_York'
111
+ raw = ['In in IN 0.986184', 'New_York new_york NNP 1', '']
112
+ parse = Ve::Parse::FreelingEn.new(text, raw)
125
113
  words = parse.words
126
114
  tokens = parse.tokens
127
115
 
@@ -133,3 +121,4 @@ class FreelingEnTest < Test::Unit::TestCase
133
121
  end
134
122
 
135
123
  end
124
+