ve 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7667f10a89f699b284d7a412bab815d46e9bf26d
4
+ data.tar.gz: 7700e9a46ee0321b746ce23c806df91fb54b7252
5
+ SHA512:
6
+ metadata.gz: fa87fa761966cc70ec3edf7dad6b4ef36404a8adc14314f88aa7bd91b34784a98de198d699f6fe8650ace92fb45b9be7c5ac6e23d35b5f1fcf9a138d75d647b0
7
+ data.tar.gz: 78e90e9c7af44b26bebd04bd4f44e002b392438bfe7422950f473d01e99840e5e26f6ca0887a17f76f2d7211f49d44fee2ce7e86ed832824f0d01d5109a6cea4
data/.gitignore CHANGED
@@ -1,4 +1,5 @@
1
1
  .DS_Store
2
2
  .*.swp
3
3
  *.gem
4
+ .rvmrc
4
5
 
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.2
4
+ - 1.9.3
5
+ - jruby-19mode
6
+ - rbx-19mode
7
+ - ruby-head
8
+ - jruby-head
9
+ - ree
data/Gemfile CHANGED
@@ -6,3 +6,8 @@ group :server do
6
6
  gem "sinatra"
7
7
  gem "rack-cors"
8
8
  end
9
+
10
+ group :test do
11
+ gem "rake"
12
+ gem "mocha", :require => false
13
+ end
data/Gemfile.lock CHANGED
@@ -2,11 +2,15 @@ GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
4
  json (1.6.1)
5
+ metaclass (0.0.1)
6
+ mocha (0.11.4)
7
+ metaclass (~> 0.0.1)
5
8
  rack (1.3.5)
6
9
  rack-cors (0.2.4)
7
10
  rack
8
11
  rack-protection (1.1.4)
9
12
  rack
13
+ rake (0.8.7)
10
14
  sinatra (1.3.1)
11
15
  rack (~> 1.3, >= 1.3.4)
12
16
  rack-protection (~> 1.1, >= 1.1.2)
@@ -18,5 +22,7 @@ PLATFORMS
18
22
 
19
23
  DEPENDENCIES
20
24
  json
25
+ mocha
21
26
  rack-cors
27
+ rake
22
28
  sinatra
@@ -8,7 +8,7 @@ require 'open3'
8
8
  class Ve
9
9
  class Provider
10
10
  class FreelingEn < Ve::Provider
11
-
11
+ # FIX: This class isn't tested
12
12
  BIT_STOP = 'VeEnd'
13
13
 
14
14
  # TODO: Automatically set FREELINGSHARE if it's not set?
@@ -27,7 +27,8 @@ class Ve
27
27
  # Interface methods
28
28
 
29
29
  def works?
30
- (["Wrote write VBD 1", ""] == parse('Wrote').tokens.collect { |t| t[:raw] })
30
+ p = parse('Wrote')
31
+ ["Wrote write VBD 1", ""] == p.tokens.collect { |t| t[:raw] }
31
32
  end
32
33
 
33
34
  # Talks to the app and returns a parse object
@@ -41,6 +42,7 @@ class Ve
41
42
  output = []
42
43
 
43
44
  while line = @stdout.readline
45
+ puts line
44
46
  if line =~ /#{BIT_STOP}/x
45
47
  @stdout.readline
46
48
  break
@@ -7,31 +7,31 @@ class Ve
7
7
  class MecabIpadic < Ve::Provider
8
8
 
9
9
  BIT_STOP = 'VeEnd'
10
-
10
+
11
11
  def initialize(config = {})
12
12
  # TODO: Make config handling better
13
13
  @config = {:app => 'mecab',
14
14
  :path => '',
15
15
  :flags => ''}.merge(config)
16
-
17
- @config[:app] = `which #{@config[:app]}`
18
-
16
+
17
+ @config[:app] = `which #{@config[:app]}`.chomp
18
+
19
19
  start!
20
20
  end
21
-
21
+
22
22
  def works?
23
23
  (["だっ\t助動詞,*,*,*,特殊・ダ,連用タ接続,だ,ダッ,ダッ",
24
24
  "た\t助動詞,*,*,*,特殊・タ,基本形,た,タ,タ",
25
25
  "EOS"] == parse('だった').tokens.collect { |t| t[:raw] } )
26
26
  end
27
-
27
+
28
28
  # Talks to the app and returns a parse object
29
29
  def parse(text, options = {})
30
30
  start! if @stdin.nil? # Restart if the provider crashed
31
-
31
+
32
32
  @stdin.puts "#{text} #{BIT_STOP}"
33
33
  output = []
34
-
34
+
35
35
  while line = @stdout.readline.force_encoding('UTF-8')
36
36
  if line =~ /#{BIT_STOP}/x
37
37
  output << @stdout.readline # Catch the EOS
@@ -39,25 +39,25 @@ class Ve
39
39
  end
40
40
  output << line
41
41
  end
42
-
42
+
43
43
  Ve::Parse::MecabIpadic.new(text, output)
44
- rescue
44
+ rescue => e
45
45
  # TODO: No good to catch all errors like this
46
46
  # I need a backtrace when something unexpected fails
47
47
  Ve::Parse::MecabIpadic.new(text, [])
48
48
  end
49
49
 
50
50
  private
51
-
51
+
52
52
  # TODO: Use Process.spawn/kill for process control?
53
53
  def start!
54
- @stdin, @stdout, @stderr = Open3.popen3(@config[:app])
54
+ @stdin, @stdout, @stderr = Open3.popen3("#{@config[:app]} #{@config[:flags]}")
55
55
  @stdin.set_encoding('UTF-8')
56
56
  @stdout.set_encoding('UTF-8')
57
- rescue Errno::ENOENT
57
+ rescue Errno::ENOENT => e
58
58
  # The parser couldn't be started. Probably not installed on this system
59
59
  end
60
-
60
+
61
61
  end
62
62
  end
63
63
  end
@@ -65,15 +65,15 @@ end
65
65
  class Ve
66
66
  class Parse
67
67
  class MecabIpadic < Ve::Parse
68
-
68
+
69
69
  PARSER = %r{^ (.+?) \t (.+) }x
70
70
  attr_reader :tokens, :text
71
-
71
+
72
72
  def initialize(text, output)
73
73
  @tokens = []
74
74
  @text = text
75
75
  position = 0
76
-
76
+
77
77
  output.each_with_index do |line, index|
78
78
  line.rstrip!
79
79
  token = {:raw => line}
@@ -87,7 +87,7 @@ class Ve
87
87
  @tokens << unparsed_token
88
88
  end
89
89
  end
90
-
90
+
91
91
  if line =~ %r{^ EOS $}x
92
92
  token[:type] = :sentence_split
93
93
  token[:literal] = ''
@@ -99,7 +99,7 @@ class Ve
99
99
  [:pos, :pos2, :pos3, :pos4, :inflection_type, :inflection_form, :lemma, :reading, :hatsuon].each_with_index do |attr, i|
100
100
  token[attr] = info[i]
101
101
  end
102
-
102
+
103
103
  # Anything unparsed preceding this token
104
104
  unparsed_md = %r{(.*?) #{Regexp.quote(token[:literal])}}mx.match(text, position)
105
105
  if unparsed_md[1].length > 0
@@ -108,7 +108,7 @@ class Ve
108
108
  @tokens << unparsed_token
109
109
  position += unparsed_token[:literal].length
110
110
  end
111
-
111
+
112
112
  token[:characters] = (position..(position+token[:literal].length-1))
113
113
  position += token[:literal].length
114
114
  else
@@ -118,7 +118,7 @@ class Ve
118
118
  @tokens << token
119
119
  end
120
120
  end
121
-
121
+
122
122
  # PoS
123
123
  MEISHI = '名詞'
124
124
  KOYUUMEISHI = '固有名詞'
@@ -159,6 +159,11 @@ class Ve
159
159
  TOKUSHU_DESU = '特殊・デス'
160
160
  TOKUSHU_DA = '特殊・ダ'
161
161
  TOKUSHU_MASU = '特殊・マス'
162
+ TOKUSHU_NU = '特殊・ヌ'
163
+ FUHENKAGATA = '不変化型'
164
+ JINMEI = '人名'
165
+ MEIREI_I = '命令i'
166
+ KAKARIJOSHI = '係助詞'
162
167
 
163
168
  # Etc
164
169
  NA = 'な'
@@ -166,11 +171,14 @@ class Ve
166
171
  TE = 'て'
167
172
  DE = 'で'
168
173
  BA = 'ば'
174
+ NN = 'ん'
175
+ SA = 'さ'
169
176
 
170
177
  def words
171
178
  words = []
172
179
  tokens = @tokens.find_all { |t| t[:type] == :parsed }
173
180
  tokens = tokens.to_enum
181
+ previous = nil
174
182
 
175
183
  # This is becoming very big
176
184
  begin
@@ -181,6 +189,7 @@ class Ve
181
189
  eat_lemma = true
182
190
  attach_to_previous = false
183
191
  also_attach_to_lemma = false
192
+ update_pos = false
184
193
 
185
194
  case token[:pos]
186
195
  when MEISHI
@@ -208,7 +217,7 @@ class Ve
208
217
  eat_next = true
209
218
  elsif following[:pos] == JOSHI && following[:literal] == NI
210
219
  pos = Ve::PartOfSpeech::Adverb
211
- eat_next = true
220
+ eat_next = false
212
221
  end
213
222
  end
214
223
  when HIJIRITSU, TOKUSHU
@@ -246,8 +255,13 @@ class Ve
246
255
  also_attach_to_lemma = true
247
256
  end
248
257
  when SETSUBI
249
- # TODO: elaborate a bit?
250
- pos = Ve::PartOfSpeech::Suffix
258
+ if token[:pos3] == TOKUSHU && token[:lemma] == SA
259
+ attach_to_previous = true
260
+ update_pos = true
261
+ pos = Ve::PartOfSpeech::Noun
262
+ else
263
+ pos = Ve::PartOfSpeech::Suffix
264
+ end
251
265
  when SETSUZOKUSHITEKI
252
266
  pos = Ve::PartOfSpeech::Conjunction
253
267
  when DOUSHIHIJIRITSUTEKI
@@ -260,7 +274,10 @@ class Ve
260
274
  when JODOUSHI
261
275
  pos = Ve::PartOfSpeech::Postposition
262
276
 
263
- if [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU].include?(token[:inflection_type])
277
+ if (previous.nil? || (!previous.nil? && previous[:pos2] != KAKARIJOSHI)) &&
278
+ [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU, TOKUSHU_NU].include?(token[:inflection_type])
279
+ attach_to_previous = true
280
+ elsif token[:inflection_type] == FUHENKAGATA && token[:lemma] == NN
264
281
  attach_to_previous = true
265
282
  elsif (token[:inflection_type] == TOKUSHU_DA || token[:inflection_type] == TOKUSHU_DESU) && token[:literal] != NA
266
283
  pos = Ve::PartOfSpeech::Verb
@@ -269,8 +286,8 @@ class Ve
269
286
  pos = Ve::PartOfSpeech::Verb
270
287
  if token[:pos2] == SETSUBI
271
288
  attach_to_previous = true
272
- elsif token[:pos2] == HIJIRITSU
273
- grammar = :auxillary
289
+ elsif token[:pos2] == HIJIRITSU && token[:inflection_form] != MEIREI_I
290
+ attach_to_previous = true
274
291
  end
275
292
  when KEIYOUSHI
276
293
  pos = Ve::PartOfSpeech::Adjective
@@ -301,6 +318,7 @@ class Ve
301
318
  words[-1].extra[:reading] << (token[:reading] || '')
302
319
  words[-1].extra[:transcription] << (token[:hatsuon] || '')
303
320
  words[-1].lemma << token[:lemma] if also_attach_to_lemma
321
+ words[-1].part_of_speech = pos if update_pos
304
322
  else
305
323
  pos = Ve::PartOfSpeech::TBD if pos.nil?
306
324
  word = Ve::Word.new(token[:literal], token[:lemma], pos, [token], {
@@ -323,18 +341,20 @@ class Ve
323
341
 
324
342
  words << word
325
343
  end
344
+
345
+ previous = token
326
346
  end
327
347
  rescue StopIteration
328
348
  end
329
349
 
330
350
  return words
331
351
  end
332
-
352
+
333
353
  def sentences
334
354
  # TODO: Sentence objects that keep track of the sentence's tokens
335
355
  sentences = []
336
356
  current = ''
337
-
357
+
338
358
  @tokens.each do |token|
339
359
  if token[:type] == :sentence_split
340
360
  sentences << current
@@ -347,13 +367,13 @@ class Ve
347
367
  current << token[:literal]
348
368
  end
349
369
  end
350
-
370
+
351
371
  # In case there is no :sentence_split at the end
352
372
  sentences << current if current.length > 0
353
-
373
+
354
374
  sentences
355
375
  end
356
-
376
+
357
377
  end
358
378
  end
359
379
  end
data/lib/ve.rb CHANGED
@@ -10,23 +10,34 @@ require 'pp'
10
10
  class Ve
11
11
 
12
12
  class Manager
13
+ @@config_for = {}
14
+
15
+ def self.set_default_config_for(klass, config = {})
16
+ @@config_for[klass] = config
17
+ end
18
+
13
19
  def self.provider_for(language, function)
14
- @@provider_for[language.to_sym][function.to_sym]
20
+ provider = @@provider_for[language.to_sym][function.to_sym]
21
+ if provider.is_a?(Class)
22
+ config = @@config_for[provider] || {}
23
+ provider = @@provider_for[language.to_sym][function.to_sym].new(config)
24
+ @@provider_for[language.to_sym][function.to_sym] = provider
25
+ end
26
+ provider
15
27
  end
16
28
 
17
29
  # TODO: Make a difference between what features are available locally
18
30
  # and what requires contacting external Ves
19
31
  def self.register(klass, language)
20
32
  @@provider_for ||= {}
21
- provider = klass.new
22
33
  # This won't work if people start monkey patching the providers with public methods that arent abilities
23
34
  # It's also not pretty, but kinda nifty
24
- provider_name = provider.class.to_s.split('::').last
35
+ provider_name = klass.to_s.split('::').last
25
36
  parse_class = Kernel.class_eval("Ve::Parse::#{provider_name}")
26
37
  abilities = parse_class.public_instance_methods - Object.public_instance_methods
27
38
  abilities.each do |a|
28
39
  @@provider_for[language.to_sym] ||= {}
29
- @@provider_for[language.to_sym][a] = provider
40
+ @@provider_for[language.to_sym][a] = klass
30
41
  end
31
42
  end
32
43
  end
@@ -2,31 +2,7 @@
2
2
 
3
3
  require_relative 'test_helper'
4
4
 
5
- class FreelingEnTest < Test::Unit::TestCase
6
-
7
- def test_should_be_able_to_start
8
- freeling = Ve::Provider::FreelingEn.new
9
- assert freeling.works?
10
- end
11
-
12
- def test_doesnt_die_on_japanese
13
- freeling = Ve::Provider::FreelingEn.new
14
- parse = freeling.parse('これは日本語です')
15
- assert_equal Ve::Parse::FreelingEn, parse.class
16
- end
17
-
18
- # TODO: UTF-8 handling
19
- def test_can_handle_utf8
20
- freeling = Ve::Provider::FreelingEn.new
21
- parse = freeling.parse('I’m')
22
- assert_equal ['I\'m'], parse.tokens.collect { |t| t[:literal] }
23
- end
24
-
25
- def test_can_parse
26
- freeling = Ve::Provider::FreelingEn.new
27
- parse = freeling.parse('')
28
- assert_equal Ve::Parse::FreelingEn, parse.class
29
- end
5
+ class FreelingEnParseTest < MiniTest::Unit::TestCase
30
6
 
31
7
  def test_all_literals_should_equal_the_input_text
32
8
  text = <<-EOS
@@ -35,27 +11,30 @@ class FreelingEnTest < Test::Unit::TestCase
35
11
  Z
36
12
 
37
13
  EOS
38
- freeling = Ve::Provider::FreelingEn.new
39
- parse = freeling.parse(text)
14
+ raw = ["There there EX 0.857656", "once once RB 0.809237", "was be VBD 1", "a a DT 0.333333", "man man NN 0.980535", "from from IN 1", "X x NNP 1", "", "Who who WP 1", "took take VBD 1", "it it PRP 1", "upon upon IN 0.915152", "himself himself PRP 1", "to to TO 0.999909", "Y y NNP 1", "", "Z z NNP 1", ""]
15
+ parse = Ve::Parse::FreelingEn.new(text, raw)
40
16
  assert_equal text, parse.tokens.collect { |t| t[:literal] }.join
41
17
  end
42
18
 
43
19
  def test_creates_tokens_from_data_that_is_ignored_in_parsing
44
- freeling = Ve::Provider::FreelingEn.new
45
- parse = freeling.parse('A B ')
20
+ text = 'A B '
21
+ raw = ['A a DT 0.333333', 'B b NNP 1', '']
22
+ parse = Ve::Parse::FreelingEn.new(text, raw)
46
23
  assert_equal [:parsed, :unparsed, :parsed, :unparsed, :sentence_split], parse.tokens.collect { |t| t[:type] }
47
24
  assert_equal ['A', ' ', 'B', ' ', ''], parse.tokens.collect { |t| t[:literal] }
48
25
  end
49
26
 
50
27
  def test_can_give_sentences
51
- freeling = Ve::Provider::FreelingEn.new
52
- parse = freeling.parse('This is a sentence. And this was another one')
28
+ text = 'This is a sentence. And this was another one'
29
+ raw = ['This this PRP 0.0001755', 'is be VBZ 1', 'a a DT 0.333333', 'sentence sentence NN 0.966667', '. . Fp 1', '', 'And and CC 1', 'this this PRP 0.0001755', 'was be VBD 1', 'another another DT 0.999067', 'one one NN 0.25', '']
30
+ parse = Ve::Parse::FreelingEn.new(text, raw)
53
31
  assert_equal ['This is a sentence.', 'And this was another one'], parse.sentences
54
32
  end
55
33
 
56
34
  def test_can_give_words
57
- freeling = Ve::Provider::FreelingEn.new
58
- parse = freeling.parse('This was a sentence.')
35
+ text = 'This was a sentence.'
36
+ raw = ['This this PRP 0.0001755', 'was be VBD 1', 'a a DT 0.333333', 'sentence sentence NN 0.966667', '. . Fp 1', '']
37
+ parse = Ve::Parse::FreelingEn.new(text, raw)
59
38
  words = parse.words
60
39
  tokens = parse.tokens
61
40
 
@@ -67,48 +46,55 @@ class FreelingEnTest < Test::Unit::TestCase
67
46
  assert_equal [[tokens[0]], [tokens[2]], [tokens[4]], [tokens[6]], [tokens[7]]], words.collect(&:tokens)
68
47
  end
69
48
 
49
+ def test_words_can_handle_contractions
50
+ # TODO
51
+ skip
52
+ text = "I'm eating."
53
+ raw = ['I i PRP 1', "'m 'm VBP 0.997563", 'eating eat VBG 1', '. . Fp 1', '']
54
+ parse = Ve::Parse::FreelingEn.new(text, raw)
55
+ assert_equal ["I'm", "eating", "."], parse.tokens.collect { |t| t[:literal] }
56
+ end
57
+
70
58
  def test_possessive_endings_must_be_reattached
71
- freeling = Ve::Provider::FreelingEn.new
72
- parse = freeling.parse("This is Jane's sentence.")
59
+ text = "This is Jane's sentence."
60
+ raw = ["This this PRP 0.0001755", "is be VBZ 1", "Jane jane NNP 1", "'s 's POS 0.751711", "sentence sentence NN 0.966667", ". . Fp 1", ""]
61
+ parse = Ve::Parse::FreelingEn.new(text, raw)
73
62
  words = parse.words
74
63
  tokens = parse.tokens
75
64
 
76
65
  assert_equal ['This', 'is', "Jane's", 'sentence', '.'], words.collect(&:word)
77
66
  assert_equal ['this', 'be', "jane", 'sentence', '.'], words.collect(&:lemma)
78
67
  assert_equal [Ve::PartOfSpeech::Pronoun, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Symbol], words.collect(&:part_of_speech)
79
- assert_equal [{:grammar => :personal}, {:grammar => nil}, {:gramamr => nil}, {:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
68
+ assert_equal [{:grammar => :personal}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
80
69
  assert_equal [[tokens[0]], [tokens[2]], tokens[4..5], [tokens[7]], [tokens[8]]], words.collect(&:tokens)
81
70
  end
82
71
 
83
72
  def test_date_parsing
84
73
  # Should be turned off. At least for now
85
- freeling = Ve::Provider::FreelingEn.new
86
-
87
- assert_parses_into_words(freeling,
74
+ assert_parses_into_words(Ve::Parse::FreelingEn,
88
75
  {:words => ['January'],
89
76
  :lemmas => ['january'],
90
77
  :pos => [Ve::PartOfSpeech::Noun],
91
78
  :extra => [{:grammar => nil}],
92
79
  :tokens => [0..0]},
93
- 'January')
80
+ 'January', ['January january NN 1'])
94
81
  end
95
82
 
96
83
  def test_symbol_parsing
97
- freeling = Ve::Provider::FreelingEn.new
98
-
99
- assert_parses_into_words(freeling,
84
+ assert_parses_into_words(Ve::Parse::FreelingEn,
100
85
  {:words => ['.', ',', '$'],
101
86
  :lemmas => ['.', ',', '$'],
102
87
  :pos => [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol],
103
88
  :extra => [{:grammar => nil}, {:grammar => nil}, {:grammar => nil}],
104
89
  :tokens => [0..0, 1..1, 2..2]},
105
- '.,$')
90
+ '.,$', ['. . Fp 1', ', , Fc 1', '$ $ Fp', ''])
106
91
  end
107
92
 
108
93
  def test_can_handle_underscores_properly
109
94
  # Should restore them
110
- freeling = Ve::Provider::FreelingEn.new
111
- parse = freeling.parse("In New York")
95
+ text = 'In New York'
96
+ raw = ['In in IN 0.986184', 'New_York new_york NNP 1', '']
97
+ parse = Ve::Parse::FreelingEn.new(text, raw)
112
98
  words = parse.words
113
99
  tokens = parse.tokens
114
100
 
@@ -120,8 +106,10 @@ class FreelingEnTest < Test::Unit::TestCase
120
106
 
121
107
  # Should keep them
122
108
  # TODO
123
- freeling = Ve::Provider::FreelingEn.new
124
- parse = freeling.parse("In New_York")
109
+ skip
110
+ text = 'In New_York'
111
+ raw = ['In in IN 0.986184', 'New_York new_york NNP 1', '']
112
+ parse = Ve::Parse::FreelingEn.new(text, raw)
125
113
  words = parse.words
126
114
  tokens = parse.tokens
127
115
 
@@ -133,3 +121,4 @@ class FreelingEnTest < Test::Unit::TestCase
133
121
  end
134
122
 
135
123
  end
124
+