natto 0.9.9 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -4,9 +4,9 @@ require 'natto/option_parse'
4
4
  require 'natto/struct'
5
5
 
6
6
  module Natto
7
- # `MeCab` is a wrapper class for the MeCab `Tagger`.
8
- # Options to the MeCab `Tagger` are passed in as a string
9
- # (MeCab command-line style) or as a Ruby-style hash at
7
+ # `MeCab` is a class providing an interface to the MeCab library.
8
+ # Options to the MeCab Model, Tagger and Lattice are passed in
9
+ # as a string (MeCab command-line style) or as a Ruby-style hash at
10
10
  # initialization.
11
11
  #
12
12
  # ## Usage
@@ -16,14 +16,16 @@ module Natto
16
16
  # text = '凡人にしか見えねえ風景ってのがあるんだよ。'
17
17
  #
18
18
  # nm = Natto::MeCab.new
19
- # => #<Natto::MeCab:0x28d3bdc8 \
20
- # @tagger=#<FFI::Pointer address=0x28afb980>, \
21
- # @libpath="/usr/local/lib/libmecab.so" \
22
- # @options={}, \
23
- # @dicts=[#<Natto::DictionaryInfo:0x289a1f14 \
19
+ # => #<Natto::MeCab:0x0000080318d278 \
20
+ # @model=#<FFI::Pointer address=0x000008039174c0>, \
21
+ # @tagger=#<FFI::Pointer address=0x0000080329ba60>, \
22
+ # @lattice=#<FFI::Pointer address=0x000008045bd140>, \
23
+ # @libpath="/usr/local/lib/libmecab.so" \
24
+ # @options={}, \
25
+ # @dicts=[#<Natto::DictionaryInfo:0x0000080318ce90 \
24
26
  # @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
25
- # charset=utf8, \
26
- # type=0>], \
27
+ # charset=utf8, \
28
+ # type=0>], \
27
29
  # @version=0.996>
28
30
  #
29
31
  # # print entire MeCab result to stdout
@@ -104,7 +106,7 @@ module Natto
104
106
  #
105
107
  # # Boundary constraint parsing with output formatting.
106
108
  # # %m ... morpheme surface
107
- # # %F, ... comma-delimited ChaSen feature values
109
+ # # %f ... tab-delimited ChaSen feature values
108
110
  # # part-of-speech (index 0)
109
111
  # # %2 ... MeCab node status value (1 unknown)
110
112
  # #
@@ -148,19 +150,22 @@ module Natto
148
150
  MECAB_TOKEN_BOUNDARY = 1
149
151
  MECAB_INSIDE_TOKEN = 2
150
152
 
151
- # @return [FFI:Pointer] pointer to MeCab tagger.
153
+ # @return [FFI::Pointer] pointer to MeCab Model.
154
+ attr_reader :model
155
+ # @return [FFI::Pointer] pointer to MeCab Tagger.
152
156
  attr_reader :tagger
157
+ # @return [FFI::Pointer] pointer to MeCab Lattice.
158
+ attr_reader :lattice
153
159
  # @return [String] absolute filepath to MeCab library.
154
160
  attr_reader :libpath
155
161
  # @return [Hash] MeCab options as key-value pairs.
156
162
  attr_reader :options
157
163
  # @return [Array] listing of all of dictionaries referenced.
158
164
  attr_reader :dicts
159
- # @return [String] `MeCab` version.
165
+ # @return [String] MeCab version.
160
166
  attr_reader :version
161
167
 
162
- # Initializes the wrapped `Tagger` instance with the
163
- # given `options`.
168
+ # Initializes the wrapped Tagger instance with the given `options`.
164
169
  #
165
170
  # Options supported are:
166
171
  #
@@ -186,19 +191,21 @@ module Natto
186
191
  # - :cost_factor -- cost factor (integer, default 700)
187
192
  #
188
193
  # <p>MeCab command-line arguments (-F) or long (--node-format) may be used in
189
- # addition to Ruby-style `Hash`es</p>
194
+ # addition to Ruby-style hashes</p>
190
195
  # <i>Use single-quotes to preserve format options that contain escape chars.</i><br/>
191
196
  # e.g.<br/>
192
197
  #
193
198
  # nm = Natto::MeCab.new(node_format: '%m¥t%f[7]¥n')
194
- # => #<Natto::MeCab:0x28d2ae10
195
- # @tagger=#<FFI::Pointer address=0x28a97980>, \
196
- # @libpath="/usr/local/lib/libmecab.so", \
197
- # @options={:node_format=>"%m¥t%f[7]¥n"}, \
198
- # @dicts=[#<Natto::DictionaryInfo:0x28d2a85c \
199
+ # => #<Natto::MeCab:0x00000803503ee8 \
200
+ # @model=#<FFI::Pointer address=0x00000802b6d9c0>, \
201
+ # @tagger=#<FFI::Pointer address=0x00000802ad3ec0>, \
202
+ # @lattice=#<FFI::Pointer address=0x000008035f3980>, \
203
+ # @libpath="/usr/local/lib/libmecab.so", \
204
+ # @options={:node_format=>"%m¥t%f[7]¥n"}, \
205
+ # @dicts=[#<Natto::DictionaryInfo:0x000008035038f8 \
199
206
  # @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic" \
200
- # charset=utf8, \
201
- # type=0>] \
207
+ # charset=utf8, \
208
+ # type=0>] \
202
209
  # @version=0.996>
203
210
  #
204
211
  # puts nm.parse('才能とは求める人間に与えられるものではない。')
@@ -216,210 +223,216 @@ module Natto
216
223
  # ない ナイ
217
224
  # 。 。
218
225
  # EOS
219
- #
220
- # @param options [Hash, String] the MeCab options for tagger
221
- # @raise [MeCabError] if `mecab` cannot be initialized with the given `options`
226
+ # @param options [Hash, String] the MeCab options
227
+ # @raise [MeCabError] if MeCab cannot be initialized with the given `options`
222
228
  def initialize(options={})
223
229
  @options = self.class.parse_mecab_options(options)
224
- @dicts = []
225
- # TODO invoke function for enhancing MeCabNode after this point
226
-
227
230
  opt_str = self.class.build_options_str(@options)
228
- @tagger = self.class.mecab_new2(opt_str)
231
+
232
+ @model = self.class.mecab_model_new2(opt_str)
233
+ if @model.address == 0x0
234
+ raise MeCabError.new("Could not initialize Model with options: '#{opt_str}'")
235
+ end
236
+
237
+ @tagger = self.class.mecab_model_new_tagger(@model)
238
+ if @tagger.address == 0x0
239
+ raise MeCabError.new("Could not initialize Tagger with options: '#{opt_str}'")
240
+ end
241
+
242
+ @lattice = self.class.mecab_model_new_lattice(@model)
243
+ if @lattice.address == 0x0
244
+ raise MeCabError.new("Could not initialize Lattice with options: '#{opt_str}'")
245
+ end
246
+
229
247
  @libpath = self.class.find_library
230
- raise MeCabError.new("Could not initialize MeCab with options: '#{opt_str}'") if @tagger.address == 0x0
231
-
232
- self.mecab_set_theta(@tagger, @options[:theta]) if @options[:theta]
233
- self.mecab_set_lattice_level(@tagger, @options[:lattice_level]) if @options[:lattice_level]
234
- self.mecab_set_all_morphs(@tagger, 1) if @options[:all_morphs]
235
- self.mecab_set_partial(@tagger, 1) if @options[:partial]
236
-
237
- # Define lambda for each major parsing type: _tostr, _tonode,
238
- # boundary constraint _tostr, boundary constraint _node;
239
- # and each parsing type will support both normal and N-best
240
- # options
241
- @parse_tostr = ->(text) {
242
- if @options[:nbest] && @options[:nbest] > 1
243
- #self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
244
- retval = self.mecab_nbest_sparse_tostr(@tagger, @options[:nbest], text) ||
245
- raise(MeCabError.new(self.mecab_strerror(@tagger)))
246
- else
247
- retval = self.mecab_sparse_tostr(@tagger, text) ||
248
- raise(MeCabError.new(self.mecab_strerror(@tagger)))
249
- end
250
248
 
251
- retval.force_encoding(Encoding.default_external)
252
- }
249
+ if @options[:nbest] && @options[:nbest] > 1
250
+ self.mecab_lattice_set_request_type(@lattice, MECAB_LATTICE_NBEST)
251
+ else
252
+ self.mecab_lattice_set_request_type(@lattice, MECAB_LATTICE_ONE_BEST)
253
+ end
254
+ if @options[:partial]
255
+ self.mecab_lattice_add_request_type(@lattice, MECAB_LATTICE_PARTIAL)
256
+ end
257
+ if @options[:marginal]
258
+ self.mecab_lattice_add_request_type(@lattice,
259
+ MECAB_LATTICE_MARGINAL_PROB)
260
+ end
261
+ if @options[:all_morphs]
262
+ # required when node parsing
263
+ #self.mecab_lattice_add_request_type(@lattice, MECAB_LATTICE_NBEST)
264
+ self.mecab_lattice_add_request_type(@lattice,
265
+ MECAB_LATTICE_ALL_MORPHS)
266
+ end
267
+ if @options[:allocate_sentence]
268
+ self.mecab_lattice_add_request_type(@lattice,
269
+ MECAB_LATTICE_ALLOCATE_SENTENCE)
270
+ end
253
271
 
254
- @parse_tonodes = ->(text) {
255
- Enumerator.new do |y|
256
- if @options[:nbest] && @options[:nbest] > 1
257
- nlen = @options[:nbest]
258
- #self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
259
- self.mecab_nbest_init(@tagger, text)
260
- nptr = self.mecab_nbest_next_tonode(@tagger)
261
- else
262
- nlen = 1
263
- nptr = self.mecab_sparse_tonode(@tagger, text)
264
- end
265
- raise(MeCabError.new(self.mecab_strerror(@tagger))) if nptr.nil? || nptr.address==0x0
266
-
267
- nlen.times do
268
- s = text.bytes.to_a
269
- while nptr && nptr.address != 0x0
270
- mn = Natto::MeCabNode.new(nptr)
271
- # ignore BOS nodes, since mecab does so
272
- if !mn.is_bos?
273
- s = s.drop_while {|e| (e==0xa || e==0x20)}
274
- if !s.empty?
275
- sarr = []
276
- mn.length.times { sarr << s.shift }
277
- surf = sarr.pack('C*')
278
- mn.surface = surf.force_encoding(Encoding.default_external)
279
- end
280
- if @options[:output_format_type] || @options[:node_format]
281
- mn.feature = self.mecab_format_node(@tagger, nptr).force_encoding(Encoding.default_external)
282
- end
283
- y.yield mn
284
- end
285
- nptr = mn.next
286
- end
287
- if nlen > 1
288
- nptr = self.mecab_nbest_next_tonode(@tagger)
289
- end
290
- end
291
- end
292
- }
293
-
294
- @bcparse_tostr = ->(text, boundary_constraints=/./) {
295
- begin
296
- lattice = self.mecab_lattice_new()
297
- raise MeCabError.new("Could not create Lattice") if lattice.address == 0x0
272
+ if @options[:theta]
273
+ self.mecab_lattice_set_theta(@lattice, @options[:theta])
274
+ end
298
275
 
276
+ @parse_tostr = ->(text, constraints) {
277
+ begin
299
278
  if @options[:nbest] && @options[:nbest] > 1
300
279
  n = @options[:nbest]
301
- self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_NBEST)
302
280
  else
303
281
  n = 1
304
- self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_ONE_BEST)
305
- end
306
- if @options[:theta]
307
- self.mecab_lattice_set_theta(lattice, @options[:theta])
308
282
  end
309
283
 
310
- tokens = tokenize(text, boundary_constraints)
311
- text = tokens.map {|t| t.first}.join
312
- self.mecab_lattice_set_sentence(lattice, text)
313
-
314
- bpos = 0
315
- tokens.each do |token|
316
- c = token.first.bytes.count
284
+ if constraints[:boundary_constraints]
285
+ tokens = tokenize_by_pattern(text,
286
+ constraints[:boundary_constraints])
287
+ text = tokens.map {|t| t.first}.join
288
+ self.mecab_lattice_set_sentence(@lattice, text)
317
289
 
318
- self.mecab_lattice_set_boundary_constraint(lattice, bpos, MECAB_TOKEN_BOUNDARY)
319
- bpos += 1
290
+ bpos = 0
291
+ tokens.each do |token|
292
+ c = token.first.bytes.count
320
293
 
321
- mark = token.last ? MECAB_INSIDE_TOKEN : MECAB_ANY_BOUNDARY
322
- (c-1).times do
323
- self.mecab_lattice_set_boundary_constraint(lattice, bpos, mark)
294
+ self.mecab_lattice_set_boundary_constraint(@lattice,
295
+ bpos,
296
+ MECAB_TOKEN_BOUNDARY)
324
297
  bpos += 1
298
+
299
+ mark = token.last ? MECAB_INSIDE_TOKEN : MECAB_ANY_BOUNDARY
300
+ (c-1).times do
301
+ self.mecab_lattice_set_boundary_constraint(@lattice,
302
+ bpos,
303
+ mark)
304
+ bpos += 1
305
+ end
325
306
  end
307
+ elsif constraints[:feature_constraints]
308
+ features = constraints[:feature_constraints]
309
+ tokens = tokenize_by_features(text,
310
+ features.keys)
311
+ text = tokens.map {|t| t.first}.join
312
+ self.mecab_lattice_set_sentence(@lattice, text)
313
+
314
+ bpos = 0
315
+ tokens.each do |token|
316
+ chunk = token.first
317
+ c = chunk.bytes.count
318
+ if token.last
319
+ self.mecab_lattice_set_feature_constraint(@lattice,
320
+ bpos,
321
+ bpos+c,
322
+ features[chunk])
323
+ end
324
+ bpos += c
325
+ end
326
+ else
327
+ self.mecab_lattice_set_sentence(@lattice, text)
326
328
  end
327
329
 
328
- self.mecab_parse_lattice(@tagger, lattice)
330
+ self.mecab_parse_lattice(@tagger, @lattice)
329
331
 
330
332
  if n > 1
331
- retval = self.mecab_lattice_nbest_tostr(lattice, n)
333
+ retval = self.mecab_lattice_nbest_tostr(@lattice, n)
332
334
  else
333
- retval = self.mecab_lattice_tostr(lattice)
335
+ retval = self.mecab_lattice_tostr(@lattice)
334
336
  end
335
337
  retval.force_encoding(Encoding.default_external)
336
338
  rescue
337
- raise(MeCabError.new(self.mecab_lattice_strerror(lattice)))
338
- ensure
339
- if lattice.address != 0x0
340
- self.mecab_lattice_destroy(lattice)
341
- end
339
+ raise(MeCabError.new(self.mecab_lattice_strerror(@lattice)))
342
340
  end
343
341
  }
344
342
 
345
- @bcparse_tonodes = ->(text, boundary_constraints=/./) {
343
+ @parse_tonodes = ->(text, constraints) {
344
+ self.mecab_lattice_add_request_type(@lattice, MECAB_LATTICE_NBEST)
346
345
  Enumerator.new do |y|
347
346
  begin
348
- lattice = self.mecab_lattice_new()
349
- raise MeCabError.new("Could not create Lattice") if lattice.address == 0x0
350
-
351
347
  if @options[:nbest] && @options[:nbest] > 1
352
348
  n = @options[:nbest]
353
- self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_NBEST)
354
349
  else
355
350
  n = 1
356
- self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_ONE_BEST)
357
- end
358
- if @options[:theta]
359
- self.mecab_lattice_set_theta(lattice, @options[:theta])
360
351
  end
361
352
 
362
- tokens = tokenize(text, boundary_constraints)
363
- text = tokens.map {|t| t.first}.join
364
- self.mecab_lattice_set_sentence(lattice, text)
365
-
366
- bpos = 0
367
- tokens.each do |token|
368
- c = token.first.bytes.count
353
+ if constraints[:boundary_constraints]
354
+ tokens = tokenize_by_pattern(text,
355
+ constraints[:boundary_constraints])
356
+ text = tokens.map {|t| t.first}.join
357
+ self.mecab_lattice_set_sentence(@lattice, text)
369
358
 
370
- self.mecab_lattice_set_boundary_constraint(lattice, bpos, MECAB_TOKEN_BOUNDARY)
371
- bpos += 1
359
+ bpos = 0
360
+ tokens.each do |token|
361
+ c = token.first.bytes.count
372
362
 
373
- mark = token.last ? MECAB_INSIDE_TOKEN : MECAB_ANY_BOUNDARY
374
- (c-1).times do
375
- self.mecab_lattice_set_boundary_constraint(lattice, bpos, mark)
363
+ self.mecab_lattice_set_boundary_constraint(@lattice,
364
+ bpos,
365
+ MECAB_TOKEN_BOUNDARY)
376
366
  bpos += 1
367
+
368
+ mark = token.last ? MECAB_INSIDE_TOKEN : MECAB_ANY_BOUNDARY
369
+ (c-1).times do
370
+ self.mecab_lattice_set_boundary_constraint(@lattice, bpos, mark)
371
+ bpos += 1
372
+ end
373
+ end
374
+ elsif constraints[:feature_constraints]
375
+ features = constraints[:feature_constraints]
376
+ tokens = tokenize_by_features(text,
377
+ features.keys)
378
+ text = tokens.map {|t| t.first}.join
379
+ self.mecab_lattice_set_sentence(@lattice, text)
380
+
381
+ bpos = 0
382
+ tokens.each do |token|
383
+ chunk = token.first
384
+ c = chunk.bytes.count
385
+ if token.last
386
+ self.mecab_lattice_set_feature_constraint(@lattice,
387
+ bpos,
388
+ bpos+c,
389
+ features[chunk])
390
+ end
391
+ bpos += c
377
392
  end
393
+ else
394
+ self.mecab_lattice_set_sentence(@lattice, text)
378
395
  end
379
396
 
380
- self.mecab_parse_lattice(@tagger, lattice)
397
+ self.mecab_parse_lattice(@tagger, @lattice)
381
398
 
382
399
  n.times do
383
- check = self.mecab_lattice_next(lattice)
400
+ check = self.mecab_lattice_next(@lattice)
384
401
  if check
385
- nptr = self.mecab_lattice_get_bos_node(lattice)
402
+ nptr = self.mecab_lattice_get_bos_node(@lattice)
386
403
 
387
- s = text.bytes.to_a
388
404
  while nptr && nptr.address!=0x0
389
405
  mn = Natto::MeCabNode.new(nptr)
390
- s = s.drop_while {|e| (e==0xa || e==0x20)}
391
- if !s.empty?
392
- sarr = []
393
- mn.length.times { sarr << s.shift }
394
- surf = sarr.pack('C*')
406
+ if !mn.is_bos?
407
+ surf = mn[:surface].bytes.to_a.slice(0,mn.length).pack('C*')
395
408
  mn.surface = surf.force_encoding(Encoding.default_external)
409
+ if @options[:output_format_type] || @options[:node_format]
410
+ mn.feature = self.mecab_format_node(@tagger, nptr).force_encoding(Encoding.default_external)
411
+ end
412
+ y.yield mn
396
413
  end
397
- if @options[:output_format_type] || @options[:node_format]
398
- mn.feature = self.mecab_format_node(@tagger, nptr).force_encoding(Encoding.default_external)
399
- end
400
- y.yield mn
401
- nptr = mn.next
414
+ nptr = mn[:next]
402
415
  end
403
416
  end
404
417
  end
418
+ nil
405
419
  rescue
406
- raise(MeCabError.new(self.mecab_lattice_strerror(lattice)))
407
- ensure
408
- if lattice.address != 0x0
409
- self.mecab_lattice_destroy(lattice)
410
- end
420
+ raise(MeCabError.new(self.mecab_lattice_strerror(@lattice)))
411
421
  end
412
422
  end
413
423
  }
414
424
 
415
- @dicts << Natto::DictionaryInfo.new(Natto::Binding.mecab_dictionary_info(@tagger))
425
+ @dicts = []
426
+ @dicts << Natto::DictionaryInfo.new(self.mecab_model_dictionary_info(@model))
416
427
  while @dicts.last.next.address != 0x0
417
428
  @dicts << Natto::DictionaryInfo.new(@dicts.last.next)
418
429
  end
419
430
 
420
431
  @version = self.mecab_version
421
432
 
422
- ObjectSpace.define_finalizer(self, self.class.create_free_proc(@tagger))
433
+ ObjectSpace.define_finalizer(self, self.class.create_free_proc(@model,
434
+ @tagger,
435
+ @lattice))
423
436
  end
424
437
 
425
438
  # Parses the given `text`, returning the MeCab output as a single string.
@@ -430,36 +443,45 @@ module Natto
430
443
  # `boundary_constraints` key in the `options` hash. Boundary constraints
431
444
  # parsing provides hints to MeCab on where the morpheme boundaries in the
432
445
  # given `text` are located. `boundary_constraints` value may be either a
433
- # `Regexp` or `String`; please see
434
- # [String#scan](http://ruby-doc.org/core-2.2.0/String.html#method-i-scan String#scan.
446
+ # `Regexp` or `String`; please see [String#scan](http://ruby-doc.org/core-2.2.1/String.html#method-i-scan).
435
447
  # The boundary constraint parsed output will be returned as a single
436
448
  # string, unless a block is passed to this method for node parsing.
437
449
  #
450
+ # Feature constraint parsing is available by passing in the
451
+ # `feature_constraints` key in the `options` hash. Feature constraints
452
+ # parsing provides instructions to MeCab to use the feature indicated
453
+ # for any morpheme that is an exact match for the given key.
454
+ # `feature_constraints` is a hash mapping a specific morpheme (String)
455
+ # to a corresponding feature value (String).
438
456
  # @param text [String] the Japanese text to parse
439
- # @param options [Hash] only the `boundary_constraints` key is available
440
- # @return [String] parsing result from `mecab`
441
- # @raise [MeCabError] if the `mecab` tagger cannot parse the given `text`
457
+ # @param constraints [Hash] `boundary_constraints` or `feature_constraints`
458
+ # @return [String] parsing result from MeCab
459
+ # @raise [MeCabError] if the MeCab Tagger cannot parse the given `text`
442
460
  # @raise [ArgumentError] if the given string `text` argument is `nil`
443
461
  # @see MeCabNode
444
- def parse(text, options={})
445
- raise ArgumentError.new 'Text to parse cannot be nil' if text.nil?
446
- if options[:boundary_constraints]
447
- if block_given?
448
- @bcparse_tonodes.call(text, options[:boundary_constraints]).each {|n| yield n }
449
- else
450
- @bcparse_tostr.call(text, options[:boundary_constraints])
462
+ def parse(text, constraints={})
463
+ if text.nil?
464
+ raise ArgumentError.new 'Text to parse cannot be nil'
465
+ elsif constraints[:boundary_constraints]
466
+ if !(constraints[:boundary_constraints].is_a?(Regexp) ||
467
+ constraints[:boundary_constraints].is_a?(String))
468
+ raise ArgumentError.new 'boundary constraints must be a Regexp or String'
451
469
  end
470
+ elsif constraints[:feature_constraints] && !constraints[:feature_constraints].is_a?(Hash)
471
+ raise ArgumentError.new 'feature constraints must be a Hash'
472
+ elsif @options[:partial] && !text.end_with?("\n")
473
+ raise ArgumentError.new 'partial parsing requires new-line char at end of text'
474
+ end
475
+
476
+ if block_given?
477
+ @parse_tonodes.call(text, constraints).each {|n| yield n }
452
478
  else
453
- if block_given?
454
- @parse_tonodes.call(text).each {|n| yield n }
455
- else
456
- @parse_tostr.call(text)
457
- end
479
+ @parse_tostr.call(text, constraints)
458
480
  end
459
481
  end
460
482
 
461
483
  # Parses the given string `text`, returning an
462
- # {http://www.ruby-doc.org/core-2.1.5/Enumerator.html Enumerator} that may be
484
+ # [Enumerator](http://www.ruby-doc.org/core-2.2.1/Enumerator.html) that may be
463
485
  # used to iterate over the resulting {MeCabNode} objects. This is more
464
486
  # efficient than parsing to a simple string, since each node's
465
487
  # information will not be materialized all at once as it is with
@@ -469,45 +491,62 @@ module Natto
469
491
  # the morpheme. Node-formatting may also be used to customize
470
492
  # the resulting node's `feature` attribute.
471
493
  #
472
- # Boundary constraint parsing is available via passing in the
494
+ # Boundary constraint parsing is available by passing in the
473
495
  # `boundary_constraints` key in the `options` hash. Boundary constraints
474
496
  # parsing provides hints to MeCab on where the morpheme boundaries in the
475
497
  # given `text` are located. `boundary_constraints` value may be either a
476
498
  # `Regexp` or `String`; please see
477
- # [String#scan](http://ruby-doc.org/core-2.2.0/String.html#method-i-scan String#scan).
499
+ # [String#scan](http://ruby-doc.org/core-2.2.1/String.html#method-i-scan)
478
500
  #
501
+ # Feature constraint parsing is available by passing in the
502
+ # `feature_constraints` key in the `options` hash. Feature constraints
503
+ # parsing provides instructions to MeCab to use the feature indicated
504
+ # for any morpheme that is an exact match for the given key.
505
+ # `feature_constraints` is a hash mapping a specific morpheme (String)
506
+ # to a corresponding feature value (String).
479
507
  # @param text [String] the Japanese text to parse
480
- # @param options [Hash] only the `boundary_constraints` key is available
508
+ # @param constraints [Hash] `boundary_constraints` or `feature_constraints`
481
509
  # @return [Enumerator] of MeCabNode instances
482
- # @raise [MeCabError] if the `mecab` tagger cannot parse the given `text`
510
+ # @raise [MeCabError] if the MeCab Tagger cannot parse the given `text`
483
511
  # @raise [ArgumentError] if the given string `text` argument is `nil`
484
512
  # @see MeCabNode
485
513
  # @see http://ruby-doc.org/core-2.2.1/Enumerator.html
486
- def enum_parse(text, options={})
487
- raise ArgumentError.new 'Text to parse cannot be nil' if text.nil?
488
- if options[:boundary_constraints]
489
- @bcparse_tonodes.call(text, options[:boundary_constraints])
490
- else
491
- @parse_tonodes.call(text)
514
+ def enum_parse(text, constraints={})
515
+ if text.nil?
516
+ raise ArgumentError.new 'Text to parse cannot be nil'
517
+ elsif constraints[:boundary_constraints]
518
+ if !(constraints[:boundary_constraints].is_a?(Regexp) ||
519
+ constraints[:boundary_constraints].is_a?(String))
520
+ raise ArgumentError.new 'boundary constraints must be a Regexp or String'
521
+ end
522
+ elsif constraints[:feature_constraints] && !constraints[:feature_constraints].is_a?(Hash)
523
+ raise ArgumentError.new 'feature constraints must be a Hash'
524
+ elsif @options[:partial] && !text.end_with?("\n")
525
+ raise ArgumentError.new 'partial parsing requires new-line char at end of text'
492
526
  end
527
+
528
+ @parse_tonodes.call(text, constraints)
493
529
  end
494
530
 
495
- # Returns human-readable details for the wrapped `mecab` tagger.
531
+ # Returns human-readable details for the wrapped MeCab library.
496
532
  # Overrides `Object#to_s`.
497
533
  #
498
534
  # - encoded object id
499
- # - underlying FFI pointer to the `mecab` tagger
500
- # - real file path to `mecab` library
535
+ # - underlying FFI pointer to the MeCab Model
536
+ # - underlying FFI pointer to the MeCab Tagger
537
+ # - underlying FFI pointer to the MeCab Lattice
538
+ # - real file path to MeCab library
501
539
  # - options hash
502
540
  # - list of dictionaries
503
541
  # - MeCab version
504
- #
505
542
  # @return [String] encoded object id, underlying FFI pointer,
506
- # file path to `mecab` library, options hash,
543
+ # file path to MeCab library, options hash,
507
544
  # list of dictionaries and MeCab version
508
545
  def to_s
509
546
  [ super.chop,
547
+ "@model=#{@model},",
510
548
  "@tagger=#{@tagger},",
549
+ "@lattice=#{@lattice},",
511
550
  "@libpath=\"#{@libpath}\",",
512
551
  "@options=#{@options.inspect},",
513
552
  "@dicts=#{@dicts.to_s},",
@@ -515,7 +554,6 @@ module Natto
515
554
  end
516
555
 
517
556
  # Overrides `Object#inspect`.
518
- #
519
557
  # @return [String] encoded object id, FFI pointer, options hash,
520
558
  # list of dictionaries, and MeCab version
521
559
  # @see #to_s
@@ -524,26 +562,27 @@ module Natto
524
562
  end
525
563
 
526
564
  # Returns a `Proc` that will properly free resources
527
- # when this `Tagger` instance is garbage collected.
528
- # The `Proc` returned is registered to be invoked
529
- # after the `Tagger` instance owning `tptr`
530
- # has been destroyed.
531
- #
532
- # @param tptr [FFI::Pointer] pointer to `Tagger`
533
- # @return [Proc] to release `mecab` resources properly
534
- def self.create_free_proc(tptr)
565
+ # when this instance is garbage collected.
566
+ # @param mptr [FFI::Pointer] pointer to Model
567
+ # @param tptr [FFI::Pointer] pointer to Tagger
568
+ # @param lptr [FFI::Pointer] pointer to Lattice
569
+ # @return [Proc] to release MeCab resources properly
570
+ def self.create_free_proc(mptr, tptr, lptr)
535
571
  Proc.new do
572
+ self.mecab_lattice_destroy(lptr)
536
573
  self.mecab_destroy(tptr)
574
+ self.mecab_model_destory(mptr)
537
575
  end
538
576
  end
539
577
 
540
578
  private
541
579
 
542
580
  # @private
543
- def tokenize(text, pattern)
581
+ # MeCab eats all leading and trailing whitespace chars
582
+ def tokenize_by_pattern(text, pattern)
544
583
  matches = text.scan(pattern)
545
584
 
546
- acc =[]
585
+ acc = []
547
586
  tmp = text
548
587
  matches.each_with_index do |m,i|
549
588
  bef, mat, aft = tmp.partition(m)
@@ -553,17 +592,34 @@ module Natto
553
592
  unless mat.empty?
554
593
  acc << [mat.strip, true]
555
594
  end
556
- if i==matches.size-1 and !aft.empty?
595
+ if i==matches.size-1 && !aft.empty?
557
596
  acc << [aft.strip, false]
558
597
  end
559
598
  tmp = aft
560
599
  end
561
600
  acc
562
601
  end
602
+
603
+ def tokenize_by_features(text, features)
604
+ acc = []
605
+ acc << [text.strip, false]
606
+
607
+ features.each do |feature|
608
+ acc.each_with_index do |e,i|
609
+ if !e.last
610
+ tmp = tokenize_by_pattern(e.first, feature)
611
+ if !tmp.empty?
612
+ acc.delete_at(i)
613
+ acc.insert(i, *tmp)
614
+ end
615
+ end
616
+ end
617
+ end
618
+ acc
619
+ end
563
620
  end
564
621
 
565
- # `MeCabError` is a general error class
566
- # for the `Natto` module.
622
+ # `MeCabError` is a general error class for the `Natto` module.
567
623
  class MeCabError < RuntimeError; end
568
624
  end
569
625