natto 0.9.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,9 +4,9 @@ require 'natto/option_parse'
4
4
  require 'natto/struct'
5
5
 
6
6
  module Natto
7
- # `MeCab` is a wrapper class for the MeCab `Tagger`.
8
- # Options to the MeCab `Tagger` are passed in as a string
9
- # (MeCab command-line style) or as a Ruby-style hash at
7
+ # `MeCab` is a class providing an interface to the MeCab library.
8
+ # Options to the MeCab Model, Tagger and Lattice are passed in
9
+ # as a string (MeCab command-line style) or as a Ruby-style hash at
10
10
  # initialization.
11
11
  #
12
12
  # ## Usage
@@ -16,14 +16,16 @@ module Natto
16
16
  # text = '凡人にしか見えねえ風景ってのがあるんだよ。'
17
17
  #
18
18
  # nm = Natto::MeCab.new
19
- # => #<Natto::MeCab:0x28d3bdc8 \
20
- # @tagger=#<FFI::Pointer address=0x28afb980>, \
21
- # @libpath="/usr/local/lib/libmecab.so" \
22
- # @options={}, \
23
- # @dicts=[#<Natto::DictionaryInfo:0x289a1f14 \
19
+ # => #<Natto::MeCab:0x0000080318d278 \
20
+ # @model=#<FFI::Pointer address=0x000008039174c0>, \
21
+ # @tagger=#<FFI::Pointer address=0x0000080329ba60>, \
22
+ # @lattice=#<FFI::Pointer address=0x000008045bd140>, \
23
+ # @libpath="/usr/local/lib/libmecab.so" \
24
+ # @options={}, \
25
+ # @dicts=[#<Natto::DictionaryInfo:0x0000080318ce90 \
24
26
  # @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
25
- # charset=utf8, \
26
- # type=0>], \
27
+ # charset=utf8, \
28
+ # type=0>], \
27
29
  # @version=0.996>
28
30
  #
29
31
  # # print entire MeCab result to stdout
@@ -104,7 +106,7 @@ module Natto
104
106
  #
105
107
  # # Boundary constraint parsing with output formatting.
106
108
  # # %m ... morpheme surface
107
- # # %F, ... comma-delimited ChaSen feature values
109
+ # # %f ... tab-delimited ChaSen feature values
108
110
  # # part-of-speech (index 0)
109
111
  # # %2 ... MeCab node status value (1 unknown)
110
112
  # #
@@ -148,19 +150,22 @@ module Natto
148
150
  MECAB_TOKEN_BOUNDARY = 1
149
151
  MECAB_INSIDE_TOKEN = 2
150
152
 
151
- # @return [FFI:Pointer] pointer to MeCab tagger.
153
+ # @return [FFI::Pointer] pointer to MeCab Model.
154
+ attr_reader :model
155
+ # @return [FFI::Pointer] pointer to MeCab Tagger.
152
156
  attr_reader :tagger
157
+ # @return [FFI::Pointer] pointer to MeCab Lattice.
158
+ attr_reader :lattice
153
159
  # @return [String] absolute filepath to MeCab library.
154
160
  attr_reader :libpath
155
161
  # @return [Hash] MeCab options as key-value pairs.
156
162
  attr_reader :options
157
163
  # @return [Array] listing of all dictionaries referenced.
158
164
  attr_reader :dicts
159
- # @return [String] `MeCab` version.
165
+ # @return [String] MeCab version.
160
166
  attr_reader :version
161
167
 
162
- # Initializes the wrapped `Tagger` instance with the
163
- # given `options`.
168
+ # Initializes the wrapped Model, Tagger and Lattice instances with the given `options`.
164
169
  #
165
170
  # Options supported are:
166
171
  #
@@ -186,19 +191,21 @@ module Natto
186
191
  # - :cost_factor -- cost factor (integer, default 700)
187
192
  #
188
193
  # <p>MeCab command-line arguments, short (-F) or long (--node-format), may be used in
189
- # addition to Ruby-style `Hash`es</p>
194
+ # addition to Ruby-style hashes</p>
190
195
  # <i>Use single-quotes to preserve format options that contain escape chars.</i><br/>
191
196
  # e.g.<br/>
192
197
  #
193
198
  # nm = Natto::MeCab.new(node_format: '%m¥t%f[7]¥n')
194
- # => #<Natto::MeCab:0x28d2ae10
195
- # @tagger=#<FFI::Pointer address=0x28a97980>, \
196
- # @libpath="/usr/local/lib/libmecab.so", \
197
- # @options={:node_format=>"%m¥t%f[7]¥n"}, \
198
- # @dicts=[#<Natto::DictionaryInfo:0x28d2a85c \
199
+ # => #<Natto::MeCab:0x00000803503ee8 \
200
+ # @model=#<FFI::Pointer address=0x00000802b6d9c0>, \
201
+ # @tagger=#<FFI::Pointer address=0x00000802ad3ec0>, \
202
+ # @lattice=#<FFI::Pointer address=0x000008035f3980>, \
203
+ # @libpath="/usr/local/lib/libmecab.so", \
204
+ # @options={:node_format=>"%m¥t%f[7]¥n"}, \
205
+ # @dicts=[#<Natto::DictionaryInfo:0x000008035038f8 \
199
206
  # @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic" \
200
- # charset=utf8, \
201
- # type=0>] \
207
+ # charset=utf8, \
208
+ # type=0>] \
202
209
  # @version=0.996>
203
210
  #
204
211
  # puts nm.parse('才能とは求める人間に与えられるものではない。')
@@ -216,210 +223,216 @@ module Natto
216
223
  # ない ナイ
217
224
  # 。 。
218
225
  # EOS
219
- #
220
- # @param options [Hash, String] the MeCab options for tagger
221
- # @raise [MeCabError] if `mecab` cannot be initialized with the given `options`
226
+ # @param options [Hash, String] the MeCab options
227
+ # @raise [MeCabError] if MeCab cannot be initialized with the given `options`
222
228
  def initialize(options={})
223
229
  @options = self.class.parse_mecab_options(options)
224
- @dicts = []
225
- # TODO invoke function for enhancing MeCabNode after this point
226
-
227
230
  opt_str = self.class.build_options_str(@options)
228
- @tagger = self.class.mecab_new2(opt_str)
231
+
232
+ @model = self.class.mecab_model_new2(opt_str)
233
+ if @model.address == 0x0
234
+ raise MeCabError.new("Could not initialize Model with options: '#{opt_str}'")
235
+ end
236
+
237
+ @tagger = self.class.mecab_model_new_tagger(@model)
238
+ if @tagger.address == 0x0
239
+ raise MeCabError.new("Could not initialize Tagger with options: '#{opt_str}'")
240
+ end
241
+
242
+ @lattice = self.class.mecab_model_new_lattice(@model)
243
+ if @lattice.address == 0x0
244
+ raise MeCabError.new("Could not initialize Lattice with options: '#{opt_str}'")
245
+ end
246
+
229
247
  @libpath = self.class.find_library
230
- raise MeCabError.new("Could not initialize MeCab with options: '#{opt_str}'") if @tagger.address == 0x0
231
-
232
- self.mecab_set_theta(@tagger, @options[:theta]) if @options[:theta]
233
- self.mecab_set_lattice_level(@tagger, @options[:lattice_level]) if @options[:lattice_level]
234
- self.mecab_set_all_morphs(@tagger, 1) if @options[:all_morphs]
235
- self.mecab_set_partial(@tagger, 1) if @options[:partial]
236
-
237
- # Define lambda for each major parsing type: _tostr, _tonode,
238
- # boundary constraint _tostr, boundary constraint _node;
239
- # and each parsing type will support both normal and N-best
240
- # options
241
- @parse_tostr = ->(text) {
242
- if @options[:nbest] && @options[:nbest] > 1
243
- #self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
244
- retval = self.mecab_nbest_sparse_tostr(@tagger, @options[:nbest], text) ||
245
- raise(MeCabError.new(self.mecab_strerror(@tagger)))
246
- else
247
- retval = self.mecab_sparse_tostr(@tagger, text) ||
248
- raise(MeCabError.new(self.mecab_strerror(@tagger)))
249
- end
250
248
 
251
- retval.force_encoding(Encoding.default_external)
252
- }
249
+ if @options[:nbest] && @options[:nbest] > 1
250
+ self.mecab_lattice_set_request_type(@lattice, MECAB_LATTICE_NBEST)
251
+ else
252
+ self.mecab_lattice_set_request_type(@lattice, MECAB_LATTICE_ONE_BEST)
253
+ end
254
+ if @options[:partial]
255
+ self.mecab_lattice_add_request_type(@lattice, MECAB_LATTICE_PARTIAL)
256
+ end
257
+ if @options[:marginal]
258
+ self.mecab_lattice_add_request_type(@lattice,
259
+ MECAB_LATTICE_MARGINAL_PROB)
260
+ end
261
+ if @options[:all_morphs]
262
+ # required when node parsing
263
+ #self.mecab_lattice_add_request_type(@lattice, MECAB_LATTICE_NBEST)
264
+ self.mecab_lattice_add_request_type(@lattice,
265
+ MECAB_LATTICE_ALL_MORPHS)
266
+ end
267
+ if @options[:allocate_sentence]
268
+ self.mecab_lattice_add_request_type(@lattice,
269
+ MECAB_LATTICE_ALLOCATE_SENTENCE)
270
+ end
253
271
 
254
- @parse_tonodes = ->(text) {
255
- Enumerator.new do |y|
256
- if @options[:nbest] && @options[:nbest] > 1
257
- nlen = @options[:nbest]
258
- #self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
259
- self.mecab_nbest_init(@tagger, text)
260
- nptr = self.mecab_nbest_next_tonode(@tagger)
261
- else
262
- nlen = 1
263
- nptr = self.mecab_sparse_tonode(@tagger, text)
264
- end
265
- raise(MeCabError.new(self.mecab_strerror(@tagger))) if nptr.nil? || nptr.address==0x0
266
-
267
- nlen.times do
268
- s = text.bytes.to_a
269
- while nptr && nptr.address != 0x0
270
- mn = Natto::MeCabNode.new(nptr)
271
- # ignore BOS nodes, since mecab does so
272
- if !mn.is_bos?
273
- s = s.drop_while {|e| (e==0xa || e==0x20)}
274
- if !s.empty?
275
- sarr = []
276
- mn.length.times { sarr << s.shift }
277
- surf = sarr.pack('C*')
278
- mn.surface = surf.force_encoding(Encoding.default_external)
279
- end
280
- if @options[:output_format_type] || @options[:node_format]
281
- mn.feature = self.mecab_format_node(@tagger, nptr).force_encoding(Encoding.default_external)
282
- end
283
- y.yield mn
284
- end
285
- nptr = mn.next
286
- end
287
- if nlen > 1
288
- nptr = self.mecab_nbest_next_tonode(@tagger)
289
- end
290
- end
291
- end
292
- }
293
-
294
- @bcparse_tostr = ->(text, boundary_constraints=/./) {
295
- begin
296
- lattice = self.mecab_lattice_new()
297
- raise MeCabError.new("Could not create Lattice") if lattice.address == 0x0
272
+ if @options[:theta]
273
+ self.mecab_lattice_set_theta(@lattice, @options[:theta])
274
+ end
298
275
 
276
+ @parse_tostr = ->(text, constraints) {
277
+ begin
299
278
  if @options[:nbest] && @options[:nbest] > 1
300
279
  n = @options[:nbest]
301
- self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_NBEST)
302
280
  else
303
281
  n = 1
304
- self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_ONE_BEST)
305
- end
306
- if @options[:theta]
307
- self.mecab_lattice_set_theta(lattice, @options[:theta])
308
282
  end
309
283
 
310
- tokens = tokenize(text, boundary_constraints)
311
- text = tokens.map {|t| t.first}.join
312
- self.mecab_lattice_set_sentence(lattice, text)
313
-
314
- bpos = 0
315
- tokens.each do |token|
316
- c = token.first.bytes.count
284
+ if constraints[:boundary_constraints]
285
+ tokens = tokenize_by_pattern(text,
286
+ constraints[:boundary_constraints])
287
+ text = tokens.map {|t| t.first}.join
288
+ self.mecab_lattice_set_sentence(@lattice, text)
317
289
 
318
- self.mecab_lattice_set_boundary_constraint(lattice, bpos, MECAB_TOKEN_BOUNDARY)
319
- bpos += 1
290
+ bpos = 0
291
+ tokens.each do |token|
292
+ c = token.first.bytes.count
320
293
 
321
- mark = token.last ? MECAB_INSIDE_TOKEN : MECAB_ANY_BOUNDARY
322
- (c-1).times do
323
- self.mecab_lattice_set_boundary_constraint(lattice, bpos, mark)
294
+ self.mecab_lattice_set_boundary_constraint(@lattice,
295
+ bpos,
296
+ MECAB_TOKEN_BOUNDARY)
324
297
  bpos += 1
298
+
299
+ mark = token.last ? MECAB_INSIDE_TOKEN : MECAB_ANY_BOUNDARY
300
+ (c-1).times do
301
+ self.mecab_lattice_set_boundary_constraint(@lattice,
302
+ bpos,
303
+ mark)
304
+ bpos += 1
305
+ end
325
306
  end
307
+ elsif constraints[:feature_constraints]
308
+ features = constraints[:feature_constraints]
309
+ tokens = tokenize_by_features(text,
310
+ features.keys)
311
+ text = tokens.map {|t| t.first}.join
312
+ self.mecab_lattice_set_sentence(@lattice, text)
313
+
314
+ bpos = 0
315
+ tokens.each do |token|
316
+ chunk = token.first
317
+ c = chunk.bytes.count
318
+ if token.last
319
+ self.mecab_lattice_set_feature_constraint(@lattice,
320
+ bpos,
321
+ bpos+c,
322
+ features[chunk])
323
+ end
324
+ bpos += c
325
+ end
326
+ else
327
+ self.mecab_lattice_set_sentence(@lattice, text)
326
328
  end
327
329
 
328
- self.mecab_parse_lattice(@tagger, lattice)
330
+ self.mecab_parse_lattice(@tagger, @lattice)
329
331
 
330
332
  if n > 1
331
- retval = self.mecab_lattice_nbest_tostr(lattice, n)
333
+ retval = self.mecab_lattice_nbest_tostr(@lattice, n)
332
334
  else
333
- retval = self.mecab_lattice_tostr(lattice)
335
+ retval = self.mecab_lattice_tostr(@lattice)
334
336
  end
335
337
  retval.force_encoding(Encoding.default_external)
336
338
  rescue
337
- raise(MeCabError.new(self.mecab_lattice_strerror(lattice)))
338
- ensure
339
- if lattice.address != 0x0
340
- self.mecab_lattice_destroy(lattice)
341
- end
339
+ raise(MeCabError.new(self.mecab_lattice_strerror(@lattice)))
342
340
  end
343
341
  }
344
342
 
345
- @bcparse_tonodes = ->(text, boundary_constraints=/./) {
343
+ @parse_tonodes = ->(text, constraints) {
344
+ self.mecab_lattice_add_request_type(@lattice, MECAB_LATTICE_NBEST)
346
345
  Enumerator.new do |y|
347
346
  begin
348
- lattice = self.mecab_lattice_new()
349
- raise MeCabError.new("Could not create Lattice") if lattice.address == 0x0
350
-
351
347
  if @options[:nbest] && @options[:nbest] > 1
352
348
  n = @options[:nbest]
353
- self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_NBEST)
354
349
  else
355
350
  n = 1
356
- self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_ONE_BEST)
357
- end
358
- if @options[:theta]
359
- self.mecab_lattice_set_theta(lattice, @options[:theta])
360
351
  end
361
352
 
362
- tokens = tokenize(text, boundary_constraints)
363
- text = tokens.map {|t| t.first}.join
364
- self.mecab_lattice_set_sentence(lattice, text)
365
-
366
- bpos = 0
367
- tokens.each do |token|
368
- c = token.first.bytes.count
353
+ if constraints[:boundary_constraints]
354
+ tokens = tokenize_by_pattern(text,
355
+ constraints[:boundary_constraints])
356
+ text = tokens.map {|t| t.first}.join
357
+ self.mecab_lattice_set_sentence(@lattice, text)
369
358
 
370
- self.mecab_lattice_set_boundary_constraint(lattice, bpos, MECAB_TOKEN_BOUNDARY)
371
- bpos += 1
359
+ bpos = 0
360
+ tokens.each do |token|
361
+ c = token.first.bytes.count
372
362
 
373
- mark = token.last ? MECAB_INSIDE_TOKEN : MECAB_ANY_BOUNDARY
374
- (c-1).times do
375
- self.mecab_lattice_set_boundary_constraint(lattice, bpos, mark)
363
+ self.mecab_lattice_set_boundary_constraint(@lattice,
364
+ bpos,
365
+ MECAB_TOKEN_BOUNDARY)
376
366
  bpos += 1
367
+
368
+ mark = token.last ? MECAB_INSIDE_TOKEN : MECAB_ANY_BOUNDARY
369
+ (c-1).times do
370
+ self.mecab_lattice_set_boundary_constraint(@lattice, bpos, mark)
371
+ bpos += 1
372
+ end
373
+ end
374
+ elsif constraints[:feature_constraints]
375
+ features = constraints[:feature_constraints]
376
+ tokens = tokenize_by_features(text,
377
+ features.keys)
378
+ text = tokens.map {|t| t.first}.join
379
+ self.mecab_lattice_set_sentence(@lattice, text)
380
+
381
+ bpos = 0
382
+ tokens.each do |token|
383
+ chunk = token.first
384
+ c = chunk.bytes.count
385
+ if token.last
386
+ self.mecab_lattice_set_feature_constraint(@lattice,
387
+ bpos,
388
+ bpos+c,
389
+ features[chunk])
390
+ end
391
+ bpos += c
377
392
  end
393
+ else
394
+ self.mecab_lattice_set_sentence(@lattice, text)
378
395
  end
379
396
 
380
- self.mecab_parse_lattice(@tagger, lattice)
397
+ self.mecab_parse_lattice(@tagger, @lattice)
381
398
 
382
399
  n.times do
383
- check = self.mecab_lattice_next(lattice)
400
+ check = self.mecab_lattice_next(@lattice)
384
401
  if check
385
- nptr = self.mecab_lattice_get_bos_node(lattice)
402
+ nptr = self.mecab_lattice_get_bos_node(@lattice)
386
403
 
387
- s = text.bytes.to_a
388
404
  while nptr && nptr.address!=0x0
389
405
  mn = Natto::MeCabNode.new(nptr)
390
- s = s.drop_while {|e| (e==0xa || e==0x20)}
391
- if !s.empty?
392
- sarr = []
393
- mn.length.times { sarr << s.shift }
394
- surf = sarr.pack('C*')
406
+ if !mn.is_bos?
407
+ surf = mn[:surface].bytes.to_a.slice(0,mn.length).pack('C*')
395
408
  mn.surface = surf.force_encoding(Encoding.default_external)
409
+ if @options[:output_format_type] || @options[:node_format]
410
+ mn.feature = self.mecab_format_node(@tagger, nptr).force_encoding(Encoding.default_external)
411
+ end
412
+ y.yield mn
396
413
  end
397
- if @options[:output_format_type] || @options[:node_format]
398
- mn.feature = self.mecab_format_node(@tagger, nptr).force_encoding(Encoding.default_external)
399
- end
400
- y.yield mn
401
- nptr = mn.next
414
+ nptr = mn[:next]
402
415
  end
403
416
  end
404
417
  end
418
+ nil
405
419
  rescue
406
- raise(MeCabError.new(self.mecab_lattice_strerror(lattice)))
407
- ensure
408
- if lattice.address != 0x0
409
- self.mecab_lattice_destroy(lattice)
410
- end
420
+ raise(MeCabError.new(self.mecab_lattice_strerror(@lattice)))
411
421
  end
412
422
  end
413
423
  }
414
424
 
415
- @dicts << Natto::DictionaryInfo.new(Natto::Binding.mecab_dictionary_info(@tagger))
425
+ @dicts = []
426
+ @dicts << Natto::DictionaryInfo.new(self.mecab_model_dictionary_info(@model))
416
427
  while @dicts.last.next.address != 0x0
417
428
  @dicts << Natto::DictionaryInfo.new(@dicts.last.next)
418
429
  end
419
430
 
420
431
  @version = self.mecab_version
421
432
 
422
- ObjectSpace.define_finalizer(self, self.class.create_free_proc(@tagger))
433
+ ObjectSpace.define_finalizer(self, self.class.create_free_proc(@model,
434
+ @tagger,
435
+ @lattice))
423
436
  end
424
437
 
425
438
  # Parses the given `text`, returning the MeCab output as a single string.
@@ -430,36 +443,45 @@ module Natto
430
443
  # `boundary_constraints` key in the `constraints` hash. Boundary constraints
431
444
  # parsing provides hints to MeCab on where the morpheme boundaries in the
432
445
  # given `text` are located. `boundary_constraints` value may be either a
433
- # `Regexp` or `String`; please see
434
- # [String#scan](http://ruby-doc.org/core-2.2.0/String.html#method-i-scan String#scan.
446
+ # `Regexp` or `String`; please see [String#scan](http://ruby-doc.org/core-2.2.1/String.html#method-i-scan).
435
447
  # The boundary constraint parsed output will be returned as a single
436
448
  # string, unless a block is passed to this method for node parsing.
437
449
  #
450
+ # Feature constraint parsing is available by passing in the
451
+ # `feature_constraints` key in the `constraints` hash. Feature constraints
452
+ # parsing provides instructions to MeCab to use the feature indicated
453
+ # for any morpheme that is an exact match for the given key.
454
+ # `feature_constraints` is a hash mapping a specific morpheme (String)
455
+ # to a corresponding feature value (String).
438
456
  # @param text [String] the Japanese text to parse
439
- # @param options [Hash] only the `boundary_constraints` key is available
440
- # @return [String] parsing result from `mecab`
441
- # @raise [MeCabError] if the `mecab` tagger cannot parse the given `text`
457
+ # @param constraints [Hash] `boundary_constraints` or `feature_constraints`
458
+ # @return [String] parsing result from MeCab
459
+ # @raise [MeCabError] if the MeCab Tagger cannot parse the given `text`
442
460
  # @raise [ArgumentError] if the given string `text` argument is `nil`
443
461
  # @see MeCabNode
444
- def parse(text, options={})
445
- raise ArgumentError.new 'Text to parse cannot be nil' if text.nil?
446
- if options[:boundary_constraints]
447
- if block_given?
448
- @bcparse_tonodes.call(text, options[:boundary_constraints]).each {|n| yield n }
449
- else
450
- @bcparse_tostr.call(text, options[:boundary_constraints])
462
+ def parse(text, constraints={})
463
+ if text.nil?
464
+ raise ArgumentError.new 'Text to parse cannot be nil'
465
+ elsif constraints[:boundary_constraints]
466
+ if !(constraints[:boundary_constraints].is_a?(Regexp) ||
467
+ constraints[:boundary_constraints].is_a?(String))
468
+ raise ArgumentError.new 'boundary constraints must be a Regexp or String'
451
469
  end
470
+ elsif constraints[:feature_constraints] && !constraints[:feature_constraints].is_a?(Hash)
471
+ raise ArgumentError.new 'feature constraints must be a Hash'
472
+ elsif @options[:partial] && !text.end_with?("\n")
473
+ raise ArgumentError.new 'partial parsing requires new-line char at end of text'
474
+ end
475
+
476
+ if block_given?
477
+ @parse_tonodes.call(text, constraints).each {|n| yield n }
452
478
  else
453
- if block_given?
454
- @parse_tonodes.call(text).each {|n| yield n }
455
- else
456
- @parse_tostr.call(text)
457
- end
479
+ @parse_tostr.call(text, constraints)
458
480
  end
459
481
  end
460
482
 
461
483
  # Parses the given string `text`, returning an
462
- # {http://www.ruby-doc.org/core-2.1.5/Enumerator.html Enumerator} that may be
484
+ # [Enumerator](http://www.ruby-doc.org/core-2.2.1/Enumerator.html) that may be
463
485
  # used to iterate over the resulting {MeCabNode} objects. This is more
464
486
  # efficient than parsing to a simple string, since each node's
465
487
  # information will not be materialized all at once as it is with
@@ -469,45 +491,62 @@ module Natto
469
491
  # the morpheme. Node-formatting may also be used to customize
470
492
  # the resulting node's `feature` attribute.
471
493
  #
472
- # Boundary constraint parsing is available via passing in the
494
+ # Boundary constraint parsing is available by passing in the
473
495
  # `boundary_constraints` key in the `options` hash. Boundary constraints
474
496
  # parsing provides hints to MeCab on where the morpheme boundaries in the
475
497
  # given `text` are located. `boundary_constraints` value may be either a
476
498
  # `Regexp` or `String`; please see
477
- # [String#scan](http://ruby-doc.org/core-2.2.0/String.html#method-i-scan String#scan).
499
+ # [String#scan](http://ruby-doc.org/core-2.2.1/String.html#method-i-scan).
478
500
  #
501
+ # Feature constraint parsing is available by passing in the
502
+ # `feature_constraints` key in the `constraints` hash. Feature constraints
503
+ # parsing provides instructions to MeCab to use the feature indicated
504
+ # for any morpheme that is an exact match for the given key.
505
+ # `feature_constraints` is a hash mapping a specific morpheme (String)
506
+ # to a corresponding feature value (String).
479
507
  # @param text [String] the Japanese text to parse
480
- # @param options [Hash] only the `boundary_constraints` key is available
508
+ # @param constraints [Hash] `boundary_constraints` or `feature_constraints`
481
509
  # @return [Enumerator] of MeCabNode instances
482
- # @raise [MeCabError] if the `mecab` tagger cannot parse the given `text`
510
+ # @raise [MeCabError] if the MeCab Tagger cannot parse the given `text`
483
511
  # @raise [ArgumentError] if the given string `text` argument is `nil`
484
512
  # @see MeCabNode
485
513
  # @see http://ruby-doc.org/core-2.2.1/Enumerator.html
486
- def enum_parse(text, options={})
487
- raise ArgumentError.new 'Text to parse cannot be nil' if text.nil?
488
- if options[:boundary_constraints]
489
- @bcparse_tonodes.call(text, options[:boundary_constraints])
490
- else
491
- @parse_tonodes.call(text)
514
+ def enum_parse(text, constraints={})
515
+ if text.nil?
516
+ raise ArgumentError.new 'Text to parse cannot be nil'
517
+ elsif constraints[:boundary_constraints]
518
+ if !(constraints[:boundary_constraints].is_a?(Regexp) ||
519
+ constraints[:boundary_constraints].is_a?(String))
520
+ raise ArgumentError.new 'boundary constraints must be a Regexp or String'
521
+ end
522
+ elsif constraints[:feature_constraints] && !constraints[:feature_constraints].is_a?(Hash)
523
+ raise ArgumentError.new 'feature constraints must be a Hash'
524
+ elsif @options[:partial] && !text.end_with?("\n")
525
+ raise ArgumentError.new 'partial parsing requires new-line char at end of text'
492
526
  end
527
+
528
+ @parse_tonodes.call(text, constraints)
493
529
  end
494
530
 
495
- # Returns human-readable details for the wrapped `mecab` tagger.
531
+ # Returns human-readable details for the wrapped MeCab library.
496
532
  # Overrides `Object#to_s`.
497
533
  #
498
534
  # - encoded object id
499
- # - underlying FFI pointer to the `mecab` tagger
500
- # - real file path to `mecab` library
535
+ # - underlying FFI pointer to the MeCab Model
536
+ # - underlying FFI pointer to the MeCab Tagger
537
+ # - underlying FFI pointer to the MeCab Lattice
538
+ # - real file path to MeCab library
501
539
  # - options hash
502
540
  # - list of dictionaries
503
541
  # - MeCab version
504
- #
505
542
  # @return [String] encoded object id, underlying FFI pointer,
506
- # file path to `mecab` library, options hash,
543
+ # file path to MeCab library, options hash,
507
544
  # list of dictionaries and MeCab version
508
545
  def to_s
509
546
  [ super.chop,
547
+ "@model=#{@model},",
510
548
  "@tagger=#{@tagger},",
549
+ "@lattice=#{@lattice},",
511
550
  "@libpath=\"#{@libpath}\",",
512
551
  "@options=#{@options.inspect},",
513
552
  "@dicts=#{@dicts.to_s},",
@@ -515,7 +554,6 @@ module Natto
515
554
  end
516
555
 
517
556
  # Overrides `Object#inspect`.
518
- #
519
557
  # @return [String] encoded object id, FFI pointer, options hash,
520
558
  # list of dictionaries, and MeCab version
521
559
  # @see #to_s
@@ -524,26 +562,27 @@ module Natto
524
562
  end
525
563
 
526
564
  # Returns a `Proc` that will properly free resources
527
- # when this `Tagger` instance is garbage collected.
528
- # The `Proc` returned is registered to be invoked
529
- # after the `Tagger` instance owning `tptr`
530
- # has been destroyed.
531
- #
532
- # @param tptr [FFI::Pointer] pointer to `Tagger`
533
- # @return [Proc] to release `mecab` resources properly
534
- def self.create_free_proc(tptr)
565
+ # when this instance is garbage collected.
566
+ # @param mptr [FFI::Pointer] pointer to Model
567
+ # @param tptr [FFI::Pointer] pointer to Tagger
568
+ # @param lptr [FFI::Pointer] pointer to Lattice
569
+ # @return [Proc] to release MeCab resources properly
570
+ def self.create_free_proc(mptr, tptr, lptr)
535
571
  Proc.new do
572
+ self.mecab_lattice_destroy(lptr)
536
573
  self.mecab_destroy(tptr)
574
+ self.mecab_model_destory(mptr)
537
575
  end
538
576
  end
539
577
 
540
578
  private
541
579
 
542
580
  # @private
543
- def tokenize(text, pattern)
581
+ # MeCab eats all leading and trailing whitespace chars
582
+ def tokenize_by_pattern(text, pattern)
544
583
  matches = text.scan(pattern)
545
584
 
546
- acc =[]
585
+ acc = []
547
586
  tmp = text
548
587
  matches.each_with_index do |m,i|
549
588
  bef, mat, aft = tmp.partition(m)
@@ -553,17 +592,34 @@ module Natto
553
592
  unless mat.empty?
554
593
  acc << [mat.strip, true]
555
594
  end
556
- if i==matches.size-1 and !aft.empty?
595
+ if i==matches.size-1 && !aft.empty?
557
596
  acc << [aft.strip, false]
558
597
  end
559
598
  tmp = aft
560
599
  end
561
600
  acc
562
601
  end
602
+
603
+ def tokenize_by_features(text, features)
604
+ acc = []
605
+ acc << [text.strip, false]
606
+
607
+ features.each do |feature|
608
+ acc.each_with_index do |e,i|
609
+ if !e.last
610
+ tmp = tokenize_by_pattern(e.first, feature)
611
+ if !tmp.empty?
612
+ acc.delete_at(i)
613
+ acc.insert(i, *tmp)
614
+ end
615
+ end
616
+ end
617
+ end
618
+ acc
619
+ end
563
620
  end
564
621
 
565
- # `MeCabError` is a general error class
566
- # for the `Natto` module.
622
+ # `MeCabError` is a general error class for the `Natto` module.
567
623
  class MeCabError < RuntimeError; end
568
624
  end
569
625