natto 0.9.9 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG +13 -0
- data/README.md +49 -12
- data/lib/natto.rb +1 -0
- data/lib/natto/binding.rb +114 -131
- data/lib/natto/natto.rb +266 -210
- data/lib/natto/option_parse.rb +6 -7
- data/lib/natto/struct.rb +21 -29
- data/lib/natto/version.rb +5 -6
- metadata +21 -31
data/lib/natto/natto.rb
CHANGED
@@ -4,9 +4,9 @@ require 'natto/option_parse'
|
|
4
4
|
require 'natto/struct'
|
5
5
|
|
6
6
|
module Natto
|
7
|
-
# `MeCab` is a
|
8
|
-
# Options to the MeCab
|
9
|
-
# (MeCab command-line style) or as a Ruby-style hash at
|
7
|
+
# `MeCab` is a class providing an interface to the MeCab library.
|
8
|
+
# Options to the MeCab Model, Tagger and Lattice are passed in
|
9
|
+
# as a string (MeCab command-line style) or as a Ruby-style hash at
|
10
10
|
# initialization.
|
11
11
|
#
|
12
12
|
# ## Usage
|
@@ -16,14 +16,16 @@ module Natto
|
|
16
16
|
# text = '凡人にしか見えねえ風景ってのがあるんだよ。'
|
17
17
|
#
|
18
18
|
# nm = Natto::MeCab.new
|
19
|
-
# => #<Natto::MeCab:
|
20
|
-
# @
|
21
|
-
# @
|
22
|
-
# @
|
23
|
-
# @
|
19
|
+
# => #<Natto::MeCab:0x0000080318d278 \
|
20
|
+
# @model=#<FFI::Pointer address=0x000008039174c0>, \
|
21
|
+
# @tagger=#<FFI::Pointer address=0x0000080329ba60>, \
|
22
|
+
# @lattice=#<FFI::Pointer address=0x000008045bd140>, \
|
23
|
+
# @libpath="/usr/local/lib/libmecab.so", \
|
24
|
+
# @options={}, \
|
25
|
+
# @dicts=[#<Natto::DictionaryInfo:0x0000080318ce90 \
|
24
26
|
# @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
25
|
-
# charset=utf8,
|
26
|
-
# type=0>],
|
27
|
+
# charset=utf8, \
|
28
|
+
# type=0>], \
|
27
29
|
# @version=0.996>
|
28
30
|
#
|
29
31
|
# # print entire MeCab result to stdout
|
@@ -104,7 +106,7 @@ module Natto
|
|
104
106
|
#
|
105
107
|
# # Boundary constraint parsing with output formatting.
|
106
108
|
# # %m ... morpheme surface
|
107
|
-
# # %
|
109
|
+
# # %f ... tab-delimited ChaSen feature values
|
108
110
|
# # part-of-speech (index 0)
|
109
111
|
# # %2 ... MeCab node status value (1 unknown)
|
110
112
|
# #
|
@@ -148,19 +150,22 @@ module Natto
|
|
148
150
|
MECAB_TOKEN_BOUNDARY = 1
|
149
151
|
MECAB_INSIDE_TOKEN = 2
|
150
152
|
|
151
|
-
# @return [FFI:Pointer] pointer to MeCab
|
153
|
+
# @return [FFI::Pointer] pointer to MeCab Model.
|
154
|
+
attr_reader :model
|
155
|
+
# @return [FFI::Pointer] pointer to MeCab Tagger.
|
152
156
|
attr_reader :tagger
|
157
|
+
# @return [FFI::Pointer] pointer to MeCab Lattice.
|
158
|
+
attr_reader :lattice
|
153
159
|
# @return [String] absolute filepath to MeCab library.
|
154
160
|
attr_reader :libpath
|
155
161
|
# @return [Hash] MeCab options as key-value pairs.
|
156
162
|
attr_reader :options
|
157
163
|
# @return [Array] listing of all dictionaries referenced.
|
158
164
|
attr_reader :dicts
|
159
|
-
# @return [String]
|
165
|
+
# @return [String] MeCab version.
|
160
166
|
attr_reader :version
|
161
167
|
|
162
|
-
# Initializes the wrapped
|
163
|
-
# given `options`.
|
168
|
+
# Initializes the wrapped Tagger instance with the given `options`.
|
164
169
|
#
|
165
170
|
# Options supported are:
|
166
171
|
#
|
@@ -186,19 +191,21 @@ module Natto
|
|
186
191
|
# - :cost_factor -- cost factor (integer, default 700)
|
187
192
|
#
|
188
193
|
# <p>MeCab command-line arguments (-F) or long (--node-format) may be used in
|
189
|
-
# addition to Ruby-style
|
194
|
+
# addition to Ruby-style hashes.</p>
|
190
195
|
# <i>Use single-quotes to preserve format options that contain escape chars.</i><br/>
|
191
196
|
# e.g.<br/>
|
192
197
|
#
|
193
198
|
# nm = Natto::MeCab.new(node_format: '%m¥t%f[7]¥n')
|
194
|
-
# => #<Natto::MeCab:
|
195
|
-
# @
|
196
|
-
# @
|
197
|
-
# @
|
198
|
-
# @
|
199
|
+
# => #<Natto::MeCab:0x00000803503ee8 \
|
200
|
+
# @model=#<FFI::Pointer address=0x00000802b6d9c0>, \
|
201
|
+
# @tagger=#<FFI::Pointer address=0x00000802ad3ec0>, \
|
202
|
+
# @lattice=#<FFI::Pointer address=0x000008035f3980>, \
|
203
|
+
# @libpath="/usr/local/lib/libmecab.so", \
|
204
|
+
# @options={:node_format=>"%m¥t%f[7]¥n"}, \
|
205
|
+
# @dicts=[#<Natto::DictionaryInfo:0x000008035038f8 \
|
199
206
|
# @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
200
|
-
# charset=utf8,
|
201
|
-
# type=0>]
|
207
|
+
# charset=utf8, \
|
208
|
+
# type=0>], \
|
202
209
|
# @version=0.996>
|
203
210
|
#
|
204
211
|
# puts nm.parse('才能とは求める人間に与えられるものではない。')
|
@@ -216,210 +223,216 @@ module Natto
|
|
216
223
|
# ない ナイ
|
217
224
|
# 。 。
|
218
225
|
# EOS
|
219
|
-
#
|
220
|
-
# @
|
221
|
-
# @raise [MeCabError] if `mecab` cannot be initialized with the given `options`
|
226
|
+
# @param options [Hash, String] the MeCab options
|
227
|
+
# @raise [MeCabError] if MeCab cannot be initialized with the given `options`
|
222
228
|
def initialize(options={})
|
223
229
|
@options = self.class.parse_mecab_options(options)
|
224
|
-
@dicts = []
|
225
|
-
# TODO invoke function for enhancing MeCabNode after this point
|
226
|
-
|
227
230
|
opt_str = self.class.build_options_str(@options)
|
228
|
-
|
231
|
+
|
232
|
+
@model = self.class.mecab_model_new2(opt_str)
|
233
|
+
if @model.address == 0x0
|
234
|
+
raise MeCabError.new("Could not initialize Model with options: '#{opt_str}'")
|
235
|
+
end
|
236
|
+
|
237
|
+
@tagger = self.class.mecab_model_new_tagger(@model)
|
238
|
+
if @tagger.address == 0x0
|
239
|
+
raise MeCabError.new("Could not initialize Tagger with options: '#{opt_str}'")
|
240
|
+
end
|
241
|
+
|
242
|
+
@lattice = self.class.mecab_model_new_lattice(@model)
|
243
|
+
if @lattice.address == 0x0
|
244
|
+
raise MeCabError.new("Could not initialize Lattice with options: '#{opt_str}'")
|
245
|
+
end
|
246
|
+
|
229
247
|
@libpath = self.class.find_library
|
230
|
-
raise MeCabError.new("Could not initialize MeCab with options: '#{opt_str}'") if @tagger.address == 0x0
|
231
|
-
|
232
|
-
self.mecab_set_theta(@tagger, @options[:theta]) if @options[:theta]
|
233
|
-
self.mecab_set_lattice_level(@tagger, @options[:lattice_level]) if @options[:lattice_level]
|
234
|
-
self.mecab_set_all_morphs(@tagger, 1) if @options[:all_morphs]
|
235
|
-
self.mecab_set_partial(@tagger, 1) if @options[:partial]
|
236
|
-
|
237
|
-
# Define lambda for each major parsing type: _tostr, _tonode,
|
238
|
-
# boundary constraint _tostr, boundary constraint _node;
|
239
|
-
# and each parsing type will support both normal and N-best
|
240
|
-
# options
|
241
|
-
@parse_tostr = ->(text) {
|
242
|
-
if @options[:nbest] && @options[:nbest] > 1
|
243
|
-
#self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
|
244
|
-
retval = self.mecab_nbest_sparse_tostr(@tagger, @options[:nbest], text) ||
|
245
|
-
raise(MeCabError.new(self.mecab_strerror(@tagger)))
|
246
|
-
else
|
247
|
-
retval = self.mecab_sparse_tostr(@tagger, text) ||
|
248
|
-
raise(MeCabError.new(self.mecab_strerror(@tagger)))
|
249
|
-
end
|
250
248
|
|
251
|
-
|
252
|
-
|
249
|
+
if @options[:nbest] && @options[:nbest] > 1
|
250
|
+
self.mecab_lattice_set_request_type(@lattice, MECAB_LATTICE_NBEST)
|
251
|
+
else
|
252
|
+
self.mecab_lattice_set_request_type(@lattice, MECAB_LATTICE_ONE_BEST)
|
253
|
+
end
|
254
|
+
if @options[:partial]
|
255
|
+
self.mecab_lattice_add_request_type(@lattice, MECAB_LATTICE_PARTIAL)
|
256
|
+
end
|
257
|
+
if @options[:marginal]
|
258
|
+
self.mecab_lattice_add_request_type(@lattice,
|
259
|
+
MECAB_LATTICE_MARGINAL_PROB)
|
260
|
+
end
|
261
|
+
if @options[:all_morphs]
|
262
|
+
# required when node parsing
|
263
|
+
#self.mecab_lattice_add_request_type(@lattice, MECAB_LATTICE_NBEST)
|
264
|
+
self.mecab_lattice_add_request_type(@lattice,
|
265
|
+
MECAB_LATTICE_ALL_MORPHS)
|
266
|
+
end
|
267
|
+
if @options[:allocate_sentence]
|
268
|
+
self.mecab_lattice_add_request_type(@lattice,
|
269
|
+
MECAB_LATTICE_ALLOCATE_SENTENCE)
|
270
|
+
end
|
253
271
|
|
254
|
-
@
|
255
|
-
|
256
|
-
|
257
|
-
nlen = @options[:nbest]
|
258
|
-
#self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
|
259
|
-
self.mecab_nbest_init(@tagger, text)
|
260
|
-
nptr = self.mecab_nbest_next_tonode(@tagger)
|
261
|
-
else
|
262
|
-
nlen = 1
|
263
|
-
nptr = self.mecab_sparse_tonode(@tagger, text)
|
264
|
-
end
|
265
|
-
raise(MeCabError.new(self.mecab_strerror(@tagger))) if nptr.nil? || nptr.address==0x0
|
266
|
-
|
267
|
-
nlen.times do
|
268
|
-
s = text.bytes.to_a
|
269
|
-
while nptr && nptr.address != 0x0
|
270
|
-
mn = Natto::MeCabNode.new(nptr)
|
271
|
-
# ignore BOS nodes, since mecab does so
|
272
|
-
if !mn.is_bos?
|
273
|
-
s = s.drop_while {|e| (e==0xa || e==0x20)}
|
274
|
-
if !s.empty?
|
275
|
-
sarr = []
|
276
|
-
mn.length.times { sarr << s.shift }
|
277
|
-
surf = sarr.pack('C*')
|
278
|
-
mn.surface = surf.force_encoding(Encoding.default_external)
|
279
|
-
end
|
280
|
-
if @options[:output_format_type] || @options[:node_format]
|
281
|
-
mn.feature = self.mecab_format_node(@tagger, nptr).force_encoding(Encoding.default_external)
|
282
|
-
end
|
283
|
-
y.yield mn
|
284
|
-
end
|
285
|
-
nptr = mn.next
|
286
|
-
end
|
287
|
-
if nlen > 1
|
288
|
-
nptr = self.mecab_nbest_next_tonode(@tagger)
|
289
|
-
end
|
290
|
-
end
|
291
|
-
end
|
292
|
-
}
|
293
|
-
|
294
|
-
@bcparse_tostr = ->(text, boundary_constraints=/./) {
|
295
|
-
begin
|
296
|
-
lattice = self.mecab_lattice_new()
|
297
|
-
raise MeCabError.new("Could not create Lattice") if lattice.address == 0x0
|
272
|
+
if @options[:theta]
|
273
|
+
self.mecab_lattice_set_theta(@lattice, @options[:theta])
|
274
|
+
end
|
298
275
|
|
276
|
+
@parse_tostr = ->(text, constraints) {
|
277
|
+
begin
|
299
278
|
if @options[:nbest] && @options[:nbest] > 1
|
300
279
|
n = @options[:nbest]
|
301
|
-
self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_NBEST)
|
302
280
|
else
|
303
281
|
n = 1
|
304
|
-
self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_ONE_BEST)
|
305
|
-
end
|
306
|
-
if @options[:theta]
|
307
|
-
self.mecab_lattice_set_theta(lattice, @options[:theta])
|
308
282
|
end
|
309
283
|
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
tokens.each do |token|
|
316
|
-
c = token.first.bytes.count
|
284
|
+
if constraints[:boundary_constraints]
|
285
|
+
tokens = tokenize_by_pattern(text,
|
286
|
+
constraints[:boundary_constraints])
|
287
|
+
text = tokens.map {|t| t.first}.join
|
288
|
+
self.mecab_lattice_set_sentence(@lattice, text)
|
317
289
|
|
318
|
-
|
319
|
-
|
290
|
+
bpos = 0
|
291
|
+
tokens.each do |token|
|
292
|
+
c = token.first.bytes.count
|
320
293
|
|
321
|
-
|
322
|
-
|
323
|
-
|
294
|
+
self.mecab_lattice_set_boundary_constraint(@lattice,
|
295
|
+
bpos,
|
296
|
+
MECAB_TOKEN_BOUNDARY)
|
324
297
|
bpos += 1
|
298
|
+
|
299
|
+
mark = token.last ? MECAB_INSIDE_TOKEN : MECAB_ANY_BOUNDARY
|
300
|
+
(c-1).times do
|
301
|
+
self.mecab_lattice_set_boundary_constraint(@lattice,
|
302
|
+
bpos,
|
303
|
+
mark)
|
304
|
+
bpos += 1
|
305
|
+
end
|
325
306
|
end
|
307
|
+
elsif constraints[:feature_constraints]
|
308
|
+
features = constraints[:feature_constraints]
|
309
|
+
tokens = tokenize_by_features(text,
|
310
|
+
features.keys)
|
311
|
+
text = tokens.map {|t| t.first}.join
|
312
|
+
self.mecab_lattice_set_sentence(@lattice, text)
|
313
|
+
|
314
|
+
bpos = 0
|
315
|
+
tokens.each do |token|
|
316
|
+
chunk = token.first
|
317
|
+
c = chunk.bytes.count
|
318
|
+
if token.last
|
319
|
+
self.mecab_lattice_set_feature_constraint(@lattice,
|
320
|
+
bpos,
|
321
|
+
bpos+c,
|
322
|
+
features[chunk])
|
323
|
+
end
|
324
|
+
bpos += c
|
325
|
+
end
|
326
|
+
else
|
327
|
+
self.mecab_lattice_set_sentence(@lattice, text)
|
326
328
|
end
|
327
329
|
|
328
|
-
self.mecab_parse_lattice(@tagger, lattice)
|
330
|
+
self.mecab_parse_lattice(@tagger, @lattice)
|
329
331
|
|
330
332
|
if n > 1
|
331
|
-
retval = self.mecab_lattice_nbest_tostr(lattice, n)
|
333
|
+
retval = self.mecab_lattice_nbest_tostr(@lattice, n)
|
332
334
|
else
|
333
|
-
retval = self.mecab_lattice_tostr(lattice)
|
335
|
+
retval = self.mecab_lattice_tostr(@lattice)
|
334
336
|
end
|
335
337
|
retval.force_encoding(Encoding.default_external)
|
336
338
|
rescue
|
337
|
-
raise(MeCabError.new(self.mecab_lattice_strerror(lattice)))
|
338
|
-
ensure
|
339
|
-
if lattice.address != 0x0
|
340
|
-
self.mecab_lattice_destroy(lattice)
|
341
|
-
end
|
339
|
+
raise(MeCabError.new(self.mecab_lattice_strerror(@lattice)))
|
342
340
|
end
|
343
341
|
}
|
344
342
|
|
345
|
-
@
|
343
|
+
@parse_tonodes = ->(text, constraints) {
|
344
|
+
self.mecab_lattice_add_request_type(@lattice, MECAB_LATTICE_NBEST)
|
346
345
|
Enumerator.new do |y|
|
347
346
|
begin
|
348
|
-
lattice = self.mecab_lattice_new()
|
349
|
-
raise MeCabError.new("Could not create Lattice") if lattice.address == 0x0
|
350
|
-
|
351
347
|
if @options[:nbest] && @options[:nbest] > 1
|
352
348
|
n = @options[:nbest]
|
353
|
-
self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_NBEST)
|
354
349
|
else
|
355
350
|
n = 1
|
356
|
-
self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_ONE_BEST)
|
357
|
-
end
|
358
|
-
if @options[:theta]
|
359
|
-
self.mecab_lattice_set_theta(lattice, @options[:theta])
|
360
351
|
end
|
361
352
|
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
tokens.each do |token|
|
368
|
-
c = token.first.bytes.count
|
353
|
+
if constraints[:boundary_constraints]
|
354
|
+
tokens = tokenize_by_pattern(text,
|
355
|
+
constraints[:boundary_constraints])
|
356
|
+
text = tokens.map {|t| t.first}.join
|
357
|
+
self.mecab_lattice_set_sentence(@lattice, text)
|
369
358
|
|
370
|
-
|
371
|
-
|
359
|
+
bpos = 0
|
360
|
+
tokens.each do |token|
|
361
|
+
c = token.first.bytes.count
|
372
362
|
|
373
|
-
|
374
|
-
|
375
|
-
|
363
|
+
self.mecab_lattice_set_boundary_constraint(@lattice,
|
364
|
+
bpos,
|
365
|
+
MECAB_TOKEN_BOUNDARY)
|
376
366
|
bpos += 1
|
367
|
+
|
368
|
+
mark = token.last ? MECAB_INSIDE_TOKEN : MECAB_ANY_BOUNDARY
|
369
|
+
(c-1).times do
|
370
|
+
self.mecab_lattice_set_boundary_constraint(@lattice, bpos, mark)
|
371
|
+
bpos += 1
|
372
|
+
end
|
373
|
+
end
|
374
|
+
elsif constraints[:feature_constraints]
|
375
|
+
features = constraints[:feature_constraints]
|
376
|
+
tokens = tokenize_by_features(text,
|
377
|
+
features.keys)
|
378
|
+
text = tokens.map {|t| t.first}.join
|
379
|
+
self.mecab_lattice_set_sentence(@lattice, text)
|
380
|
+
|
381
|
+
bpos = 0
|
382
|
+
tokens.each do |token|
|
383
|
+
chunk = token.first
|
384
|
+
c = chunk.bytes.count
|
385
|
+
if token.last
|
386
|
+
self.mecab_lattice_set_feature_constraint(@lattice,
|
387
|
+
bpos,
|
388
|
+
bpos+c,
|
389
|
+
features[chunk])
|
390
|
+
end
|
391
|
+
bpos += c
|
377
392
|
end
|
393
|
+
else
|
394
|
+
self.mecab_lattice_set_sentence(@lattice, text)
|
378
395
|
end
|
379
396
|
|
380
|
-
self.mecab_parse_lattice(@tagger, lattice)
|
397
|
+
self.mecab_parse_lattice(@tagger, @lattice)
|
381
398
|
|
382
399
|
n.times do
|
383
|
-
check = self.mecab_lattice_next(lattice)
|
400
|
+
check = self.mecab_lattice_next(@lattice)
|
384
401
|
if check
|
385
|
-
nptr = self.mecab_lattice_get_bos_node(lattice)
|
402
|
+
nptr = self.mecab_lattice_get_bos_node(@lattice)
|
386
403
|
|
387
|
-
s = text.bytes.to_a
|
388
404
|
while nptr && nptr.address!=0x0
|
389
405
|
mn = Natto::MeCabNode.new(nptr)
|
390
|
-
|
391
|
-
|
392
|
-
sarr = []
|
393
|
-
mn.length.times { sarr << s.shift }
|
394
|
-
surf = sarr.pack('C*')
|
406
|
+
if !mn.is_bos?
|
407
|
+
surf = mn[:surface].bytes.to_a.slice(0,mn.length).pack('C*')
|
395
408
|
mn.surface = surf.force_encoding(Encoding.default_external)
|
409
|
+
if @options[:output_format_type] || @options[:node_format]
|
410
|
+
mn.feature = self.mecab_format_node(@tagger, nptr).force_encoding(Encoding.default_external)
|
411
|
+
end
|
412
|
+
y.yield mn
|
396
413
|
end
|
397
|
-
|
398
|
-
mn.feature = self.mecab_format_node(@tagger, nptr).force_encoding(Encoding.default_external)
|
399
|
-
end
|
400
|
-
y.yield mn
|
401
|
-
nptr = mn.next
|
414
|
+
nptr = mn[:next]
|
402
415
|
end
|
403
416
|
end
|
404
417
|
end
|
418
|
+
nil
|
405
419
|
rescue
|
406
|
-
raise(MeCabError.new(self.mecab_lattice_strerror(lattice)))
|
407
|
-
ensure
|
408
|
-
if lattice.address != 0x0
|
409
|
-
self.mecab_lattice_destroy(lattice)
|
410
|
-
end
|
420
|
+
raise(MeCabError.new(self.mecab_lattice_strerror(@lattice)))
|
411
421
|
end
|
412
422
|
end
|
413
423
|
}
|
414
424
|
|
415
|
-
@dicts
|
425
|
+
@dicts = []
|
426
|
+
@dicts << Natto::DictionaryInfo.new(self.mecab_model_dictionary_info(@model))
|
416
427
|
while @dicts.last.next.address != 0x0
|
417
428
|
@dicts << Natto::DictionaryInfo.new(@dicts.last.next)
|
418
429
|
end
|
419
430
|
|
420
431
|
@version = self.mecab_version
|
421
432
|
|
422
|
-
ObjectSpace.define_finalizer(self, self.class.create_free_proc(@
|
433
|
+
ObjectSpace.define_finalizer(self, self.class.create_free_proc(@model,
|
434
|
+
@tagger,
|
435
|
+
@lattice))
|
423
436
|
end
|
424
437
|
|
425
438
|
# Parses the given `text`, returning the MeCab output as a single string.
|
@@ -430,36 +443,45 @@ module Natto
|
|
430
443
|
# `boundary_constraints` key in the `options` hash. Boundary constraints
|
431
444
|
# parsing provides hints to MeCab on where the morpheme boundaries in the
|
432
445
|
# given `text` are located. `boundary_constraints` value may be either a
|
433
|
-
# `Regexp` or `String`; please see
|
434
|
-
# [String#scan](http://ruby-doc.org/core-2.2.0/String.html#method-i-scan String#scan.
|
446
|
+
# `Regexp` or `String`; please see [String#scan](http://ruby-doc.org/core-2.2.1/String.html#method-i-scan).
|
435
447
|
# The boundary constraint parsed output will be returned as a single
|
436
448
|
# string, unless a block is passed to this method for node parsing.
|
437
449
|
#
|
450
|
+
# Feature constraint parsing is available by passing in the
|
451
|
+
# `feature_constraints` key in the `options` hash. Feature constraints
|
452
|
+
# parsing provides instructions to MeCab to use the feature indicated
|
453
|
+
# for any morpheme that is an exact match for the given key.
|
454
|
+
# `feature_constraints` is a hash mapping a specific morpheme (String)
|
455
|
+
# to a corresponding feature value (String).
|
438
456
|
# @param text [String] the Japanese text to parse
|
439
|
-
# @param
|
440
|
-
# @return [String] parsing result from
|
441
|
-
# @raise [MeCabError] if the
|
457
|
+
# @param constraints [Hash] `boundary_constraints` or `feature_constraints`
|
458
|
+
# @return [String] parsing result from MeCab
|
459
|
+
# @raise [MeCabError] if the MeCab Tagger cannot parse the given `text`
|
442
460
|
# @raise [ArgumentError] if the given string `text` argument is `nil`
|
443
461
|
# @see MeCabNode
|
444
|
-
def parse(text,
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
462
|
+
def parse(text, constraints={})
|
463
|
+
if text.nil?
|
464
|
+
raise ArgumentError.new 'Text to parse cannot be nil'
|
465
|
+
elsif constraints[:boundary_constraints]
|
466
|
+
if !(constraints[:boundary_constraints].is_a?(Regexp) ||
|
467
|
+
constraints[:boundary_constraints].is_a?(String))
|
468
|
+
raise ArgumentError.new 'boundary constraints must be a Regexp or String'
|
451
469
|
end
|
470
|
+
elsif constraints[:feature_constraints] && !constraints[:feature_constraints].is_a?(Hash)
|
471
|
+
raise ArgumentError.new 'feature constraints must be a Hash'
|
472
|
+
elsif @options[:partial] && !text.end_with?("\n")
|
473
|
+
raise ArgumentError.new 'partial parsing requires new-line char at end of text'
|
474
|
+
end
|
475
|
+
|
476
|
+
if block_given?
|
477
|
+
@parse_tonodes.call(text, constraints).each {|n| yield n }
|
452
478
|
else
|
453
|
-
|
454
|
-
@parse_tonodes.call(text).each {|n| yield n }
|
455
|
-
else
|
456
|
-
@parse_tostr.call(text)
|
457
|
-
end
|
479
|
+
@parse_tostr.call(text, constraints)
|
458
480
|
end
|
459
481
|
end
|
460
482
|
|
461
483
|
# Parses the given string `text`, returning an
|
462
|
-
#
|
484
|
+
# [Enumerator](http://www.ruby-doc.org/core-2.2.1/Enumerator.html) that may be
|
463
485
|
# used to iterate over the resulting {MeCabNode} objects. This is more
|
464
486
|
# efficient than parsing to a simple string, since each node's
|
465
487
|
# information will not be materialized all at once as it is with
|
@@ -469,45 +491,62 @@ module Natto
|
|
469
491
|
# the morpheme. Node-formatting may also be used to customize
|
470
492
|
# the resulting node's `feature` attribute.
|
471
493
|
#
|
472
|
-
# Boundary constraint parsing is available
|
494
|
+
# Boundary constraint parsing is available by passing in the
|
473
495
|
# `boundary_constraints` key in the `options` hash. Boundary constraints
|
474
496
|
# parsing provides hints to MeCab on where the morpheme boundaries in the
|
475
497
|
# given `text` are located. `boundary_constraints` value may be either a
|
476
498
|
# `Regexp` or `String`; please see
|
477
|
-
# [String#scan](http://ruby-doc.org/core-2.2.
|
499
|
+
# [String#scan](http://ruby-doc.org/core-2.2.1/String.html#method-i-scan)
|
478
500
|
#
|
501
|
+
# Feature constraint parsing is available by passing in the
|
502
|
+
# `feature_constraints` key in the `options` hash. Feature constraints
|
503
|
+
# parsing provides instructions to MeCab to use the feature indicated
|
504
|
+
# for any morpheme that is an exact match for the given key.
|
505
|
+
# `feature_constraints` is a hash mapping a specific morpheme (String)
|
506
|
+
# to a corresponding feature value (String).
|
479
507
|
# @param text [String] the Japanese text to parse
|
480
|
-
# @param
|
508
|
+
# @param constraints [Hash] `boundary_constraints` or `feature_constraints`
|
481
509
|
# @return [Enumerator] of MeCabNode instances
|
482
|
-
# @raise [MeCabError] if the
|
510
|
+
# @raise [MeCabError] if the MeCab Tagger cannot parse the given `text`
|
483
511
|
# @raise [ArgumentError] if the given string `text` argument is `nil`
|
484
512
|
# @see MeCabNode
|
485
513
|
# @see http://ruby-doc.org/core-2.2.1/Enumerator.html
|
486
|
-
def enum_parse(text,
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
514
|
+
def enum_parse(text, constraints={})
|
515
|
+
if text.nil?
|
516
|
+
raise ArgumentError.new 'Text to parse cannot be nil'
|
517
|
+
elsif constraints[:boundary_constraints]
|
518
|
+
if !(constraints[:boundary_constraints].is_a?(Regexp) ||
|
519
|
+
constraints[:boundary_constraints].is_a?(String))
|
520
|
+
raise ArgumentError.new 'boundary constraints must be a Regexp or String'
|
521
|
+
end
|
522
|
+
elsif constraints[:feature_constraints] && !constraints[:feature_constraints].is_a?(Hash)
|
523
|
+
raise ArgumentError.new 'feature constraints must be a Hash'
|
524
|
+
elsif @options[:partial] && !text.end_with?("\n")
|
525
|
+
raise ArgumentError.new 'partial parsing requires new-line char at end of text'
|
492
526
|
end
|
527
|
+
|
528
|
+
@parse_tonodes.call(text, constraints)
|
493
529
|
end
|
494
530
|
|
495
|
-
# Returns human-readable details for the wrapped
|
531
|
+
# Returns human-readable details for the wrapped MeCab library.
|
496
532
|
# Overrides `Object#to_s`.
|
497
533
|
#
|
498
534
|
# - encoded object id
|
499
|
-
# - underlying FFI pointer to the
|
500
|
-
# -
|
535
|
+
# - underlying FFI pointer to the MeCab Model
|
536
|
+
# - underlying FFI pointer to the MeCab Tagger
|
537
|
+
# - underlying FFI pointer to the MeCab Lattice
|
538
|
+
# - real file path to MeCab library
|
501
539
|
# - options hash
|
502
540
|
# - list of dictionaries
|
503
541
|
# - MeCab version
|
504
|
-
#
|
505
542
|
# @return [String] encoded object id, underlying FFI pointer,
|
506
|
-
# file path to
|
543
|
+
# file path to MeCab library, options hash,
|
507
544
|
# list of dictionaries and MeCab version
|
508
545
|
def to_s
|
509
546
|
[ super.chop,
|
547
|
+
"@model=#{@model},",
|
510
548
|
"@tagger=#{@tagger},",
|
549
|
+
"@lattice=#{@lattice},",
|
511
550
|
"@libpath=\"#{@libpath}\",",
|
512
551
|
"@options=#{@options.inspect},",
|
513
552
|
"@dicts=#{@dicts.to_s},",
|
@@ -515,7 +554,6 @@ module Natto
|
|
515
554
|
end
|
516
555
|
|
517
556
|
# Overrides `Object#inspect`.
|
518
|
-
#
|
519
557
|
# @return [String] encoded object id, FFI pointer, options hash,
|
520
558
|
# list of dictionaries, and MeCab version
|
521
559
|
# @see #to_s
|
@@ -524,26 +562,27 @@ module Natto
|
|
524
562
|
end
|
525
563
|
|
526
564
|
# Returns a `Proc` that will properly free resources
|
527
|
-
# when this
|
528
|
-
#
|
529
|
-
#
|
530
|
-
#
|
531
|
-
#
|
532
|
-
|
533
|
-
# @return [Proc] to release `mecab` resources properly
|
534
|
-
def self.create_free_proc(tptr)
|
565
|
+
# when this instance is garbage collected.
|
566
|
+
# @param mptr [FFI::Pointer] pointer to Model
|
567
|
+
# @param tptr [FFI::Pointer] pointer to Tagger
|
568
|
+
# @param lptr [FFI::Pointer] pointer to Lattice
|
569
|
+
# @return [Proc] to release MeCab resources properly
|
570
|
+
def self.create_free_proc(mptr, tptr, lptr)
|
535
571
|
Proc.new do
|
572
|
+
self.mecab_lattice_destroy(lptr)
|
536
573
|
self.mecab_destroy(tptr)
|
574
|
+
self.mecab_model_destory(mptr)
|
537
575
|
end
|
538
576
|
end
|
539
577
|
|
540
578
|
private
|
541
579
|
|
542
580
|
# @private
|
543
|
-
|
581
|
+
# MeCab eats all leading and trailing whitespace characters
|
582
|
+
def tokenize_by_pattern(text, pattern)
|
544
583
|
matches = text.scan(pattern)
|
545
584
|
|
546
|
-
acc =[]
|
585
|
+
acc = []
|
547
586
|
tmp = text
|
548
587
|
matches.each_with_index do |m,i|
|
549
588
|
bef, mat, aft = tmp.partition(m)
|
@@ -553,17 +592,34 @@ module Natto
|
|
553
592
|
unless mat.empty?
|
554
593
|
acc << [mat.strip, true]
|
555
594
|
end
|
556
|
-
if i==matches.size-1
|
595
|
+
if i==matches.size-1 && !aft.empty?
|
557
596
|
acc << [aft.strip, false]
|
558
597
|
end
|
559
598
|
tmp = aft
|
560
599
|
end
|
561
600
|
acc
|
562
601
|
end
|
602
|
+
|
603
|
+
def tokenize_by_features(text, features)
|
604
|
+
acc = []
|
605
|
+
acc << [text.strip, false]
|
606
|
+
|
607
|
+
features.each do |feature|
|
608
|
+
acc.each_with_index do |e,i|
|
609
|
+
if !e.last
|
610
|
+
tmp = tokenize_by_pattern(e.first, feature)
|
611
|
+
if !tmp.empty?
|
612
|
+
acc.delete_at(i)
|
613
|
+
acc.insert(i, *tmp)
|
614
|
+
end
|
615
|
+
end
|
616
|
+
end
|
617
|
+
end
|
618
|
+
acc
|
619
|
+
end
|
563
620
|
end
|
564
621
|
|
565
|
-
# `MeCabError` is a general error class
|
566
|
-
# for the `Natto` module.
|
622
|
+
# `MeCabError` is a general error class for the `Natto` module.
|
567
623
|
class MeCabError < RuntimeError; end
|
568
624
|
end
|
569
625
|
|