natto 0.9.9 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG +13 -0
- data/README.md +49 -12
- data/lib/natto.rb +1 -0
- data/lib/natto/binding.rb +114 -131
- data/lib/natto/natto.rb +266 -210
- data/lib/natto/option_parse.rb +6 -7
- data/lib/natto/struct.rb +21 -29
- data/lib/natto/version.rb +5 -6
- metadata +21 -31
data/lib/natto/natto.rb
CHANGED
@@ -4,9 +4,9 @@ require 'natto/option_parse'
|
|
4
4
|
require 'natto/struct'
|
5
5
|
|
6
6
|
module Natto
|
7
|
-
# `MeCab` is a
|
8
|
-
# Options to the MeCab
|
9
|
-
# (MeCab command-line style) or as a Ruby-style hash at
|
7
|
+
# `MeCab` is a class providing an interface to the MeCab library.
|
8
|
+
# Options to the MeCab Model, Tagger and Lattice are passed in
|
9
|
+
# as a string (MeCab command-line style) or as a Ruby-style hash at
|
10
10
|
# initialization.
|
11
11
|
#
|
12
12
|
# ## Usage
|
@@ -16,14 +16,16 @@ module Natto
|
|
16
16
|
# text = '凡人にしか見えねえ風景ってのがあるんだよ。'
|
17
17
|
#
|
18
18
|
# nm = Natto::MeCab.new
|
19
|
-
# => #<Natto::MeCab:
|
20
|
-
# @
|
21
|
-
# @
|
22
|
-
# @
|
23
|
-
# @
|
19
|
+
# => #<Natto::MeCab:0x0000080318d278 \
|
20
|
+
# @model=#<FFI::Pointer address=0x000008039174c0>, \
|
21
|
+
# @tagger=#<FFI::Pointer address=0x0000080329ba60>, \
|
22
|
+
# @lattice=#<FFI::Pointer address=0x000008045bd140>, \
|
23
|
+
#           @libpath="/usr/local/lib/libmecab.so", \
|
24
|
+
# @options={}, \
|
25
|
+
# @dicts=[#<Natto::DictionaryInfo:0x0000080318ce90 \
|
24
26
|
# @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
25
|
-
# charset=utf8,
|
26
|
-
# type=0>],
|
27
|
+
# charset=utf8, \
|
28
|
+
# type=0>], \
|
27
29
|
# @version=0.996>
|
28
30
|
#
|
29
31
|
# # print entire MeCab result to stdout
|
@@ -104,7 +106,7 @@ module Natto
|
|
104
106
|
#
|
105
107
|
# # Boundary constraint parsing with output formatting.
|
106
108
|
# # %m ... morpheme surface
|
107
|
-
# # %
|
109
|
+
# # %f ... tab-delimited ChaSen feature values
|
108
110
|
# # part-of-speech (index 0)
|
109
111
|
# # %2 ... MeCab node status value (1 unknown)
|
110
112
|
# #
|
@@ -148,19 +150,22 @@ module Natto
|
|
148
150
|
MECAB_TOKEN_BOUNDARY = 1
|
149
151
|
MECAB_INSIDE_TOKEN = 2
|
150
152
|
|
151
|
-
# @return [FFI:Pointer] pointer to MeCab
|
153
|
+
# @return [FFI::Pointer] pointer to MeCab Model.
|
154
|
+
attr_reader :model
|
155
|
+
# @return [FFI::Pointer] pointer to MeCab Tagger.
|
152
156
|
attr_reader :tagger
|
157
|
+
# @return [FFI::Pointer] pointer to MeCab Lattice.
|
158
|
+
attr_reader :lattice
|
153
159
|
# @return [String] absolute filepath to MeCab library.
|
154
160
|
attr_reader :libpath
|
155
161
|
# @return [Hash] MeCab options as key-value pairs.
|
156
162
|
attr_reader :options
|
157
163
|
# @return [Array] listing of all of dictionaries referenced.
|
158
164
|
attr_reader :dicts
|
159
|
-
# @return [String]
|
165
|
+
# @return [String] MeCab version.
|
160
166
|
attr_reader :version
|
161
167
|
|
162
|
-
# Initializes the wrapped
|
163
|
-
# given `options`.
|
168
|
+
# Initializes the wrapped Tagger instance with the given `options`.
|
164
169
|
#
|
165
170
|
# Options supported are:
|
166
171
|
#
|
@@ -186,19 +191,21 @@ module Natto
|
|
186
191
|
# - :cost_factor -- cost factor (integer, default 700)
|
187
192
|
#
|
188
193
|
# <p>MeCab command-line arguments (-F) or long (--node-format) may be used in
|
189
|
-
# addition to Ruby-style
|
194
|
+
# addition to Ruby-style hashes.</p>
|
190
195
|
# <i>Use single-quotes to preserve format options that contain escape chars.</i><br/>
|
191
196
|
# e.g.<br/>
|
192
197
|
#
|
193
198
|
# nm = Natto::MeCab.new(node_format: '%m¥t%f[7]¥n')
|
194
|
-
# => #<Natto::MeCab:
|
195
|
-
# @
|
196
|
-
# @
|
197
|
-
# @
|
198
|
-
# @
|
199
|
+
# => #<Natto::MeCab:0x00000803503ee8 \
|
200
|
+
# @model=#<FFI::Pointer address=0x00000802b6d9c0>, \
|
201
|
+
# @tagger=#<FFI::Pointer address=0x00000802ad3ec0>, \
|
202
|
+
# @lattice=#<FFI::Pointer address=0x000008035f3980>, \
|
203
|
+
# @libpath="/usr/local/lib/libmecab.so", \
|
204
|
+
# @options={:node_format=>"%m¥t%f[7]¥n"}, \
|
205
|
+
# @dicts=[#<Natto::DictionaryInfo:0x000008035038f8 \
|
199
206
|
#                   @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
200
|
-
# charset=utf8,
|
201
|
-
# type=0>]
|
207
|
+
# charset=utf8, \
|
208
|
+
#                   type=0>], \
|
202
209
|
# @version=0.996>
|
203
210
|
#
|
204
211
|
# puts nm.parse('才能とは求める人間に与えられるものではない。')
|
@@ -216,210 +223,216 @@ module Natto
|
|
216
223
|
# ない ナイ
|
217
224
|
# 。 。
|
218
225
|
# EOS
|
219
|
-
#
|
220
|
-
# @
|
221
|
-
# @raise [MeCabError] if `mecab` cannot be initialized with the given `options`
|
226
|
+
# @param options [Hash, String] the MeCab options
|
227
|
+
# @raise [MeCabError] if MeCab cannot be initialized with the given `options`
|
222
228
|
def initialize(options={})
|
223
229
|
@options = self.class.parse_mecab_options(options)
|
224
|
-
@dicts = []
|
225
|
-
# TODO invoke function for enhancing MeCabNode after this point
|
226
|
-
|
227
230
|
opt_str = self.class.build_options_str(@options)
|
228
|
-
|
231
|
+
|
232
|
+
@model = self.class.mecab_model_new2(opt_str)
|
233
|
+
if @model.address == 0x0
|
234
|
+
raise MeCabError.new("Could not initialize Model with options: '#{opt_str}'")
|
235
|
+
end
|
236
|
+
|
237
|
+
@tagger = self.class.mecab_model_new_tagger(@model)
|
238
|
+
if @tagger.address == 0x0
|
239
|
+
raise MeCabError.new("Could not initialize Tagger with options: '#{opt_str}'")
|
240
|
+
end
|
241
|
+
|
242
|
+
@lattice = self.class.mecab_model_new_lattice(@model)
|
243
|
+
if @lattice.address == 0x0
|
244
|
+
raise MeCabError.new("Could not initialize Lattice with options: '#{opt_str}'")
|
245
|
+
end
|
246
|
+
|
229
247
|
@libpath = self.class.find_library
|
230
|
-
raise MeCabError.new("Could not initialize MeCab with options: '#{opt_str}'") if @tagger.address == 0x0
|
231
|
-
|
232
|
-
self.mecab_set_theta(@tagger, @options[:theta]) if @options[:theta]
|
233
|
-
self.mecab_set_lattice_level(@tagger, @options[:lattice_level]) if @options[:lattice_level]
|
234
|
-
self.mecab_set_all_morphs(@tagger, 1) if @options[:all_morphs]
|
235
|
-
self.mecab_set_partial(@tagger, 1) if @options[:partial]
|
236
|
-
|
237
|
-
# Define lambda for each major parsing type: _tostr, _tonode,
|
238
|
-
# boundary constraint _tostr, boundary constraint _node;
|
239
|
-
# and each parsing type will support both normal and N-best
|
240
|
-
# options
|
241
|
-
@parse_tostr = ->(text) {
|
242
|
-
if @options[:nbest] && @options[:nbest] > 1
|
243
|
-
#self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
|
244
|
-
retval = self.mecab_nbest_sparse_tostr(@tagger, @options[:nbest], text) ||
|
245
|
-
raise(MeCabError.new(self.mecab_strerror(@tagger)))
|
246
|
-
else
|
247
|
-
retval = self.mecab_sparse_tostr(@tagger, text) ||
|
248
|
-
raise(MeCabError.new(self.mecab_strerror(@tagger)))
|
249
|
-
end
|
250
248
|
|
251
|
-
|
252
|
-
|
249
|
+
if @options[:nbest] && @options[:nbest] > 1
|
250
|
+
self.mecab_lattice_set_request_type(@lattice, MECAB_LATTICE_NBEST)
|
251
|
+
else
|
252
|
+
self.mecab_lattice_set_request_type(@lattice, MECAB_LATTICE_ONE_BEST)
|
253
|
+
end
|
254
|
+
if @options[:partial]
|
255
|
+
self.mecab_lattice_add_request_type(@lattice, MECAB_LATTICE_PARTIAL)
|
256
|
+
end
|
257
|
+
if @options[:marginal]
|
258
|
+
self.mecab_lattice_add_request_type(@lattice,
|
259
|
+
MECAB_LATTICE_MARGINAL_PROB)
|
260
|
+
end
|
261
|
+
if @options[:all_morphs]
|
262
|
+
# required when node parsing
|
263
|
+
#self.mecab_lattice_add_request_type(@lattice, MECAB_LATTICE_NBEST)
|
264
|
+
self.mecab_lattice_add_request_type(@lattice,
|
265
|
+
MECAB_LATTICE_ALL_MORPHS)
|
266
|
+
end
|
267
|
+
if @options[:allocate_sentence]
|
268
|
+
self.mecab_lattice_add_request_type(@lattice,
|
269
|
+
MECAB_LATTICE_ALLOCATE_SENTENCE)
|
270
|
+
end
|
253
271
|
|
254
|
-
@
|
255
|
-
|
256
|
-
|
257
|
-
nlen = @options[:nbest]
|
258
|
-
#self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
|
259
|
-
self.mecab_nbest_init(@tagger, text)
|
260
|
-
nptr = self.mecab_nbest_next_tonode(@tagger)
|
261
|
-
else
|
262
|
-
nlen = 1
|
263
|
-
nptr = self.mecab_sparse_tonode(@tagger, text)
|
264
|
-
end
|
265
|
-
raise(MeCabError.new(self.mecab_strerror(@tagger))) if nptr.nil? || nptr.address==0x0
|
266
|
-
|
267
|
-
nlen.times do
|
268
|
-
s = text.bytes.to_a
|
269
|
-
while nptr && nptr.address != 0x0
|
270
|
-
mn = Natto::MeCabNode.new(nptr)
|
271
|
-
# ignore BOS nodes, since mecab does so
|
272
|
-
if !mn.is_bos?
|
273
|
-
s = s.drop_while {|e| (e==0xa || e==0x20)}
|
274
|
-
if !s.empty?
|
275
|
-
sarr = []
|
276
|
-
mn.length.times { sarr << s.shift }
|
277
|
-
surf = sarr.pack('C*')
|
278
|
-
mn.surface = surf.force_encoding(Encoding.default_external)
|
279
|
-
end
|
280
|
-
if @options[:output_format_type] || @options[:node_format]
|
281
|
-
mn.feature = self.mecab_format_node(@tagger, nptr).force_encoding(Encoding.default_external)
|
282
|
-
end
|
283
|
-
y.yield mn
|
284
|
-
end
|
285
|
-
nptr = mn.next
|
286
|
-
end
|
287
|
-
if nlen > 1
|
288
|
-
nptr = self.mecab_nbest_next_tonode(@tagger)
|
289
|
-
end
|
290
|
-
end
|
291
|
-
end
|
292
|
-
}
|
293
|
-
|
294
|
-
@bcparse_tostr = ->(text, boundary_constraints=/./) {
|
295
|
-
begin
|
296
|
-
lattice = self.mecab_lattice_new()
|
297
|
-
raise MeCabError.new("Could not create Lattice") if lattice.address == 0x0
|
272
|
+
if @options[:theta]
|
273
|
+
self.mecab_lattice_set_theta(@lattice, @options[:theta])
|
274
|
+
end
|
298
275
|
|
276
|
+
@parse_tostr = ->(text, constraints) {
|
277
|
+
begin
|
299
278
|
if @options[:nbest] && @options[:nbest] > 1
|
300
279
|
n = @options[:nbest]
|
301
|
-
self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_NBEST)
|
302
280
|
else
|
303
281
|
n = 1
|
304
|
-
self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_ONE_BEST)
|
305
|
-
end
|
306
|
-
if @options[:theta]
|
307
|
-
self.mecab_lattice_set_theta(lattice, @options[:theta])
|
308
282
|
end
|
309
283
|
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
tokens.each do |token|
|
316
|
-
c = token.first.bytes.count
|
284
|
+
if constraints[:boundary_constraints]
|
285
|
+
tokens = tokenize_by_pattern(text,
|
286
|
+
constraints[:boundary_constraints])
|
287
|
+
text = tokens.map {|t| t.first}.join
|
288
|
+
self.mecab_lattice_set_sentence(@lattice, text)
|
317
289
|
|
318
|
-
|
319
|
-
|
290
|
+
bpos = 0
|
291
|
+
tokens.each do |token|
|
292
|
+
c = token.first.bytes.count
|
320
293
|
|
321
|
-
|
322
|
-
|
323
|
-
|
294
|
+
self.mecab_lattice_set_boundary_constraint(@lattice,
|
295
|
+
bpos,
|
296
|
+
MECAB_TOKEN_BOUNDARY)
|
324
297
|
bpos += 1
|
298
|
+
|
299
|
+
mark = token.last ? MECAB_INSIDE_TOKEN : MECAB_ANY_BOUNDARY
|
300
|
+
(c-1).times do
|
301
|
+
self.mecab_lattice_set_boundary_constraint(@lattice,
|
302
|
+
bpos,
|
303
|
+
mark)
|
304
|
+
bpos += 1
|
305
|
+
end
|
325
306
|
end
|
307
|
+
elsif constraints[:feature_constraints]
|
308
|
+
features = constraints[:feature_constraints]
|
309
|
+
tokens = tokenize_by_features(text,
|
310
|
+
features.keys)
|
311
|
+
text = tokens.map {|t| t.first}.join
|
312
|
+
self.mecab_lattice_set_sentence(@lattice, text)
|
313
|
+
|
314
|
+
bpos = 0
|
315
|
+
tokens.each do |token|
|
316
|
+
chunk = token.first
|
317
|
+
c = chunk.bytes.count
|
318
|
+
if token.last
|
319
|
+
self.mecab_lattice_set_feature_constraint(@lattice,
|
320
|
+
bpos,
|
321
|
+
bpos+c,
|
322
|
+
features[chunk])
|
323
|
+
end
|
324
|
+
bpos += c
|
325
|
+
end
|
326
|
+
else
|
327
|
+
self.mecab_lattice_set_sentence(@lattice, text)
|
326
328
|
end
|
327
329
|
|
328
|
-
self.mecab_parse_lattice(@tagger, lattice)
|
330
|
+
self.mecab_parse_lattice(@tagger, @lattice)
|
329
331
|
|
330
332
|
if n > 1
|
331
|
-
retval = self.mecab_lattice_nbest_tostr(lattice, n)
|
333
|
+
retval = self.mecab_lattice_nbest_tostr(@lattice, n)
|
332
334
|
else
|
333
|
-
retval = self.mecab_lattice_tostr(lattice)
|
335
|
+
retval = self.mecab_lattice_tostr(@lattice)
|
334
336
|
end
|
335
337
|
retval.force_encoding(Encoding.default_external)
|
336
338
|
rescue
|
337
|
-
raise(MeCabError.new(self.mecab_lattice_strerror(lattice)))
|
338
|
-
ensure
|
339
|
-
if lattice.address != 0x0
|
340
|
-
self.mecab_lattice_destroy(lattice)
|
341
|
-
end
|
339
|
+
raise(MeCabError.new(self.mecab_lattice_strerror(@lattice)))
|
342
340
|
end
|
343
341
|
}
|
344
342
|
|
345
|
-
@
|
343
|
+
@parse_tonodes = ->(text, constraints) {
|
344
|
+
self.mecab_lattice_add_request_type(@lattice, MECAB_LATTICE_NBEST)
|
346
345
|
Enumerator.new do |y|
|
347
346
|
begin
|
348
|
-
lattice = self.mecab_lattice_new()
|
349
|
-
raise MeCabError.new("Could not create Lattice") if lattice.address == 0x0
|
350
|
-
|
351
347
|
if @options[:nbest] && @options[:nbest] > 1
|
352
348
|
n = @options[:nbest]
|
353
|
-
self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_NBEST)
|
354
349
|
else
|
355
350
|
n = 1
|
356
|
-
self.mecab_lattice_set_request_type(lattice, MECAB_LATTICE_ONE_BEST)
|
357
|
-
end
|
358
|
-
if @options[:theta]
|
359
|
-
self.mecab_lattice_set_theta(lattice, @options[:theta])
|
360
351
|
end
|
361
352
|
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
tokens.each do |token|
|
368
|
-
c = token.first.bytes.count
|
353
|
+
if constraints[:boundary_constraints]
|
354
|
+
tokens = tokenize_by_pattern(text,
|
355
|
+
constraints[:boundary_constraints])
|
356
|
+
text = tokens.map {|t| t.first}.join
|
357
|
+
self.mecab_lattice_set_sentence(@lattice, text)
|
369
358
|
|
370
|
-
|
371
|
-
|
359
|
+
bpos = 0
|
360
|
+
tokens.each do |token|
|
361
|
+
c = token.first.bytes.count
|
372
362
|
|
373
|
-
|
374
|
-
|
375
|
-
|
363
|
+
self.mecab_lattice_set_boundary_constraint(@lattice,
|
364
|
+
bpos,
|
365
|
+
MECAB_TOKEN_BOUNDARY)
|
376
366
|
bpos += 1
|
367
|
+
|
368
|
+
mark = token.last ? MECAB_INSIDE_TOKEN : MECAB_ANY_BOUNDARY
|
369
|
+
(c-1).times do
|
370
|
+
self.mecab_lattice_set_boundary_constraint(@lattice, bpos, mark)
|
371
|
+
bpos += 1
|
372
|
+
end
|
373
|
+
end
|
374
|
+
elsif constraints[:feature_constraints]
|
375
|
+
features = constraints[:feature_constraints]
|
376
|
+
tokens = tokenize_by_features(text,
|
377
|
+
features.keys)
|
378
|
+
text = tokens.map {|t| t.first}.join
|
379
|
+
self.mecab_lattice_set_sentence(@lattice, text)
|
380
|
+
|
381
|
+
bpos = 0
|
382
|
+
tokens.each do |token|
|
383
|
+
chunk = token.first
|
384
|
+
c = chunk.bytes.count
|
385
|
+
if token.last
|
386
|
+
self.mecab_lattice_set_feature_constraint(@lattice,
|
387
|
+
bpos,
|
388
|
+
bpos+c,
|
389
|
+
features[chunk])
|
390
|
+
end
|
391
|
+
bpos += c
|
377
392
|
end
|
393
|
+
else
|
394
|
+
self.mecab_lattice_set_sentence(@lattice, text)
|
378
395
|
end
|
379
396
|
|
380
|
-
self.mecab_parse_lattice(@tagger, lattice)
|
397
|
+
self.mecab_parse_lattice(@tagger, @lattice)
|
381
398
|
|
382
399
|
n.times do
|
383
|
-
check = self.mecab_lattice_next(lattice)
|
400
|
+
check = self.mecab_lattice_next(@lattice)
|
384
401
|
if check
|
385
|
-
nptr = self.mecab_lattice_get_bos_node(lattice)
|
402
|
+
nptr = self.mecab_lattice_get_bos_node(@lattice)
|
386
403
|
|
387
|
-
s = text.bytes.to_a
|
388
404
|
while nptr && nptr.address!=0x0
|
389
405
|
mn = Natto::MeCabNode.new(nptr)
|
390
|
-
|
391
|
-
|
392
|
-
sarr = []
|
393
|
-
mn.length.times { sarr << s.shift }
|
394
|
-
surf = sarr.pack('C*')
|
406
|
+
if !mn.is_bos?
|
407
|
+
surf = mn[:surface].bytes.to_a.slice(0,mn.length).pack('C*')
|
395
408
|
mn.surface = surf.force_encoding(Encoding.default_external)
|
409
|
+
if @options[:output_format_type] || @options[:node_format]
|
410
|
+
mn.feature = self.mecab_format_node(@tagger, nptr).force_encoding(Encoding.default_external)
|
411
|
+
end
|
412
|
+
y.yield mn
|
396
413
|
end
|
397
|
-
|
398
|
-
mn.feature = self.mecab_format_node(@tagger, nptr).force_encoding(Encoding.default_external)
|
399
|
-
end
|
400
|
-
y.yield mn
|
401
|
-
nptr = mn.next
|
414
|
+
nptr = mn[:next]
|
402
415
|
end
|
403
416
|
end
|
404
417
|
end
|
418
|
+
nil
|
405
419
|
rescue
|
406
|
-
raise(MeCabError.new(self.mecab_lattice_strerror(lattice)))
|
407
|
-
ensure
|
408
|
-
if lattice.address != 0x0
|
409
|
-
self.mecab_lattice_destroy(lattice)
|
410
|
-
end
|
420
|
+
raise(MeCabError.new(self.mecab_lattice_strerror(@lattice)))
|
411
421
|
end
|
412
422
|
end
|
413
423
|
}
|
414
424
|
|
415
|
-
@dicts
|
425
|
+
@dicts = []
|
426
|
+
@dicts << Natto::DictionaryInfo.new(self.mecab_model_dictionary_info(@model))
|
416
427
|
while @dicts.last.next.address != 0x0
|
417
428
|
@dicts << Natto::DictionaryInfo.new(@dicts.last.next)
|
418
429
|
end
|
419
430
|
|
420
431
|
@version = self.mecab_version
|
421
432
|
|
422
|
-
ObjectSpace.define_finalizer(self, self.class.create_free_proc(@
|
433
|
+
ObjectSpace.define_finalizer(self, self.class.create_free_proc(@model,
|
434
|
+
@tagger,
|
435
|
+
@lattice))
|
423
436
|
end
|
424
437
|
|
425
438
|
# Parses the given `text`, returning the MeCab output as a single string.
|
@@ -430,36 +443,45 @@ module Natto
|
|
430
443
|
# `boundary_constraints` key in the `constraints` hash. Boundary constraints
|
431
444
|
# parsing provides hints to MeCab on where the morpheme boundaries in the
|
432
445
|
# given `text` are located. `boundary_constraints` value may be either a
|
433
|
-
# `Regexp` or `String`; please see
|
434
|
-
# [String#scan](http://ruby-doc.org/core-2.2.0/String.html#method-i-scan String#scan.
|
446
|
+
# `Regexp` or `String`; please see [String#scan](http://ruby-doc.org/core-2.2.1/String.html#method-i-scan).
|
435
447
|
# The boundary constraint parsed output will be returned as a single
|
436
448
|
# string, unless a block is passed to this method for node parsing.
|
437
449
|
#
|
450
|
+
# Feature constraint parsing is available by passing in the
|
451
|
+
# `feature_constraints` key in the `constraints` hash. Feature constraints
|
452
|
+
# parsing provides instructions to MeCab to use the feature indicated
|
453
|
+
# for any morpheme that is an exact match for the given key.
|
454
|
+
# `feature_constraints` is a hash mapping a specific morpheme (String)
|
455
|
+
# to a corresponding feature value (String).
|
438
456
|
# @param text [String] the Japanese text to parse
|
439
|
-
# @param
|
440
|
-
# @return [String] parsing result from
|
441
|
-
# @raise [MeCabError] if the
|
457
|
+
# @param constraints [Hash] `boundary_constraints` or `feature_constraints`
|
458
|
+
# @return [String] parsing result from MeCab
|
459
|
+
# @raise [MeCabError] if the MeCab Tagger cannot parse the given `text`
|
442
460
|
# @raise [ArgumentError] if the given string `text` argument is `nil`
|
443
461
|
# @see MeCabNode
|
444
|
-
def parse(text,
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
462
|
+
def parse(text, constraints={})
|
463
|
+
if text.nil?
|
464
|
+
raise ArgumentError.new 'Text to parse cannot be nil'
|
465
|
+
elsif constraints[:boundary_constraints]
|
466
|
+
if !(constraints[:boundary_constraints].is_a?(Regexp) ||
|
467
|
+
constraints[:boundary_constraints].is_a?(String))
|
468
|
+
raise ArgumentError.new 'boundary constraints must be a Regexp or String'
|
451
469
|
end
|
470
|
+
elsif constraints[:feature_constraints] && !constraints[:feature_constraints].is_a?(Hash)
|
471
|
+
raise ArgumentError.new 'feature constraints must be a Hash'
|
472
|
+
elsif @options[:partial] && !text.end_with?("\n")
|
473
|
+
raise ArgumentError.new 'partial parsing requires new-line char at end of text'
|
474
|
+
end
|
475
|
+
|
476
|
+
if block_given?
|
477
|
+
@parse_tonodes.call(text, constraints).each {|n| yield n }
|
452
478
|
else
|
453
|
-
|
454
|
-
@parse_tonodes.call(text).each {|n| yield n }
|
455
|
-
else
|
456
|
-
@parse_tostr.call(text)
|
457
|
-
end
|
479
|
+
@parse_tostr.call(text, constraints)
|
458
480
|
end
|
459
481
|
end
|
460
482
|
|
461
483
|
# Parses the given string `text`, returning an
|
462
|
-
#
|
484
|
+
# [Enumerator](http://www.ruby-doc.org/core-2.2.1/Enumerator.html) that may be
|
463
485
|
# used to iterate over the resulting {MeCabNode} objects. This is more
|
464
486
|
# efficient than parsing to a simple string, since each node's
|
465
487
|
# information will not be materialized all at once as it is with
|
@@ -469,45 +491,62 @@ module Natto
|
|
469
491
|
# the morpheme. Node-formatting may also be used to customize
|
470
492
|
# the resulting node's `feature` attribute.
|
471
493
|
#
|
472
|
-
# Boundary constraint parsing is available
|
494
|
+
# Boundary constraint parsing is available by passing in the
|
473
495
|
# `boundary_constraints` key in the `constraints` hash. Boundary constraints
|
474
496
|
# parsing provides hints to MeCab on where the morpheme boundaries in the
|
475
497
|
# given `text` are located. `boundary_constraints` value may be either a
|
476
498
|
# `Regexp` or `String`; please see
|
477
|
-
# [String#scan](http://ruby-doc.org/core-2.2.
|
499
|
+
# [String#scan](http://ruby-doc.org/core-2.2.1/String.html#method-i-scan)
|
478
500
|
#
|
501
|
+
# Feature constraint parsing is available by passing in the
|
502
|
+
# `feature_constraints` key in the `constraints` hash. Feature constraints
|
503
|
+
# parsing provides instructions to MeCab to use the feature indicated
|
504
|
+
# for any morpheme that is an exact match for the given key.
|
505
|
+
# `feature_constraints` is a hash mapping a specific morpheme (String)
|
506
|
+
# to a corresponding feature value (String).
|
479
507
|
# @param text [String] the Japanese text to parse
|
480
|
-
# @param
|
508
|
+
# @param constraints [Hash] `boundary_constraints` or `feature_constraints`
|
481
509
|
# @return [Enumerator] of MeCabNode instances
|
482
|
-
# @raise [MeCabError] if the
|
510
|
+
# @raise [MeCabError] if the MeCab Tagger cannot parse the given `text`
|
483
511
|
# @raise [ArgumentError] if the given string `text` argument is `nil`
|
484
512
|
# @see MeCabNode
|
485
513
|
# @see http://ruby-doc.org/core-2.2.1/Enumerator.html
|
486
|
-
def enum_parse(text,
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
514
|
+
def enum_parse(text, constraints={})
|
515
|
+
if text.nil?
|
516
|
+
raise ArgumentError.new 'Text to parse cannot be nil'
|
517
|
+
elsif constraints[:boundary_constraints]
|
518
|
+
if !(constraints[:boundary_constraints].is_a?(Regexp) ||
|
519
|
+
constraints[:boundary_constraints].is_a?(String))
|
520
|
+
raise ArgumentError.new 'boundary constraints must be a Regexp or String'
|
521
|
+
end
|
522
|
+
elsif constraints[:feature_constraints] && !constraints[:feature_constraints].is_a?(Hash)
|
523
|
+
raise ArgumentError.new 'feature constraints must be a Hash'
|
524
|
+
elsif @options[:partial] && !text.end_with?("\n")
|
525
|
+
raise ArgumentError.new 'partial parsing requires new-line char at end of text'
|
492
526
|
end
|
527
|
+
|
528
|
+
@parse_tonodes.call(text, constraints)
|
493
529
|
end
|
494
530
|
|
495
|
-
# Returns human-readable details for the wrapped
|
531
|
+
# Returns human-readable details for the wrapped MeCab library.
|
496
532
|
# Overrides `Object#to_s`.
|
497
533
|
#
|
498
534
|
# - encoded object id
|
499
|
-
# - underlying FFI pointer to the
|
500
|
-
# -
|
535
|
+
# - underlying FFI pointer to the MeCab Model
|
536
|
+
# - underlying FFI pointer to the MeCab Tagger
|
537
|
+
# - underlying FFI pointer to the MeCab Lattice
|
538
|
+
# - real file path to MeCab library
|
501
539
|
# - options hash
|
502
540
|
# - list of dictionaries
|
503
541
|
# - MeCab version
|
504
|
-
#
|
505
542
|
# @return [String] encoded object id, underlying FFI pointer,
|
506
|
-
# file path to
|
543
|
+
# file path to MeCab library, options hash,
|
507
544
|
# list of dictionaries and MeCab version
|
508
545
|
def to_s
|
509
546
|
[ super.chop,
|
547
|
+
"@model=#{@model},",
|
510
548
|
"@tagger=#{@tagger},",
|
549
|
+
"@lattice=#{@lattice},",
|
511
550
|
"@libpath=\"#{@libpath}\",",
|
512
551
|
"@options=#{@options.inspect},",
|
513
552
|
"@dicts=#{@dicts.to_s},",
|
@@ -515,7 +554,6 @@ module Natto
|
|
515
554
|
end
|
516
555
|
|
517
556
|
# Overrides `Object#inspect`.
|
518
|
-
#
|
519
557
|
# @return [String] encoded object id, FFI pointer, options hash,
|
520
558
|
# list of dictionaries, and MeCab version
|
521
559
|
# @see #to_s
|
@@ -524,26 +562,27 @@ module Natto
|
|
524
562
|
end
|
525
563
|
|
526
564
|
# Returns a `Proc` that will properly free resources
|
527
|
-
# when this
|
528
|
-
#
|
529
|
-
#
|
530
|
-
#
|
531
|
-
#
|
532
|
-
|
533
|
-
# @return [Proc] to release `mecab` resources properly
|
534
|
-
def self.create_free_proc(tptr)
|
565
|
+
# when this instance is garbage collected.
|
566
|
+
# @param mptr [FFI::Pointer] pointer to Model
|
567
|
+
# @param tptr [FFI::Pointer] pointer to Tagger
|
568
|
+
# @param lptr [FFI::Pointer] pointer to Lattice
|
569
|
+
# @return [Proc] to release MeCab resources properly
|
570
|
+
def self.create_free_proc(mptr, tptr, lptr)
|
535
571
|
Proc.new do
|
572
|
+
self.mecab_lattice_destroy(lptr)
|
536
573
|
self.mecab_destroy(tptr)
|
574
|
+
self.mecab_model_destory(mptr)
|
537
575
|
end
|
538
576
|
end
|
539
577
|
|
540
578
|
private
|
541
579
|
|
542
580
|
# @private
|
543
|
-
|
581
|
+
# MeCab eats all leading and trailing whitespace characters
|
582
|
+
def tokenize_by_pattern(text, pattern)
|
544
583
|
matches = text.scan(pattern)
|
545
584
|
|
546
|
-
acc =[]
|
585
|
+
acc = []
|
547
586
|
tmp = text
|
548
587
|
matches.each_with_index do |m,i|
|
549
588
|
bef, mat, aft = tmp.partition(m)
|
@@ -553,17 +592,34 @@ module Natto
|
|
553
592
|
unless mat.empty?
|
554
593
|
acc << [mat.strip, true]
|
555
594
|
end
|
556
|
-
if i==matches.size-1
|
595
|
+
if i==matches.size-1 && !aft.empty?
|
557
596
|
acc << [aft.strip, false]
|
558
597
|
end
|
559
598
|
tmp = aft
|
560
599
|
end
|
561
600
|
acc
|
562
601
|
end
|
602
|
+
|
603
|
+
def tokenize_by_features(text, features)
|
604
|
+
acc = []
|
605
|
+
acc << [text.strip, false]
|
606
|
+
|
607
|
+
features.each do |feature|
|
608
|
+
acc.each_with_index do |e,i|
|
609
|
+
if !e.last
|
610
|
+
tmp = tokenize_by_pattern(e.first, feature)
|
611
|
+
if !tmp.empty?
|
612
|
+
acc.delete_at(i)
|
613
|
+
acc.insert(i, *tmp)
|
614
|
+
end
|
615
|
+
end
|
616
|
+
end
|
617
|
+
end
|
618
|
+
acc
|
619
|
+
end
|
563
620
|
end
|
564
621
|
|
565
|
-
# `MeCabError` is a general error class
|
566
|
-
# for the `Natto` module.
|
622
|
+
# `MeCabError` is a general error class for the `Natto` module.
|
567
623
|
class MeCabError < RuntimeError; end
|
568
624
|
end
|
569
625
|
|