natto 0.9.5 → 0.9.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG +11 -0
- data/README.md +17 -22
- data/lib/natto.rb +1 -599
- data/lib/natto/binding.rb +36 -17
- data/lib/natto/natto.rb +295 -0
- data/lib/natto/option_parse.rb +36 -28
- data/lib/natto/struct.rb +310 -0
- data/lib/natto/version.rb +16 -16
- metadata +30 -33
- data/lib/natto/utils.rb +0 -16
data/lib/natto/binding.rb
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
module Natto
|
3
3
|
|
4
|
-
# Module
|
5
|
-
# which are made available via
|
6
|
-
#
|
4
|
+
# Module `Binding` encapsulates methods and behavior
|
5
|
+
# which are made available via `FFI` bindings to
|
6
|
+
# `mecab`.
|
7
7
|
module Binding
|
8
8
|
require 'ffi'
|
9
9
|
require 'rbconfig'
|
10
10
|
extend FFI::Library
|
11
11
|
|
12
12
|
# String name for the environment variable used by
|
13
|
-
#
|
14
|
-
# to the
|
13
|
+
# `Natto` to indicate the exact name / full path
|
14
|
+
# to the `mecab` library.
|
15
15
|
MECAB_PATH = 'MECAB_PATH'.freeze
|
16
16
|
|
17
17
|
# @private
|
@@ -19,14 +19,14 @@ module Natto
|
|
19
19
|
base.extend(ClassMethods)
|
20
20
|
end
|
21
21
|
|
22
|
-
# Returns the name of the
|
22
|
+
# Returns the name of the `mecab` library based on
|
23
23
|
# the runtime environment. The value of the environment
|
24
|
-
# parameter
|
24
|
+
# parameter `MECAB_PATH` is checked before this
|
25
25
|
# function is invoked, and in the case of Windows, a
|
26
|
-
#
|
27
|
-
# is
|
26
|
+
# `LoadError` will be raised if `MECAB_PATH`
|
27
|
+
# is _not_ set to the full path of the `mecab`
|
28
28
|
# library.
|
29
|
-
# @return name of the
|
29
|
+
# @return name of the `mecab` library
|
30
30
|
# @raise [LoadError] if MECAB_PATH environment variable is not set in Windows
|
31
31
|
# <br/>
|
32
32
|
# e.g., for bash on UNIX/Linux
|
@@ -37,20 +37,14 @@ module Natto
|
|
37
37
|
#
|
38
38
|
# set MECAB_PATH=C:\Program Files\MeCab\bin\libmecab.dll
|
39
39
|
#
|
40
|
-
# e.g., for Cygwin
|
41
|
-
#
|
42
|
-
# export MECAB_PATH=cygmecab-1
|
43
|
-
#
|
44
40
|
# e.g., from within a Ruby program
|
45
41
|
#
|
46
|
-
# ENV['MECAB_PATH']
|
42
|
+
# ENV['MECAB_PATH']='/usr/local/lib/libmecab.so'
|
47
43
|
def self.find_library
|
48
44
|
host_os = RbConfig::CONFIG['host_os']
|
49
45
|
|
50
46
|
if host_os =~ /mswin|mingw/i
|
51
47
|
raise LoadError, "Please set #{MECAB_PATH} to the full path to libmecab.dll"
|
52
|
-
elsif host_os =~ /cygwin/i
|
53
|
-
'cygmecab-1'
|
54
48
|
else
|
55
49
|
'mecab'
|
56
50
|
end
|
@@ -58,10 +52,17 @@ module Natto
|
|
58
52
|
|
59
53
|
ffi_lib(ENV[MECAB_PATH] || find_library)
|
60
54
|
|
55
|
+
# new interface
|
56
|
+
attach_function :mecab_model_new2, [:string], :pointer
|
57
|
+
attach_function :mecab_model_destroy, [:pointer], :void
|
58
|
+
attach_function :mecab_model_dictionary_info, [:pointer], :pointer
|
59
|
+
|
60
|
+
# old interface
|
61
61
|
attach_function :mecab_new2, [:string], :pointer
|
62
62
|
attach_function :mecab_version, [], :string
|
63
63
|
attach_function :mecab_strerror, [:pointer],:string
|
64
64
|
attach_function :mecab_destroy, [:pointer], :void
|
65
|
+
attach_function :mecab_set_partial, [:pointer, :int], :void
|
65
66
|
attach_function :mecab_set_theta, [:pointer, :float], :void
|
66
67
|
attach_function :mecab_set_lattice_level, [:pointer, :int], :void
|
67
68
|
attach_function :mecab_set_all_morphs, [:pointer, :int], :void
|
@@ -75,6 +76,20 @@ module Natto
|
|
75
76
|
|
76
77
|
# @private
|
77
78
|
module ClassMethods
|
79
|
+
|
80
|
+
def mecab_model_new2(options_str)
|
81
|
+
Natto::Binding.mecab_model_new2(options_str)
|
82
|
+
end
|
83
|
+
|
84
|
+
def mecab_model_destroy(m_ptr)
|
85
|
+
Natto::Binding.mecab_model_destroy(m_ptr)
|
86
|
+
end
|
87
|
+
|
88
|
+
def mecab_model_dictionary_info(m_ptr)
|
89
|
+
Natto::Binding.mecab_model_dictionary_info(m_ptr)
|
90
|
+
end
|
91
|
+
|
92
|
+
# ----------------------------------------
|
78
93
|
def mecab_new2(options_str)
|
79
94
|
Natto::Binding.mecab_new2(options_str)
|
80
95
|
end
|
@@ -91,6 +106,10 @@ module Natto
|
|
91
106
|
Natto::Binding.mecab_destroy(m_ptr)
|
92
107
|
end
|
93
108
|
|
109
|
+
def mecab_set_partial(m_ptr, ll)
|
110
|
+
Natto::Binding.mecab_set_partial(m_ptr, ll)
|
111
|
+
end
|
112
|
+
|
94
113
|
def mecab_set_theta(m_ptr, t)
|
95
114
|
Natto::Binding.mecab_set_theta(m_ptr, t)
|
96
115
|
end
|
data/lib/natto/natto.rb
ADDED
@@ -0,0 +1,295 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'natto/binding'
|
3
|
+
require 'natto/option_parse'
|
4
|
+
require 'natto/struct'
|
5
|
+
|
6
|
+
module Natto
|
7
|
+
# `MeCab` is a wrapper class for the `mecab` tagger.
|
8
|
+
# Options to the `mecab` tagger are passed in as a string
|
9
|
+
# (MeCab command-line style) or as a Ruby-style hash at
|
10
|
+
# initialization.
|
11
|
+
#
|
12
|
+
# ## Usage
|
13
|
+
#
|
14
|
+
# require 'rubygems' if RUBY_VERSION.to_f < 1.9
|
15
|
+
# require 'natto'
|
16
|
+
#
|
17
|
+
# nm = Natto::MeCab.new('-Ochasen')
|
18
|
+
# => #<Natto::MeCab:0x28d3bdc8 \
|
19
|
+
# @tagger=#<FFI::Pointer address=0x28afb980>, \
|
20
|
+
# @options={:output_format_type=>"chasen"}, \
|
21
|
+
# @dicts=[#<Natto::DictionaryInfo:0x289a1f14 \
|
22
|
+
# type="0", \
|
23
|
+
# filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
24
|
+
# charset="utf8">], \
|
25
|
+
# @version="0.996">
|
26
|
+
#
|
27
|
+
# nm.parse('凡人にしか見えねえ風景ってのがあるんだよ。') do |n|
|
28
|
+
# puts "#{n.surface}\t#{n.feature}"
|
29
|
+
# end
|
30
|
+
# 凡人 名詞,一般,*,*,*,*,凡人,ボンジン,ボンジン
|
31
|
+
# に 助詞,格助詞,一般,*,*,*,に,ニ,ニ
|
32
|
+
# しか 助詞,係助詞,*,*,*,*,しか,シカ,シカ
|
33
|
+
# 見え 動詞,自立,*,*,一段,未然形,見える,ミエ,ミエ
|
34
|
+
# ねえ 助動詞,*,*,*,特殊・ナイ,音便基本形,ない,ネエ,ネー
|
35
|
+
# 風景 名詞,一般,*,*,*,*,風景,フウケイ,フーケイ
|
36
|
+
# って 助詞,格助詞,連語,*,*,*,って,ッテ,ッテ
|
37
|
+
# の 名詞,非自立,一般,*,*,*,の,ノ,ノ
|
38
|
+
# が 助詞,格助詞,一般,*,*,*,が,ガ,ガ
|
39
|
+
# ある 動詞,自立,*,*,五段・ラ行,基本形,ある,アル,アル
|
40
|
+
# ん 名詞,非自立,一般,*,*,*,ん,ン,ン
|
41
|
+
# だ 助動詞,*,*,*,特殊・ダ,基本形,だ,ダ,ダ
|
42
|
+
# よ 助詞,終助詞,*,*,*,*,よ,ヨ,ヨ
|
43
|
+
# 。 記号,句点,*,*,*,*,。,。,。
|
44
|
+
# BOS/EOS,*,*,*,*,*,*,*,*BOS
|
45
|
+
#
|
46
|
+
class MeCab
|
47
|
+
include Natto::Binding
|
48
|
+
include Natto::OptionParse
|
49
|
+
|
50
|
+
attr_reader :tagger, :options, :dicts, :version
|
51
|
+
|
52
|
+
# Initializes the wrapped `mecab` instance with the
|
53
|
+
# given `options`.
|
54
|
+
#
|
55
|
+
# Options supported are:
|
56
|
+
#
|
57
|
+
# - :rcfile -- resource file
|
58
|
+
# - :dicdir -- system dicdir
|
59
|
+
# - :userdic -- user dictionary
|
60
|
+
# - :lattice_level -- lattice information level (DEPRECATED)
|
61
|
+
# - :output_format_type -- output format type (wakati, chasen, yomi, etc.)
|
62
|
+
# - :all_morphs -- output all morphs (default false)
|
63
|
+
# - :nbest -- output N best results (integer, default 1), requires lattice level >= 1
|
64
|
+
# - :partial -- partial parsing mode
|
65
|
+
# - :marginal -- output marginal probability
|
66
|
+
# - :max_grouping_size -- maximum grouping size for unknown words (default 24)
|
67
|
+
# - :node_format -- user-defined node format
|
68
|
+
# - :unk_format -- user-defined unknown node format
|
69
|
+
# - :bos_format -- user-defined beginning-of-sentence format
|
70
|
+
# - :eos_format -- user-defined end-of-sentence format
|
71
|
+
# - :eon_format -- user-defined end-of-NBest format
|
72
|
+
# - :unk_feature -- feature for unknown word
|
73
|
+
# - :input_buffer_size -- set input buffer size (default 8192)
|
74
|
+
# - :allocate_sentence -- allocate new memory for input sentence
|
75
|
+
# - :theta -- temperature parameter theta (float, default 0.75)
|
76
|
+
# - :cost_factor -- cost factor (integer, default 700)
|
77
|
+
#
|
78
|
+
# <p>MeCab command-line arguments, in short (-F) or long (--node-format) form, may be used in
|
79
|
+
# addition to Ruby-style `Hash`es</p>
|
80
|
+
# <i>Use single-quotes to preserve format options that contain escape chars.</i><br/>
|
81
|
+
# e.g.<br/>
|
82
|
+
#
|
83
|
+
# nm = Natto::MeCab.new(:node_format=>'%m¥t%f[7]¥n')
|
84
|
+
# => #<Natto::MeCab:0x28d2ae10 \
|
85
|
+
# @tagger=#<FFI::Pointer address=0x28a97980>, \
|
86
|
+
# @options={:node_format=>"%m¥t%f[7]¥n"}, \
|
87
|
+
# @dicts=[#<Natto::DictionaryInfo:0x28d2a85c \
|
88
|
+
# type="0", \
|
89
|
+
# filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
90
|
+
# charset="utf8">], \
|
91
|
+
# @version="0.996">
|
92
|
+
#
|
93
|
+
# puts nm.parse('才能とは求める人間に与えられるものではない。')
|
94
|
+
# 才能 サイノウ
|
95
|
+
# と ト
|
96
|
+
# は ハ
|
97
|
+
# 求める モトメル
|
98
|
+
# 人間 ニンゲン
|
99
|
+
# に ニ
|
100
|
+
# 与え アタエ
|
101
|
+
# られる ラレル
|
102
|
+
# もの モノ
|
103
|
+
# で デ
|
104
|
+
# は ハ
|
105
|
+
# ない ナイ
|
106
|
+
# 。 。
|
107
|
+
# EOS
|
108
|
+
#
|
109
|
+
# @param [Hash, String] options the `mecab` options, as a Ruby-style hash or a MeCab command-line style string
|
110
|
+
# @raise [MeCabError] if `mecab` cannot be initialized with the given `options`
|
111
|
+
def initialize(options={})
|
112
|
+
@options = self.class.parse_mecab_options(options)
|
113
|
+
@dicts = []
|
114
|
+
|
115
|
+
opt_str = self.class.build_options_str(@options)
|
116
|
+
@tagger = self.mecab_new2(opt_str)
|
117
|
+
raise MeCabError.new("Could not initialize MeCab with options: '#{opt_str}'") if @tagger.address == 0x0
|
118
|
+
|
119
|
+
self.mecab_set_theta(@tagger, @options[:theta]) if @options[:theta]
|
120
|
+
self.mecab_set_lattice_level(@tagger, @options[:lattice_level]) if @options[:lattice_level]
|
121
|
+
self.mecab_set_all_morphs(@tagger, 1) if @options[:all_morphs]
|
122
|
+
self.mecab_set_partial(@tagger, 1) if @options[:partial]
|
123
|
+
|
124
|
+
# Set mecab parsing implementations for N-best and regular parsing,
|
125
|
+
# for both parsing as string and yielding a node object
|
126
|
+
# N-Best parsing implementations
|
127
|
+
if @options[:nbest] && @options[:nbest] > 1
|
128
|
+
self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
|
129
|
+
@parse_tostr = lambda do |str|
|
130
|
+
return self.mecab_nbest_sparse_tostr(@tagger, @options[:nbest], str) ||
|
131
|
+
raise(MeCabError.new(self.mecab_strerror(@tagger)))
|
132
|
+
end
|
133
|
+
@parse_tonodes = lambda do |str|
|
134
|
+
nodes = []
|
135
|
+
if @options[:nbest] && @options[:nbest] > 1
|
136
|
+
self.mecab_nbest_init(@tagger, str)
|
137
|
+
n = self.mecab_nbest_next_tonode(@tagger)
|
138
|
+
raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
|
139
|
+
nlen = @options[:nbest]
|
140
|
+
nlen.times do |i|
|
141
|
+
s = str.bytes.to_a
|
142
|
+
while n && n.address != 0x0
|
143
|
+
mn = Natto::MeCabNode.new(n)
|
144
|
+
s = s.drop_while {|e| (e==0xa || e==0x20)}
|
145
|
+
if !s.empty?
|
146
|
+
sarr = []
|
147
|
+
mn.length.times { sarr << s.shift }
|
148
|
+
surf = sarr.pack('C*')
|
149
|
+
#mn.surface = self.class.force_enc(surf)
|
150
|
+
mn.surface = surf.force_encoding(Encoding.default_external)
|
151
|
+
end
|
152
|
+
if @options[:output_format_type] || @options[:node_format]
|
153
|
+
mn.feature = self.mecab_format_node(@tagger, n).force_encoding(Encoding.default_external)
|
154
|
+
end
|
155
|
+
nodes << mn if !mn.is_bos?
|
156
|
+
n = mn.next
|
157
|
+
end
|
158
|
+
n = self.mecab_nbest_next_tonode(@tagger)
|
159
|
+
end
|
160
|
+
end
|
161
|
+
return nodes
|
162
|
+
end
|
163
|
+
else
|
164
|
+
# default parsing implementations
|
165
|
+
@parse_tostr = lambda do |str|
|
166
|
+
return self.mecab_sparse_tostr(@tagger, str) ||
|
167
|
+
raise(MeCabError.new(self.mecab_strerror(@tagger)))
|
168
|
+
end
|
169
|
+
@parse_tonodes = lambda do |str|
|
170
|
+
nodes = []
|
171
|
+
n = self.mecab_sparse_tonode(@tagger, str)
|
172
|
+
raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
|
173
|
+
mn = Natto::MeCabNode.new(n)
|
174
|
+
n = mn.next if mn.next.address!=0x0
|
175
|
+
s = str.bytes.to_a
|
176
|
+
while n && n.address!=0x0
|
177
|
+
mn = Natto::MeCabNode.new(n)
|
178
|
+
s = s.drop_while {|e| (e==0xa || e==0x20)}
|
179
|
+
if !s.empty?
|
180
|
+
sarr = []
|
181
|
+
mn.length.times { sarr << s.shift }
|
182
|
+
surf = sarr.pack('C*')
|
183
|
+
mn.surface = surf.force_encoding(Encoding.default_external)
|
184
|
+
end
|
185
|
+
nodes << mn
|
186
|
+
n = mn.next
|
187
|
+
end
|
188
|
+
return nodes
|
189
|
+
end
|
190
|
+
end
|
191
|
+
|
192
|
+
@dicts << Natto::DictionaryInfo.new(Natto::Binding.mecab_dictionary_info(@tagger))
|
193
|
+
while @dicts.last.next.address != 0x0
|
194
|
+
@dicts << Natto::DictionaryInfo.new(@dicts.last.next)
|
195
|
+
end
|
196
|
+
|
197
|
+
@version = self.mecab_version
|
198
|
+
|
199
|
+
ObjectSpace.define_finalizer(self, self.class.create_free_proc(@tagger))
|
200
|
+
end
|
201
|
+
|
202
|
+
# Parses the given string `str`. If a block is passed to this method,
|
203
|
+
# then node parsing will be used and each node yielded to the given block.
|
204
|
+
#
|
205
|
+
# @param [String] str
|
206
|
+
# @return parsing result from `mecab`
|
207
|
+
# @raise [MeCabError] if the `mecab` tagger cannot parse the given string `str`
|
208
|
+
# @raise [ArgumentError] if the given string `str` argument is `nil`
|
209
|
+
# @see MeCabNode
|
210
|
+
def parse(str)
|
211
|
+
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
212
|
+
if block_given?
|
213
|
+
nodes = @parse_tonodes.call(str)
|
214
|
+
nodes.each {|n| yield n }
|
215
|
+
else
|
216
|
+
@parse_tostr.call(str).force_encoding(Encoding.default_external)
|
217
|
+
end
|
218
|
+
end
|
219
|
+
|
220
|
+
# Parses the given string `str`, and returns
|
221
|
+
# a list of `mecab` nodes.
|
222
|
+
# @param [String] str
|
223
|
+
# @return [Array] of parsed `mecab` nodes.
|
224
|
+
# @raise [MeCabError] if the `mecab` tagger cannot parse the given string `str`
|
225
|
+
# @raise [ArgumentError] if the given string `str` argument is `nil`
|
226
|
+
# @see MeCabNode
|
227
|
+
def parse_as_nodes(str)
|
228
|
+
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
229
|
+
@parse_tonodes.call(str)
|
230
|
+
end
|
231
|
+
|
232
|
+
# Parses the given string `str`, and returns
|
233
|
+
# a list of `mecab` result strings.
|
234
|
+
# @param [String] str
|
235
|
+
# @return [Array] of parsed `mecab` result strings.
|
236
|
+
# @raise [MeCabError] if the `mecab` tagger cannot parse the given string `str`
|
237
|
+
# @raise [ArgumentError] if the given string `str` argument is `nil`
|
238
|
+
def parse_as_strings(str)
|
239
|
+
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
240
|
+
@parse_tostr.call(str).force_encoding(Encoding.default_external).lines.to_a
|
241
|
+
end
|
242
|
+
|
243
|
+
# DEPRECATED: use parse_as_nodes instead.
|
244
|
+
def readnodes(str)
|
245
|
+
$stdout.puts 'DEPRECATED: use parse_as_nodes instead'
|
246
|
+
parse_as_nodes(str)
|
247
|
+
end
|
248
|
+
|
249
|
+
# DEPRECATED: use parse_as_strings instead.
|
250
|
+
def readlines(str)
|
251
|
+
$stdout.puts 'DEPRECATED: use parse_as_strings instead'
|
252
|
+
parse_as_strings(str)
|
253
|
+
end
|
254
|
+
|
255
|
+
# Returns human-readable details for the wrapped `mecab` tagger.
|
256
|
+
# Overrides `Object#to_s`.
|
257
|
+
#
|
258
|
+
# - encoded object id
|
259
|
+
# - underlying FFI pointer to the `mecab` tagger
|
260
|
+
# - options hash
|
261
|
+
# - list of dictionaries
|
262
|
+
# - MeCab version
|
263
|
+
#
|
264
|
+
# @return [String] encoded object id, underlying FFI pointer, options hash, list of dictionaries, and MeCab version
|
265
|
+
def to_s
|
266
|
+
%(#{super.chop} @tagger=#{@tagger}, @options=#{@options.inspect}, @dicts=#{@dicts.to_s}, @version="#{@version.to_s}">)
|
267
|
+
end
|
268
|
+
|
269
|
+
# Overrides `Object#inspect`.
|
270
|
+
#
|
271
|
+
# @return [String] encoded object id, FFI pointer, options hash, list of dictionaries, and MeCab version
|
272
|
+
# @see #to_s
|
273
|
+
def inspect
|
274
|
+
self.to_s
|
275
|
+
end
|
276
|
+
|
277
|
+
# Returns a `Proc` that will properly free resources
|
278
|
+
# when this `MeCab` instance is garbage collected.
|
279
|
+
# The `Proc` returned is registered to be invoked
|
280
|
+
# after the `MeCab` instance owning `ptr`
|
281
|
+
# has been destroyed.
|
282
|
+
#
|
283
|
+
# @param [FFI::Pointer] ptr
|
284
|
+
# @return [Proc] to release `mecab` resources properly
|
285
|
+
def self.create_free_proc(ptr)
|
286
|
+
Proc.new do
|
287
|
+
self.mecab_destroy(ptr)
|
288
|
+
end
|
289
|
+
end
|
290
|
+
end
|
291
|
+
|
292
|
+
# `MeCabError` is a general error class
|
293
|
+
# for the `Natto` module.
|
294
|
+
class MeCabError < RuntimeError; end
|
295
|
+
end
|
data/lib/natto/option_parse.rb
CHANGED
@@ -1,13 +1,16 @@
|
|
1
1
|
module Natto
|
2
2
|
|
3
|
-
# Module
|
4
|
-
# for parsing the various
|
5
|
-
#
|
3
|
+
# Module `OptionParse` encapsulates methods and behavior
|
4
|
+
# for parsing the various `mecab` options supported by
|
5
|
+
# `Natto`.
|
6
6
|
module OptionParse
|
7
7
|
require 'optparse'
|
8
8
|
|
9
|
-
|
10
|
-
|
9
|
+
WARNING_LATTICE_LEVEL =
|
10
|
+
":lattice-level is DEPRECATED, please use :marginal or :nbest\n".freeze
|
11
|
+
|
12
|
+
# Mapping of mecab short-style configuration options to the `mecab`
|
13
|
+
# tagger. See the `mecab` help for more details.
|
11
14
|
SUPPORTED_OPTS = { '-r' => :rcfile,
|
12
15
|
'-d' => :dicdir,
|
13
16
|
'-u' => :userdic,
|
@@ -15,6 +18,9 @@ module Natto
|
|
15
18
|
'-O' => :output_format_type,
|
16
19
|
'-a' => :all_morphs,
|
17
20
|
'-N' => :nbest,
|
21
|
+
'-p' => :partial,
|
22
|
+
'-m' => :marginal,
|
23
|
+
'-M' => :max_grouping_size,
|
18
24
|
'-F' => :node_format,
|
19
25
|
'-U' => :unk_format,
|
20
26
|
'-B' => :bos_format,
|
@@ -42,35 +48,36 @@ module Natto
|
|
42
48
|
h = {}
|
43
49
|
if options.is_a? String
|
44
50
|
opts = OptionParser.new do |opts|
|
45
|
-
opts.on('-r', '--rcfile ARG')
|
46
|
-
opts.on('-d', '--dicdir ARG')
|
47
|
-
opts.on('-u', '--userdic ARG')
|
48
|
-
opts.on('-l', '--lattice-level ARG')
|
49
|
-
opts.on('-O', '--output-format-type ARG') { |arg| h[:output_format_type]
|
50
|
-
opts.on('-a', '--all-morphs')
|
51
|
-
opts.on('-N', '--nbest ARG')
|
52
|
-
|
53
|
-
opts.on('-
|
54
|
-
opts.on('-
|
55
|
-
opts.on('-
|
56
|
-
opts.on('-
|
57
|
-
opts.on('-
|
58
|
-
opts.on('-
|
59
|
-
opts.on('-
|
60
|
-
|
61
|
-
opts.on('-
|
62
|
-
opts.on('-
|
63
|
-
opts.on('-
|
51
|
+
opts.on('-r', '--rcfile ARG') { |arg| h[:rcfile] = arg.strip }
|
52
|
+
opts.on('-d', '--dicdir ARG') { |arg| h[:dicdir] = arg.strip }
|
53
|
+
opts.on('-u', '--userdic ARG') { |arg| h[:userdic] = arg.strip }
|
54
|
+
opts.on('-l', '--lattice-level ARG') { |arg| h[:lattice_level] = arg.strip.to_i } # !deprecated in 0.99!!!
|
55
|
+
opts.on('-O', '--output-format-type ARG') { |arg| h[:output_format_type] = arg.strip }
|
56
|
+
opts.on('-a', '--all-morphs') { |arg| h[:all_morphs] = true }
|
57
|
+
opts.on('-N', '--nbest ARG') { |arg| h[:nbest] = arg.strip.to_i }
|
58
|
+
opts.on('-p', '--partial') { |arg| h[:partial] = true }
|
59
|
+
opts.on('-m', '--marginal') { |arg| h[:marginal] = true }
|
60
|
+
opts.on('-M', '--max-grouping-size ARG'){ |arg| h[:max_grouping_size] = arg.strip.to_i }
|
61
|
+
opts.on('-F', '--node-format ARG') { |arg| h[:node_format] = arg.strip }
|
62
|
+
opts.on('-U', '--unk-format ARG') { |arg| h[:unk_format] = arg.strip }
|
63
|
+
opts.on('-B', '--bos-format ARG') { |arg| h[:bos_format] = arg.strip }
|
64
|
+
opts.on('-E', '--eos-format ARG') { |arg| h[:eos_format] = arg.strip }
|
65
|
+
opts.on('-S', '--eon-format ARG') { |arg| h[:eon_format] = arg.strip }
|
66
|
+
opts.on('-x', '--unk-feature ARG') { |arg| h[:unk_feature] = arg.strip }
|
67
|
+
opts.on('-b', '--input-buffer-size ARG'){ |arg| h[:input_buffer_size] = arg.strip.to_i }
|
68
|
+
opts.on('-C', '--allocate-sentence') { |arg| h[:allocate_sentence] = true }
|
69
|
+
opts.on('-t', '--theta ARG') { |arg| h[:theta] = arg.strip.to_f }
|
70
|
+
opts.on('-c', '--cost-factor ARG') { |arg| h[:cost_factor] = arg.strip.to_i }
|
64
71
|
end
|
65
72
|
opts.parse!(options.split)
|
66
73
|
else
|
67
74
|
SUPPORTED_OPTS.values.each do |k|
|
68
75
|
if options.has_key?(k)
|
69
|
-
if [ :all_morphs, :allocate_sentence ].include?(k)
|
76
|
+
if [ :all_morphs, :partial, :marginal, :allocate_sentence ].include?(k)
|
70
77
|
h[k] = true
|
71
78
|
else
|
72
79
|
v = options[k]
|
73
|
-
if [ :lattice_level, :
|
80
|
+
if [ :lattice_level, :nbest, :max_grouping_size, :input_buffer_size, :cost_factor ].include?(k)
|
74
81
|
h[k] = v.to_i
|
75
82
|
elsif k == :theta
|
76
83
|
h[k] = v.to_f
|
@@ -81,15 +88,16 @@ module Natto
|
|
81
88
|
end
|
82
89
|
end
|
83
90
|
end
|
91
|
+
$stderr.print WARNING_LATTICE_LEVEL if h.include? :lattice_level
|
84
92
|
raise MeCabError.new("Invalid N value") if h[:nbest] && (h[:nbest] < 1 || h[:nbest] > 512)
|
85
93
|
h
|
86
94
|
end
|
87
95
|
|
88
96
|
# Returns a string-representation of the options to
|
89
|
-
# be passed in the construction of the
|
97
|
+
# be passed in the construction of the `mecab` tagger.
|
90
98
|
#
|
91
99
|
# @param [Hash] options
|
92
|
-
# @return [String] representation of the options to the
|
100
|
+
# @return [String] representation of the options to the `mecab` tagger
|
93
101
|
def build_options_str(options={})
|
94
102
|
opt = []
|
95
103
|
SUPPORTED_OPTS.values.each do |k|
|