natto 0.9.5 → 0.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG +11 -0
- data/README.md +17 -22
- data/lib/natto.rb +1 -599
- data/lib/natto/binding.rb +36 -17
- data/lib/natto/natto.rb +295 -0
- data/lib/natto/option_parse.rb +36 -28
- data/lib/natto/struct.rb +310 -0
- data/lib/natto/version.rb +16 -16
- metadata +30 -33
- data/lib/natto/utils.rb +0 -16
data/lib/natto/binding.rb
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
module Natto
|
3
3
|
|
4
|
-
# Module
|
5
|
-
# which are made available via
|
6
|
-
#
|
4
|
+
# Module `Binding` encapsulates methods and behavior
|
5
|
+
# which are made available via `FFI` bindings to
|
6
|
+
# `mecab`.
|
7
7
|
module Binding
|
8
8
|
require 'ffi'
|
9
9
|
require 'rbconfig'
|
10
10
|
extend FFI::Library
|
11
11
|
|
12
12
|
# String name for the environment variable used by
|
13
|
-
#
|
14
|
-
# to the
|
13
|
+
# `Natto` to indicate the exact name / full path
|
14
|
+
# to the `mecab` library.
|
15
15
|
MECAB_PATH = 'MECAB_PATH'.freeze
|
16
16
|
|
17
17
|
# @private
|
@@ -19,14 +19,14 @@ module Natto
|
|
19
19
|
base.extend(ClassMethods)
|
20
20
|
end
|
21
21
|
|
22
|
-
# Returns the name of the
|
22
|
+
# Returns the name of the `mecab` library based on
|
23
23
|
# the runtime environment. The value of the environment
|
24
|
-
# parameter
|
24
|
+
# parameter `MECAB_PATH` is checked before this
|
25
25
|
# function is invoked, and in the case of Windows, a
|
26
|
-
#
|
27
|
-
# is
|
26
|
+
# `LoadError` will be raised if `MECAB_PATH`
|
27
|
+
# is _not_ set to the full path of the `mecab`
|
28
28
|
# library.
|
29
|
-
# @return name of the
|
29
|
+
# @return name of the `mecab` library
|
30
30
|
# @raise [LoadError] if MECAB_PATH environment variable is not set in Windows
|
31
31
|
# <br/>
|
32
32
|
# e.g., for bash on UNIX/Linux
|
@@ -37,20 +37,14 @@ module Natto
|
|
37
37
|
#
|
38
38
|
# set MECAB_PATH=C:\Program Files\MeCab\bin\libmecab.dll
|
39
39
|
#
|
40
|
-
# e.g., for Cygwin
|
41
|
-
#
|
42
|
-
# export MECAB_PATH=cygmecab-1
|
43
|
-
#
|
44
40
|
# e.g., from within a Ruby program
|
45
41
|
#
|
46
|
-
# ENV['MECAB_PATH']
|
42
|
+
# ENV['MECAB_PATH']='/usr/local/lib/libmecab.so'
|
47
43
|
def self.find_library
|
48
44
|
host_os = RbConfig::CONFIG['host_os']
|
49
45
|
|
50
46
|
if host_os =~ /mswin|mingw/i
|
51
47
|
raise LoadError, "Please set #{MECAB_PATH} to the full path to libmecab.dll"
|
52
|
-
elsif host_os =~ /cygwin/i
|
53
|
-
'cygmecab-1'
|
54
48
|
else
|
55
49
|
'mecab'
|
56
50
|
end
|
@@ -58,10 +52,17 @@ module Natto
|
|
58
52
|
|
59
53
|
ffi_lib(ENV[MECAB_PATH] || find_library)
|
60
54
|
|
55
|
+
# new interface
|
56
|
+
attach_function :mecab_model_new2, [:string], :pointer
|
57
|
+
attach_function :mecab_model_destroy, [:pointer], :void
|
58
|
+
attach_function :mecab_model_dictionary_info, [:pointer], :pointer
|
59
|
+
|
60
|
+
# old interface
|
61
61
|
attach_function :mecab_new2, [:string], :pointer
|
62
62
|
attach_function :mecab_version, [], :string
|
63
63
|
attach_function :mecab_strerror, [:pointer],:string
|
64
64
|
attach_function :mecab_destroy, [:pointer], :void
|
65
|
+
attach_function :mecab_set_partial, [:pointer, :int], :void
|
65
66
|
attach_function :mecab_set_theta, [:pointer, :float], :void
|
66
67
|
attach_function :mecab_set_lattice_level, [:pointer, :int], :void
|
67
68
|
attach_function :mecab_set_all_morphs, [:pointer, :int], :void
|
@@ -75,6 +76,20 @@ module Natto
|
|
75
76
|
|
76
77
|
# @private
|
77
78
|
module ClassMethods
|
79
|
+
|
80
|
+
def mecab_model_new2(options_str)
|
81
|
+
Natto::Binding.mecab_model_new2(options_str)
|
82
|
+
end
|
83
|
+
|
84
|
+
def mecab_model_destroy(m_ptr)
|
85
|
+
Natto::Binding.mecab_model_destroy(m_ptr)
|
86
|
+
end
|
87
|
+
|
88
|
+
def mecab_model_dictionary_info(m_ptr)
|
89
|
+
Natto::Binding.mecab_model_dictionary_info(m_ptr)
|
90
|
+
end
|
91
|
+
|
92
|
+
# ----------------------------------------
|
78
93
|
def mecab_new2(options_str)
|
79
94
|
Natto::Binding.mecab_new2(options_str)
|
80
95
|
end
|
@@ -91,6 +106,10 @@ module Natto
|
|
91
106
|
Natto::Binding.mecab_destroy(m_ptr)
|
92
107
|
end
|
93
108
|
|
109
|
+
def mecab_set_partial(m_ptr, ll)
|
110
|
+
Natto::Binding.mecab_set_partial(m_ptr, ll)
|
111
|
+
end
|
112
|
+
|
94
113
|
def mecab_set_theta(m_ptr, t)
|
95
114
|
Natto::Binding.mecab_set_theta(m_ptr, t)
|
96
115
|
end
|
data/lib/natto/natto.rb
ADDED
@@ -0,0 +1,295 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'natto/binding'
|
3
|
+
require 'natto/option_parse'
|
4
|
+
require 'natto/struct'
|
5
|
+
|
6
|
+
module Natto
|
7
|
+
# `MeCab` is a wrapper class for the `mecab` tagger.
|
8
|
+
# Options to the `mecab` tagger are passed in as a string
|
9
|
+
# (MeCab command-line style) or as a Ruby-style hash at
|
10
|
+
# initialization.
|
11
|
+
#
|
12
|
+
# ## Usage
|
13
|
+
#
|
14
|
+
# require 'rubygems' if RUBY_VERSION.to_f < 1.9
|
15
|
+
# require 'natto'
|
16
|
+
#
|
17
|
+
# nm = Natto::MeCab.new('-Ochasen')
|
18
|
+
# => #<Natto::MeCab:0x28d3bdc8 \
|
19
|
+
# @tagger=#<FFI::Pointer address=0x28afb980>, \
|
20
|
+
# @options={:output_format_type=>"chasen"}, \
|
21
|
+
# @dicts=[#<Natto::DictionaryInfo:0x289a1f14 \
|
22
|
+
# type="0", \
|
23
|
+
# filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
24
|
+
# charset="utf8">], \
|
25
|
+
# @version="0.996">
|
26
|
+
#
|
27
|
+
# nm.parse('凡人にしか見えねえ風景ってのがあるんだよ。') do |n|
|
28
|
+
# puts "#{n.surface}\t#{n.feature}"
|
29
|
+
# end
|
30
|
+
# 凡人 名詞,一般,*,*,*,*,凡人,ボンジン,ボンジン
|
31
|
+
# に 助詞,格助詞,一般,*,*,*,に,ニ,ニ
|
32
|
+
# しか 助詞,係助詞,*,*,*,*,しか,シカ,シカ
|
33
|
+
# 見え 動詞,自立,*,*,一段,未然形,見える,ミエ,ミエ
|
34
|
+
# ねえ 助動詞,*,*,*,特殊・ナイ,音便基本形,ない,ネエ,ネー
|
35
|
+
# 風景 名詞,一般,*,*,*,*,風景,フウケイ,フーケイ
|
36
|
+
# って 助詞,格助詞,連語,*,*,*,って,ッテ,ッテ
|
37
|
+
# の 名詞,非自立,一般,*,*,*,の,ノ,ノ
|
38
|
+
# が 助詞,格助詞,一般,*,*,*,が,ガ,ガ
|
39
|
+
# ある 動詞,自立,*,*,五段・ラ行,基本形,ある,アル,アル
|
40
|
+
# ん 名詞,非自立,一般,*,*,*,ん,ン,ン
|
41
|
+
# だ 助動詞,*,*,*,特殊・ダ,基本形,だ,ダ,ダ
|
42
|
+
# よ 助詞,終助詞,*,*,*,*,よ,ヨ,ヨ
|
43
|
+
# 。 記号,句点,*,*,*,*,。,。,。
|
44
|
+
# BOS/EOS,*,*,*,*,*,*,*,*BOS
|
45
|
+
#
|
46
|
+
class MeCab
|
47
|
+
include Natto::Binding
|
48
|
+
include Natto::OptionParse
|
49
|
+
|
50
|
+
attr_reader :tagger, :options, :dicts, :version
|
51
|
+
|
52
|
+
# Initializes the wrapped `mecab` instance with the
|
53
|
+
# given `options`.
|
54
|
+
#
|
55
|
+
# Options supported are:
|
56
|
+
#
|
57
|
+
# - :rcfile -- resource file
|
58
|
+
# - :dicdir -- system dicdir
|
59
|
+
# - :userdic -- user dictionary
|
60
|
+
# - :lattice_level -- lattice information level (DEPRECATED)
|
61
|
+
# - :output_format_type -- output format type (wakati, chasen, yomi, etc.)
|
62
|
+
# - :all_morphs -- output all morphs (default false)
|
63
|
+
# - :nbest -- output N best results (integer, default 1), requires lattice level >= 1
|
64
|
+
# - :partial -- partial parsing mode
|
65
|
+
# - :marginal -- output marginal probability
|
66
|
+
# - :max_grouping_size -- maximum grouping size for unknown words (default 24)
|
67
|
+
# - :node_format -- user-defined node format
|
68
|
+
# - :unk_format -- user-defined unknown node format
|
69
|
+
# - :bos_format -- user-defined beginning-of-sentence format
|
70
|
+
# - :eos_format -- user-defined end-of-sentence format
|
71
|
+
# - :eon_format -- user-defined end-of-NBest format
|
72
|
+
# - :unk_feature -- feature for unknown word
|
73
|
+
# - :input_buffer_size -- set input buffer size (default 8192)
|
74
|
+
# - :allocate_sentence -- allocate new memory for input sentence
|
75
|
+
# - :theta -- temperature parameter theta (float, default 0.75)
|
76
|
+
# - :cost_factor -- cost factor (integer, default 700)
|
77
|
+
#
|
78
|
+
# <p>MeCab command-line arguments, short (-F) or long (--node-format), may be used in
|
79
|
+
# addition to Ruby-style `Hash`es</p>
|
80
|
+
# <i>Use single-quotes to preserve format options that contain escape chars.</i><br/>
|
81
|
+
# e.g.<br/>
|
82
|
+
#
|
83
|
+
# nm = Natto::MeCab.new(:node_format=>'%m¥t%f[7]¥n')
|
84
|
+
# => #<Natto::MeCab:0x28d2ae10
|
85
|
+
# @tagger=#<FFI::Pointer address=0x28a97980>, \
|
86
|
+
# @options={:node_format=>"%m¥t%f[7]¥n"}, \
|
87
|
+
# @dicts=[#<Natto::DictionaryInfo:0x28d2a85c \
|
88
|
+
# type="0", \
|
89
|
+
# filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
90
|
+
# charset="utf8">], \
|
91
|
+
# @version="0.996">
|
92
|
+
#
|
93
|
+
# puts nm.parse('才能とは求める人間に与えられるものではない。')
|
94
|
+
# 才能 サイノウ
|
95
|
+
# と ト
|
96
|
+
# は ハ
|
97
|
+
# 求める モトメル
|
98
|
+
# 人間 ニンゲン
|
99
|
+
# に ニ
|
100
|
+
# 与え アタエ
|
101
|
+
# られる ラレル
|
102
|
+
# もの モノ
|
103
|
+
# で デ
|
104
|
+
# は ハ
|
105
|
+
# ない ナイ
|
106
|
+
# 。 。
|
107
|
+
# EOS
|
108
|
+
#
|
109
|
+
# @param [Hash, String] options
|
110
|
+
# @raise [MeCabError] if `mecab` cannot be initialized with the given `options`
|
111
|
+
def initialize(options={})
|
112
|
+
@options = self.class.parse_mecab_options(options)
|
113
|
+
@dicts = []
|
114
|
+
|
115
|
+
opt_str = self.class.build_options_str(@options)
|
116
|
+
@tagger = self.mecab_new2(opt_str)
|
117
|
+
raise MeCabError.new("Could not initialize MeCab with options: '#{opt_str}'") if @tagger.address == 0x0
|
118
|
+
|
119
|
+
self.mecab_set_theta(@tagger, @options[:theta]) if @options[:theta]
|
120
|
+
self.mecab_set_lattice_level(@tagger, @options[:lattice_level]) if @options[:lattice_level]
|
121
|
+
self.mecab_set_all_morphs(@tagger, 1) if @options[:all_morphs]
|
122
|
+
self.mecab_set_partial(@tagger, 1) if @options[:partial]
|
123
|
+
|
124
|
+
# Set mecab parsing implementations for N-best and regular parsing,
|
125
|
+
# for both parsing as string and yielding a node object
|
126
|
+
# N-Best parsing implementations
|
127
|
+
if @options[:nbest] && @options[:nbest] > 1
|
128
|
+
self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
|
129
|
+
@parse_tostr = lambda do |str|
|
130
|
+
return self.mecab_nbest_sparse_tostr(@tagger, @options[:nbest], str) ||
|
131
|
+
raise(MeCabError.new(self.mecab_strerror(@tagger)))
|
132
|
+
end
|
133
|
+
@parse_tonodes = lambda do |str|
|
134
|
+
nodes = []
|
135
|
+
if @options[:nbest] && @options[:nbest] > 1
|
136
|
+
self.mecab_nbest_init(@tagger, str)
|
137
|
+
n = self.mecab_nbest_next_tonode(@tagger)
|
138
|
+
raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
|
139
|
+
nlen = @options[:nbest]
|
140
|
+
nlen.times do |i|
|
141
|
+
s = str.bytes.to_a
|
142
|
+
while n && n.address != 0x0
|
143
|
+
mn = Natto::MeCabNode.new(n)
|
144
|
+
s = s.drop_while {|e| (e==0xa || e==0x20)}
|
145
|
+
if !s.empty?
|
146
|
+
sarr = []
|
147
|
+
mn.length.times { sarr << s.shift }
|
148
|
+
surf = sarr.pack('C*')
|
149
|
+
#mn.surface = self.class.force_enc(surf)
|
150
|
+
mn.surface = surf.force_encoding(Encoding.default_external)
|
151
|
+
end
|
152
|
+
if @options[:output_format_type] || @options[:node_format]
|
153
|
+
mn.feature = self.mecab_format_node(@tagger, n).force_encoding(Encoding.default_external)
|
154
|
+
end
|
155
|
+
nodes << mn if !mn.is_bos?
|
156
|
+
n = mn.next
|
157
|
+
end
|
158
|
+
n = self.mecab_nbest_next_tonode(@tagger)
|
159
|
+
end
|
160
|
+
end
|
161
|
+
return nodes
|
162
|
+
end
|
163
|
+
else
|
164
|
+
# default parsing implementations
|
165
|
+
@parse_tostr = lambda do |str|
|
166
|
+
return self.mecab_sparse_tostr(@tagger, str) ||
|
167
|
+
raise(MeCabError.new(self.mecab_strerror(@tagger)))
|
168
|
+
end
|
169
|
+
@parse_tonodes = lambda do |str|
|
170
|
+
nodes = []
|
171
|
+
n = self.mecab_sparse_tonode(@tagger, str)
|
172
|
+
raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
|
173
|
+
mn = Natto::MeCabNode.new(n)
|
174
|
+
n = mn.next if mn.next.address!=0x0
|
175
|
+
s = str.bytes.to_a
|
176
|
+
while n && n.address!=0x0
|
177
|
+
mn = Natto::MeCabNode.new(n)
|
178
|
+
s = s.drop_while {|e| (e==0xa || e==0x20)}
|
179
|
+
if !s.empty?
|
180
|
+
sarr = []
|
181
|
+
mn.length.times { sarr << s.shift }
|
182
|
+
surf = sarr.pack('C*')
|
183
|
+
mn.surface = surf.force_encoding(Encoding.default_external)
|
184
|
+
end
|
185
|
+
nodes << mn
|
186
|
+
n = mn.next
|
187
|
+
end
|
188
|
+
return nodes
|
189
|
+
end
|
190
|
+
end
|
191
|
+
|
192
|
+
@dicts << Natto::DictionaryInfo.new(Natto::Binding.mecab_dictionary_info(@tagger))
|
193
|
+
while @dicts.last.next.address != 0x0
|
194
|
+
@dicts << Natto::DictionaryInfo.new(@dicts.last.next)
|
195
|
+
end
|
196
|
+
|
197
|
+
@version = self.mecab_version
|
198
|
+
|
199
|
+
ObjectSpace.define_finalizer(self, self.class.create_free_proc(@tagger))
|
200
|
+
end
|
201
|
+
|
202
|
+
# Parses the given string `str`. If a block is passed to this method,
|
203
|
+
# then node parsing will be used and each node yielded to the given block.
|
204
|
+
#
|
205
|
+
# @param [String] str
|
206
|
+
# @return parsing result from `mecab`
|
207
|
+
# @raise [MeCabError] if the `mecab` tagger cannot parse the given string `str`
|
208
|
+
# @raise [ArgumentError] if the given string `str` argument is `nil`
|
209
|
+
# @see MeCabNode
|
210
|
+
def parse(str)
|
211
|
+
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
212
|
+
if block_given?
|
213
|
+
nodes = @parse_tonodes.call(str)
|
214
|
+
nodes.each {|n| yield n }
|
215
|
+
else
|
216
|
+
@parse_tostr.call(str).force_encoding(Encoding.default_external)
|
217
|
+
end
|
218
|
+
end
|
219
|
+
|
220
|
+
# Parses the given string `str`, and returns
|
221
|
+
# a list of `mecab` nodes.
|
222
|
+
# @param [String] str
|
223
|
+
# @return [Array] of parsed `mecab` nodes.
|
224
|
+
# @raise [MeCabError] if the `mecab` tagger cannot parse the given string `str`
|
225
|
+
# @raise [ArgumentError] if the given string `str` argument is `nil`
|
226
|
+
# @see MeCabNode
|
227
|
+
def parse_as_nodes(str)
|
228
|
+
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
229
|
+
@parse_tonodes.call(str)
|
230
|
+
end
|
231
|
+
|
232
|
+
# Parses the given string `str`, and returns
|
233
|
+
# a list of `mecab` result strings.
|
234
|
+
# @param [String] str
|
235
|
+
# @return [Array] of parsed `mecab` result strings.
|
236
|
+
# @raise [MeCabError] if the `mecab` tagger cannot parse the given string `str`
|
237
|
+
# @raise [ArgumentError] if the given string `str` argument is `nil`
|
238
|
+
def parse_as_strings(str)
|
239
|
+
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
240
|
+
@parse_tostr.call(str).force_encoding(Encoding.default_external).lines.to_a
|
241
|
+
end
|
242
|
+
|
243
|
+
# DEPRECATED: use parse_as_nodes instead.
|
244
|
+
def readnodes(str)
|
245
|
+
$stdout.puts 'DEPRECATED: use parse_as_nodes instead'
|
246
|
+
parse_as_nodes(str)
|
247
|
+
end
|
248
|
+
|
249
|
+
# DEPRECATED: use parse_as_strings instead.
|
250
|
+
def readlines(str)
|
251
|
+
$stdout.puts 'DEPRECATED: use parse_as_strings instead'
|
252
|
+
parse_as_strings(str)
|
253
|
+
end
|
254
|
+
|
255
|
+
# Returns human-readable details for the wrapped `mecab` tagger.
|
256
|
+
# Overrides `Object#to_s`.
|
257
|
+
#
|
258
|
+
# - encoded object id
|
259
|
+
# - underlying FFI pointer to the `mecab` tagger
|
260
|
+
# - options hash
|
261
|
+
# - list of dictionaries
|
262
|
+
# - MeCab version
|
263
|
+
#
|
264
|
+
# @return [String] encoded object id, underlying FFI pointer, options hash, list of dictionaries, and MeCab version
|
265
|
+
def to_s
|
266
|
+
%(#{super.chop} @tagger=#{@tagger}, @options=#{@options.inspect}, @dicts=#{@dicts.to_s}, @version="#{@version.to_s}">)
|
267
|
+
end
|
268
|
+
|
269
|
+
# Overrides `Object#inspect`.
|
270
|
+
#
|
271
|
+
# @return [String] encoded object id, FFI pointer, options hash, list of dictionaries, and MeCab version
|
272
|
+
# @see #to_s
|
273
|
+
def inspect
|
274
|
+
self.to_s
|
275
|
+
end
|
276
|
+
|
277
|
+
# Returns a `Proc` that will properly free resources
|
278
|
+
# when this `MeCab` instance is garbage collected.
|
279
|
+
# The `Proc` returned is registered to be invoked
|
280
|
+
# after the `MeCab` instance owning `ptr`
|
281
|
+
# has been destroyed.
|
282
|
+
#
|
283
|
+
# @param [FFI::Pointer] ptr
|
284
|
+
# @return [Proc] to release `mecab` resources properly
|
285
|
+
def self.create_free_proc(ptr)
|
286
|
+
Proc.new do
|
287
|
+
self.mecab_destroy(ptr)
|
288
|
+
end
|
289
|
+
end
|
290
|
+
end
|
291
|
+
|
292
|
+
# `MeCabError` is a general error class
|
293
|
+
# for the `Natto` module.
|
294
|
+
class MeCabError < RuntimeError; end
|
295
|
+
end
|
data/lib/natto/option_parse.rb
CHANGED
@@ -1,13 +1,16 @@
|
|
1
1
|
module Natto
|
2
2
|
|
3
|
-
# Module
|
4
|
-
# for parsing the various
|
5
|
-
#
|
3
|
+
# Module `OptionParse` encapsulates methods and behavior
|
4
|
+
# for parsing the various `mecab` options supported by
|
5
|
+
# `Natto`.
|
6
6
|
module OptionParse
|
7
7
|
require 'optparse'
|
8
8
|
|
9
|
-
|
10
|
-
|
9
|
+
WARNING_LATTICE_LEVEL =
|
10
|
+
":lattice-level is DEPRECATED, please use :marginal or :nbest\n".freeze
|
11
|
+
|
12
|
+
# Mapping of mecab short-style configuration options to the `mecab`
|
13
|
+
# tagger. See the `mecab` help for more details.
|
11
14
|
SUPPORTED_OPTS = { '-r' => :rcfile,
|
12
15
|
'-d' => :dicdir,
|
13
16
|
'-u' => :userdic,
|
@@ -15,6 +18,9 @@ module Natto
|
|
15
18
|
'-O' => :output_format_type,
|
16
19
|
'-a' => :all_morphs,
|
17
20
|
'-N' => :nbest,
|
21
|
+
'-p' => :partial,
|
22
|
+
'-m' => :marginal,
|
23
|
+
'-M' => :max_grouping_size,
|
18
24
|
'-F' => :node_format,
|
19
25
|
'-U' => :unk_format,
|
20
26
|
'-B' => :bos_format,
|
@@ -42,35 +48,36 @@ module Natto
|
|
42
48
|
h = {}
|
43
49
|
if options.is_a? String
|
44
50
|
opts = OptionParser.new do |opts|
|
45
|
-
opts.on('-r', '--rcfile ARG')
|
46
|
-
opts.on('-d', '--dicdir ARG')
|
47
|
-
opts.on('-u', '--userdic ARG')
|
48
|
-
opts.on('-l', '--lattice-level ARG')
|
49
|
-
opts.on('-O', '--output-format-type ARG') { |arg| h[:output_format_type]
|
50
|
-
opts.on('-a', '--all-morphs')
|
51
|
-
opts.on('-N', '--nbest ARG')
|
52
|
-
|
53
|
-
opts.on('-
|
54
|
-
opts.on('-
|
55
|
-
opts.on('-
|
56
|
-
opts.on('-
|
57
|
-
opts.on('-
|
58
|
-
opts.on('-
|
59
|
-
opts.on('-
|
60
|
-
|
61
|
-
opts.on('-
|
62
|
-
opts.on('-
|
63
|
-
opts.on('-
|
51
|
+
opts.on('-r', '--rcfile ARG') { |arg| h[:rcfile] = arg.strip }
|
52
|
+
opts.on('-d', '--dicdir ARG') { |arg| h[:dicdir] = arg.strip }
|
53
|
+
opts.on('-u', '--userdic ARG') { |arg| h[:userdic] = arg.strip }
|
54
|
+
opts.on('-l', '--lattice-level ARG') { |arg| h[:lattice_level] = arg.strip.to_i } # !deprecated in 0.99!!!
|
55
|
+
opts.on('-O', '--output-format-type ARG') { |arg| h[:output_format_type] = arg.strip }
|
56
|
+
opts.on('-a', '--all-morphs') { |arg| h[:all_morphs] = true }
|
57
|
+
opts.on('-N', '--nbest ARG') { |arg| h[:nbest] = arg.strip.to_i }
|
58
|
+
opts.on('-p', '--partial') { |arg| h[:partial] = true }
|
59
|
+
opts.on('-m', '--marginal') { |arg| h[:marginal] = true }
|
60
|
+
opts.on('-M', '--max-grouping-size ARG'){ |arg| h[:max_grouping_size] = arg.strip.to_i }
|
61
|
+
opts.on('-F', '--node-format ARG') { |arg| h[:node_format] = arg.strip }
|
62
|
+
opts.on('-U', '--unk-format ARG') { |arg| h[:unk_format] = arg.strip }
|
63
|
+
opts.on('-B', '--bos-format ARG') { |arg| h[:bos_format] = arg.strip }
|
64
|
+
opts.on('-E', '--eos-format ARG') { |arg| h[:eos_format] = arg.strip }
|
65
|
+
opts.on('-S', '--eon-format ARG') { |arg| h[:eon_format] = arg.strip }
|
66
|
+
opts.on('-x', '--unk-feature ARG') { |arg| h[:unk_feature] = arg.strip }
|
67
|
+
opts.on('-b', '--input-buffer-size ARG'){ |arg| h[:input_buffer_size] = arg.strip.to_i }
|
68
|
+
opts.on('-C', '--allocate-sentence') { |arg| h[:allocate_sentence] = true }
|
69
|
+
opts.on('-t', '--theta ARG') { |arg| h[:theta] = arg.strip.to_f }
|
70
|
+
opts.on('-c', '--cost-factor ARG') { |arg| h[:cost_factor] = arg.strip.to_i }
|
64
71
|
end
|
65
72
|
opts.parse!(options.split)
|
66
73
|
else
|
67
74
|
SUPPORTED_OPTS.values.each do |k|
|
68
75
|
if options.has_key?(k)
|
69
|
-
if [ :all_morphs, :allocate_sentence ].include?(k)
|
76
|
+
if [ :all_morphs, :partial, :marginal, :allocate_sentence ].include?(k)
|
70
77
|
h[k] = true
|
71
78
|
else
|
72
79
|
v = options[k]
|
73
|
-
if [ :lattice_level, :
|
80
|
+
if [ :lattice_level, :nbest, :max_grouping_size, :input_buffer_size, :cost_factor ].include?(k)
|
74
81
|
h[k] = v.to_i
|
75
82
|
elsif k == :theta
|
76
83
|
h[k] = v.to_f
|
@@ -81,15 +88,16 @@ module Natto
|
|
81
88
|
end
|
82
89
|
end
|
83
90
|
end
|
91
|
+
$stderr.print WARNING_LATTICE_LEVEL if h.include? :lattice_level
|
84
92
|
raise MeCabError.new("Invalid N value") if h[:nbest] && (h[:nbest] < 1 || h[:nbest] > 512)
|
85
93
|
h
|
86
94
|
end
|
87
95
|
|
88
96
|
# Returns a string-representation of the options to
|
89
|
-
# be passed in the construction of the
|
97
|
+
# be passed in the construction of the `mecab` tagger.
|
90
98
|
#
|
91
99
|
# @param [Hash] options
|
92
|
-
# @return [String] representation of the options to the
|
100
|
+
# @return [String] representation of the options to the `mecab` tagger
|
93
101
|
def build_options_str(options={})
|
94
102
|
opt = []
|
95
103
|
SUPPORTED_OPTS.values.each do |k|
|