natto 0.9.4 → 0.9.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +9 -0
- data/README.md +7 -7
- data/lib/natto.rb +43 -117
- data/lib/natto/option_parse.rb +109 -0
- data/lib/natto/version.rb +5 -1
- metadata +21 -21
data/CHANGELOG
CHANGED
|
@@ -1,5 +1,14 @@
|
|
|
1
1
|
## CHANGELOG
|
|
2
2
|
|
|
3
|
+
- __2012/09/16__: 0.9.5 release.
|
|
4
|
+
- Fixed [Issue 9: trimされていない文字列のparse](https://bitbucket.org/buruzaemon/natto/issue/9/trim-parse)
|
|
5
|
+
- Fixed [Issue 10: BUG Segmentation Fault](https://bitbucket.org/buruzaemon/natto/issue/10/bug-segmentation-fault)
|
|
6
|
+
- Adding parse_as_nodes to allow for method-chaining on list of parsed nodes
|
|
7
|
+
- Adding parse_as_strings to allow for method-chaining on list of string output
|
|
8
|
+
- Deprecating both readnodes and readlines (badly named methods, see parse_as_nodes and parse_as_strings, respectively)
|
|
9
|
+
- Refactored the option parsing logic into Natto::OptionParse
|
|
10
|
+
- Enhanced Natto::DictionaryInfo#type override for java or ruby < 1.9
|
|
11
|
+
|
|
3
12
|
- __2012/02/26__: 0.9.4 release.
|
|
4
13
|
- Exposing the underlying FFI pointer as @tagger in Natto::MeCab
|
|
5
14
|
- Exposing the underlying FFI pointer as @pointer in Natto::MeCabNode
|
data/README.md
CHANGED
|
@@ -11,7 +11,7 @@ You can learn more about [natto at bitbucket](https://bitbucket.org/buruzaemon/n
|
|
|
11
11
|
## Requirements
|
|
12
12
|
natto requires the following:
|
|
13
13
|
|
|
14
|
-
- [MeCab _0.
|
|
14
|
+
- [MeCab _0.994_](http://code.google.com/p/mecab/downloads/list)
|
|
15
15
|
- [ffi _0.6.3 or greater_](http://rubygems.org/gems/ffi)
|
|
16
16
|
- Ruby _1.8.7 or greater_
|
|
17
17
|
|
|
@@ -20,16 +20,16 @@ Install natto with the following gem command:
|
|
|
20
20
|
|
|
21
21
|
gem install natto
|
|
22
22
|
|
|
23
|
-
This will automatically install the [ffi](http://rubygems.org/gems/ffi) rubygem, which
|
|
23
|
+
This will automatically install the [ffi](http://rubygems.org/gems/ffi) rubygem, which natto uses to bind to the <tt>mecab</tt> library.
|
|
24
24
|
|
|
25
25
|
## Installation on Windows
|
|
26
|
-
However, if you are using a CRuby on Windows, then you will first need to install the [RubyInstaller Development Kit (DevKit)](https://github.com/oneclick/rubyinstaller/wiki/Development-Kit),
|
|
26
|
+
However, if you are using a CRuby on Windows, then you will first need to install the [RubyInstaller Development Kit (DevKit)](https://github.com/oneclick/rubyinstaller/wiki/Development-Kit), an MSYS/MinGW-based toolkit that enables your Windows Ruby installation to build many of the native C/C++ extensions available, including <tt>ffi</tt>.
|
|
27
27
|
|
|
28
28
|
1. Download the latest release for RubyInstaller for Windows platforms and the corresponding DevKit from the [RubyInstaller for Windows downloads page](http://rubyinstaller.org/downloads/).
|
|
29
29
|
2. After installing RubyInstaller for Windows, double-click on the DevKit-tdm installer <tt>.exe</tt>, and expand the contents to an appropriate location, for example <tt>C:\devkit</tt>.
|
|
30
30
|
3. Open a command window under <tt>C:\devkit</tt>, and execute: <tt>ruby dk.rb init</tt>. This will locate all known ruby installations, and add them to <tt>C:\devkit\config.yml</tt>.
|
|
31
|
-
4. Next, execute: <tt>ruby dk.rb install</tt>, which will add the DevKit to all of the installed rubies listed in your <tt>C:\devkit\config.yml</tt>.
|
|
32
|
-
5.
|
|
31
|
+
4. Next, execute: <tt>ruby dk.rb install</tt>, which will add the DevKit to all of the installed rubies listed in your <tt>C:\devkit\config.yml</tt>. Now you should be able to install and build the <tt>ffi</tt> rubygem correctly on your Windows-installed ruby.
|
|
32
|
+
5. Install <tt>natto</tt> with:
|
|
33
33
|
|
|
34
34
|
gem install natto
|
|
35
35
|
|
|
@@ -65,10 +65,10 @@ e.g., from within a Ruby program
|
|
|
65
65
|
type="0", \
|
|
66
66
|
filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
|
67
67
|
charset="utf8">], \
|
|
68
|
-
@version="0.
|
|
68
|
+
@version="0.994">
|
|
69
69
|
|
|
70
70
|
puts nm.version
|
|
71
|
-
=> "0.
|
|
71
|
+
=> "0.994"
|
|
72
72
|
|
|
73
73
|
sysdic = nm.dicts.first
|
|
74
74
|
|
data/lib/natto.rb
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
# coding: utf-8
|
|
2
2
|
require 'rubygems' if RUBY_VERSION.to_f < 1.9
|
|
3
3
|
require 'natto/binding'
|
|
4
|
+
require 'natto/option_parse'
|
|
4
5
|
require 'natto/utils'
|
|
5
6
|
|
|
6
7
|
module Natto
|
|
7
8
|
require 'ffi'
|
|
8
|
-
require 'optparse'
|
|
9
9
|
|
|
10
10
|
# <tt>MeCab</tt> is a wrapper class for the <tt>mecab</tt> tagger.
|
|
11
11
|
# Options to the <tt>mecab</tt> tagger are passed in as a string
|
|
@@ -20,12 +20,12 @@ module Natto
|
|
|
20
20
|
# nm = Natto::MeCab.new('-Ochasen')
|
|
21
21
|
# => #<Natto::MeCab:0x28d3bdc8 \
|
|
22
22
|
# @tagger=#<FFI::Pointer address=0x28afb980>, \
|
|
23
|
-
# @options={:output_format_type=>"chasen"},
|
|
23
|
+
# @options={:output_format_type=>"chasen"}, \
|
|
24
24
|
# @dicts=[#<Natto::DictionaryInfo:0x289a1f14 \
|
|
25
25
|
# type="0", \
|
|
26
26
|
# filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
|
27
27
|
# charset="utf8">], \
|
|
28
|
-
# @version="0.
|
|
28
|
+
# @version="0.994">
|
|
29
29
|
#
|
|
30
30
|
# nm.parse('凡人にしか見えねえ風景ってのがあるんだよ。') do |n|
|
|
31
31
|
# puts "#{n.surface}\t#{n.feature}"
|
|
@@ -48,30 +48,11 @@ module Natto
|
|
|
48
48
|
#
|
|
49
49
|
class MeCab
|
|
50
50
|
include Natto::Binding
|
|
51
|
+
include Natto::OptionParse
|
|
51
52
|
include Natto::Utils
|
|
52
53
|
|
|
53
54
|
attr_reader :tagger, :options, :dicts, :version
|
|
54
55
|
|
|
55
|
-
# Mapping of mecab short-style configuration options to the <tt>mecab</tt> tagger.
|
|
56
|
-
# See the <tt>mecab</tt> help for more details.
|
|
57
|
-
SUPPORTED_OPTS = { '-r' => :rcfile,
|
|
58
|
-
'-d' => :dicdir,
|
|
59
|
-
'-u' => :userdic,
|
|
60
|
-
'-l' => :lattice_level,
|
|
61
|
-
'-O' => :output_format_type,
|
|
62
|
-
'-a' => :all_morphs,
|
|
63
|
-
'-N' => :nbest,
|
|
64
|
-
'-F' => :node_format,
|
|
65
|
-
'-U' => :unk_format,
|
|
66
|
-
'-B' => :bos_format,
|
|
67
|
-
'-E' => :eos_format,
|
|
68
|
-
'-S' => :eon_format,
|
|
69
|
-
'-x' => :unk_feature,
|
|
70
|
-
'-b' => :input_buffer_size,
|
|
71
|
-
'-C' => :allocate_sentence,
|
|
72
|
-
'-t' => :theta,
|
|
73
|
-
'-c' => :cost_factor }.freeze
|
|
74
|
-
|
|
75
56
|
# Initializes the wrapped <tt>mecab</tt> instance with the
|
|
76
57
|
# given <tt>options</tt>.
|
|
77
58
|
#
|
|
@@ -108,7 +89,7 @@ module Natto
|
|
|
108
89
|
# type="0", \
|
|
109
90
|
# filename="/usr/local/lib/mecab/dic/ipadic/sys.dic" \
|
|
110
91
|
# charset="utf8">], \
|
|
111
|
-
# @version="0.
|
|
92
|
+
# @version="0.994">
|
|
112
93
|
#
|
|
113
94
|
# puts nm.parse('才能とは求める人間に与えられるものではない。')
|
|
114
95
|
# 才能 サイノウ
|
|
@@ -128,10 +109,8 @@ module Natto
|
|
|
128
109
|
#
|
|
129
110
|
# @param [Hash or String]
|
|
130
111
|
# @raise [MeCabError] if <tt>mecab</tt> cannot be initialized with the given <tt>options</tt>
|
|
131
|
-
# @see MeCab::SUPPORTED_OPTS
|
|
132
112
|
def initialize(options={})
|
|
133
113
|
@options = self.class.parse_mecab_options(options)
|
|
134
|
-
|
|
135
114
|
@dicts = []
|
|
136
115
|
|
|
137
116
|
opt_str = self.class.build_options_str(@options)
|
|
@@ -146,7 +125,6 @@ module Natto
|
|
|
146
125
|
# for both parsing as string and yielding a node object
|
|
147
126
|
# N-Best parsing implementations
|
|
148
127
|
if @options[:nbest] && @options[:nbest] > 1
|
|
149
|
-
# nbest parsing require lattice level >= 1
|
|
150
128
|
self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
|
|
151
129
|
@parse_tostr = lambda do |str|
|
|
152
130
|
return self.mecab_nbest_sparse_tostr(@tagger, @options[:nbest], str) ||
|
|
@@ -159,18 +137,19 @@ module Natto
|
|
|
159
137
|
n = self.mecab_nbest_next_tonode(@tagger)
|
|
160
138
|
raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
|
|
161
139
|
nlen = @options[:nbest]
|
|
162
|
-
nlen.times do
|
|
140
|
+
nlen.times do |i|
|
|
163
141
|
s = str.bytes.to_a
|
|
164
142
|
while n && n.address != 0x0
|
|
165
143
|
mn = Natto::MeCabNode.new(n)
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
144
|
+
s = s.drop_while {|e| (e==0xa || e==0x20)}
|
|
145
|
+
if !s.empty?
|
|
146
|
+
sarr = []
|
|
147
|
+
mn.length.times { sarr << s.shift }
|
|
169
148
|
surf = sarr.pack('C*')
|
|
170
149
|
mn.surface = self.class.force_enc(surf)
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
150
|
+
end
|
|
151
|
+
if @options[:output_format_type] || @options[:node_format]
|
|
152
|
+
mn.feature = self.class.force_enc(self.mecab_format_node(@tagger, n))
|
|
174
153
|
end
|
|
175
154
|
nodes << mn if !mn.is_bos?
|
|
176
155
|
n = mn.next
|
|
@@ -191,13 +170,14 @@ module Natto
|
|
|
191
170
|
n = self.mecab_sparse_tonode(@tagger, str)
|
|
192
171
|
raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
|
|
193
172
|
mn = Natto::MeCabNode.new(n)
|
|
194
|
-
n = mn.next if mn.next.address!=0x0
|
|
173
|
+
n = mn.next if mn.next.address!=0x0
|
|
195
174
|
s = str.bytes.to_a
|
|
196
175
|
while n && n.address!=0x0
|
|
197
176
|
mn = Natto::MeCabNode.new(n)
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
177
|
+
s = s.drop_while {|e| (e==0xa || e==0x20)}
|
|
178
|
+
if !s.empty?
|
|
179
|
+
sarr = []
|
|
180
|
+
mn.length.times { sarr << s.shift }
|
|
201
181
|
surf = sarr.pack('C*')
|
|
202
182
|
mn.surface = self.class.force_enc(surf)
|
|
203
183
|
end
|
|
@@ -224,8 +204,10 @@ module Natto
|
|
|
224
204
|
# @param [String] str
|
|
225
205
|
# @return parsing result from <tt>mecab</tt>
|
|
226
206
|
# @raise [MeCabError] if the <tt>mecab</tt> tagger cannot parse the given string <tt>str</tt>
|
|
207
|
+
# @raise [ArgumentError] if the given string <tt>str</tt> argument is <tt>nil</tt>
|
|
227
208
|
# @see MeCabNode
|
|
228
209
|
def parse(str)
|
|
210
|
+
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
|
229
211
|
if block_given?
|
|
230
212
|
nodes = @parse_tonodes.call(str)
|
|
231
213
|
nodes.each {|n| yield n }
|
|
@@ -239,8 +221,10 @@ module Natto
|
|
|
239
221
|
# @param [String] str
|
|
240
222
|
# @return [Array] of parsed <tt>mecab</tt> nodes.
|
|
241
223
|
# @raise [MeCabError] if the <tt>mecab</tt> tagger cannot parse the given string <tt>str</tt>
|
|
224
|
+
# @raise [ArgumentError] if the given string <tt>str</tt> argument is <tt>nil</tt>
|
|
242
225
|
# @see MeCabNode
|
|
243
|
-
def
|
|
226
|
+
def parse_as_nodes(str)
|
|
227
|
+
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
|
244
228
|
@parse_tonodes.call(str)
|
|
245
229
|
end
|
|
246
230
|
|
|
@@ -249,10 +233,24 @@ module Natto
|
|
|
249
233
|
# @param [String] str
|
|
250
234
|
# @return [Array] of parsed <tt>mecab</tt> result strings.
|
|
251
235
|
# @raise [MeCabError] if the <tt>mecab</tt> tagger cannot parse the given string <tt>str</tt>
|
|
252
|
-
|
|
236
|
+
# @raise [ArgumentError] if the given string <tt>str</tt> argument is <tt>nil</tt>
|
|
237
|
+
def parse_as_strings(str)
|
|
238
|
+
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
|
253
239
|
self.class.force_enc(@parse_tostr.call(str)).lines.to_a
|
|
254
240
|
end
|
|
255
241
|
|
|
242
|
+
# DEPRECATED: use parse_as_nodes instead.
|
|
243
|
+
def readnodes(str)
|
|
244
|
+
$stdout.puts 'DEPRECATED: use parse_as_nodes instead'
|
|
245
|
+
parse_as_nodes(str)
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
# DEPRECATED: use parse_as_strings instead.
|
|
249
|
+
def readlines(str)
|
|
250
|
+
$stdout.puts 'DEPRECATED: use parse_as_strings instead'
|
|
251
|
+
parse_as_strings(str)
|
|
252
|
+
end
|
|
253
|
+
|
|
256
254
|
# Returns human-readable details for the wrapped <tt>mecab</tt> tagger.
|
|
257
255
|
# Overrides <tt>Object#to_s</tt>.
|
|
258
256
|
#
|
|
@@ -288,77 +286,6 @@ module Natto
|
|
|
288
286
|
self.mecab_destroy(ptr)
|
|
289
287
|
end
|
|
290
288
|
end
|
|
291
|
-
|
|
292
|
-
# Prepares and returns a hash mapping symbols for
|
|
293
|
-
# the specified, recognized MeCab options, and their
|
|
294
|
-
# values. Will parse and convert string (short or
|
|
295
|
-
# long argument styles) or hash.
|
|
296
|
-
def self.parse_mecab_options(options={})
|
|
297
|
-
h = {}
|
|
298
|
-
if options.is_a? String
|
|
299
|
-
opts = OptionParser.new do |opts|
|
|
300
|
-
opts.on('-r', '--rcfile ARG') { |arg| h[:rcfile] = arg.strip }
|
|
301
|
-
opts.on('-d', '--dicdir ARG') { |arg| h[:dicdir] = arg.strip }
|
|
302
|
-
opts.on('-u', '--userdic ARG') { |arg| h[:userdic] = arg.strip }
|
|
303
|
-
opts.on('-l', '--lattice-level ARG') { |arg| h[:lattice_level] = arg.strip.to_i } # !deprecated in 0.99!!!
|
|
304
|
-
opts.on('-O', '--output-format-type ARG') { |arg| h[:output_format_type] = arg.strip }
|
|
305
|
-
opts.on('-a', '--all-morphs') { |arg| h[:all_morphs] = true }
|
|
306
|
-
opts.on('-N', '--nbest ARG') { |arg| h[:nbest] = arg.strip.to_i }
|
|
307
|
-
#opts.on('-m', '--marginal') { |arg| h[:marginal] = true }
|
|
308
|
-
opts.on('-F', '--node-format ARG') { |arg| h[:node_format] = arg.strip }
|
|
309
|
-
opts.on('-U', '--unk-format ARG') { |arg| h[:unk_format] = arg.strip }
|
|
310
|
-
opts.on('-B', '--bos-format ARG') { |arg| h[:bos_format] = arg.strip }
|
|
311
|
-
opts.on('-E', '--eos-format ARG') { |arg| h[:eos_format] = arg.strip }
|
|
312
|
-
opts.on('-S', '--eon-format ARG') { |arg| h[:eon_format] = arg.strip }
|
|
313
|
-
opts.on('-x', '--unk-feature ARG') { |arg| h[:unk_feature] = arg.strip }
|
|
314
|
-
opts.on('-b', '--input-buffer-size ARG') { |arg| h[:input_buffer_size] = arg.strip.to_i }
|
|
315
|
-
#opts.on('-M', '--open-mutable-dictionary') { |arg| h[:open_mutable_dictionary] = true }
|
|
316
|
-
opts.on('-C', '--allocate-sentence') { |arg| h[:allocate_sentence] = true }
|
|
317
|
-
opts.on('-t', '--theta ARG') { |arg| h[:theta] = arg.strip.to_f }
|
|
318
|
-
opts.on('-c', '--cost-factor ARG') { |arg| h[:cost_factor] = arg.strip.to_i }
|
|
319
|
-
end
|
|
320
|
-
opts.parse!(options.split)
|
|
321
|
-
else
|
|
322
|
-
SUPPORTED_OPTS.values.each do |k|
|
|
323
|
-
if options.has_key?(k)
|
|
324
|
-
if [ :all_morphs, :allocate_sentence ].include?(k)
|
|
325
|
-
h[k] = true
|
|
326
|
-
else
|
|
327
|
-
v = options[k]
|
|
328
|
-
if [ :lattice_level, :input_buffer_size, :nbest, :cost_factor ].include?(k)
|
|
329
|
-
h[k] = v.to_i
|
|
330
|
-
elsif k == :theta
|
|
331
|
-
h[k] = v.to_f
|
|
332
|
-
else
|
|
333
|
-
h[k] = v
|
|
334
|
-
end
|
|
335
|
-
end
|
|
336
|
-
end
|
|
337
|
-
end
|
|
338
|
-
end
|
|
339
|
-
raise MeCabError.new("Invalid N value") if h[:nbest] && (h[:nbest] < 1 || h[:nbest] > 512)
|
|
340
|
-
h
|
|
341
|
-
end
|
|
342
|
-
|
|
343
|
-
# Returns a string-representation of the options to
|
|
344
|
-
# be passed in the construction of the <tt>mecab</tt> tagger.
|
|
345
|
-
#
|
|
346
|
-
# @param [Hash] options
|
|
347
|
-
# @return [String] representation of the options to the <tt>mecab</tt> tagger
|
|
348
|
-
def self.build_options_str(options={})
|
|
349
|
-
opt = []
|
|
350
|
-
SUPPORTED_OPTS.values.each do |k|
|
|
351
|
-
if options.has_key? k
|
|
352
|
-
key = k.to_s.gsub('_', '-')
|
|
353
|
-
if %w( all-morphs allocate-sentence ).include? key
|
|
354
|
-
opt << "--#{key}" if options[k]==true
|
|
355
|
-
else
|
|
356
|
-
opt << "--#{key}=#{options[k]}"
|
|
357
|
-
end
|
|
358
|
-
end
|
|
359
|
-
end
|
|
360
|
-
opt.empty? ? "" : opt.join(" ")
|
|
361
|
-
end
|
|
362
289
|
end
|
|
363
290
|
|
|
364
291
|
# <tt>MeCabError</tt> is a general error class
|
|
@@ -429,14 +356,13 @@ module Natto
|
|
|
429
356
|
:version, :ushort,
|
|
430
357
|
:next, :pointer
|
|
431
358
|
|
|
432
|
-
if
|
|
359
|
+
if Object.respond_to?(:type) && Object.respond_to?(:class)
|
|
433
360
|
alias_method :deprecated_type, :type
|
|
434
|
-
# <tt>Object#type</tt> override defined when <tt>
|
|
435
|
-
#
|
|
436
|
-
# deprecation warning thrown up in Ruby 1.8.7
|
|
361
|
+
# <tt>Object#type</tt> override defined when both <tt>type</tt> and
|
|
362
|
+
# <tt>class</tt> are Object methods. This is a hack to avoid the
|
|
363
|
+
# <tt>Object#type</tt> deprecation warning thrown up in Ruby 1.8.7
|
|
364
|
+
# and in JRuby.
|
|
437
365
|
#
|
|
438
|
-
# <i>This method override is not defined when the Ruby interpreter
|
|
439
|
-
# is 1.9 or greater.</i>
|
|
440
366
|
# @return [Fixnum] <tt>mecab</tt> dictionary type
|
|
441
367
|
def type
|
|
442
368
|
self[:type]
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
module Natto
|
|
2
|
+
|
|
3
|
+
# Module <tt>OptionParse</tt> encapsulates methods and behavior
|
|
4
|
+
# for parsing the various <tt>mecab</tt> options supported by
|
|
5
|
+
# <tt>Natto</tt>.
|
|
6
|
+
module OptionParse
|
|
7
|
+
require 'optparse'
|
|
8
|
+
|
|
9
|
+
# Mapping of mecab short-style configuration options to the <tt>mecab</tt> tagger.
|
|
10
|
+
# See the <tt>mecab</tt> help for more details.
|
|
11
|
+
SUPPORTED_OPTS = { '-r' => :rcfile,
|
|
12
|
+
'-d' => :dicdir,
|
|
13
|
+
'-u' => :userdic,
|
|
14
|
+
'-l' => :lattice_level,
|
|
15
|
+
'-O' => :output_format_type,
|
|
16
|
+
'-a' => :all_morphs,
|
|
17
|
+
'-N' => :nbest,
|
|
18
|
+
'-F' => :node_format,
|
|
19
|
+
'-U' => :unk_format,
|
|
20
|
+
'-B' => :bos_format,
|
|
21
|
+
'-E' => :eos_format,
|
|
22
|
+
'-S' => :eon_format,
|
|
23
|
+
'-x' => :unk_feature,
|
|
24
|
+
'-b' => :input_buffer_size,
|
|
25
|
+
'-C' => :allocate_sentence,
|
|
26
|
+
'-t' => :theta,
|
|
27
|
+
'-c' => :cost_factor }.freeze
|
|
28
|
+
|
|
29
|
+
# @private
|
|
30
|
+
def self.included(base)
|
|
31
|
+
base.extend(ClassMethods)
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
# @private
|
|
35
|
+
module ClassMethods
|
|
36
|
+
|
|
37
|
+
# Prepares and returns a hash mapping symbols for
|
|
38
|
+
# the specified, recognized MeCab options, and their
|
|
39
|
+
# values. Will parse and convert string (short or
|
|
40
|
+
# long argument styles) or hash.
|
|
41
|
+
def parse_mecab_options(options={})
|
|
42
|
+
h = {}
|
|
43
|
+
if options.is_a? String
|
|
44
|
+
opts = OptionParser.new do |opts|
|
|
45
|
+
opts.on('-r', '--rcfile ARG') { |arg| h[:rcfile] = arg.strip }
|
|
46
|
+
opts.on('-d', '--dicdir ARG') { |arg| h[:dicdir] = arg.strip }
|
|
47
|
+
opts.on('-u', '--userdic ARG') { |arg| h[:userdic] = arg.strip }
|
|
48
|
+
opts.on('-l', '--lattice-level ARG') { |arg| h[:lattice_level] = arg.strip.to_i } # !deprecated in 0.99!!!
|
|
49
|
+
opts.on('-O', '--output-format-type ARG') { |arg| h[:output_format_type] = arg.strip }
|
|
50
|
+
opts.on('-a', '--all-morphs') { |arg| h[:all_morphs] = true }
|
|
51
|
+
opts.on('-N', '--nbest ARG') { |arg| h[:nbest] = arg.strip.to_i }
|
|
52
|
+
#opts.on('-m', '--marginal') { |arg| h[:marginal] = true }
|
|
53
|
+
opts.on('-F', '--node-format ARG') { |arg| h[:node_format] = arg.strip }
|
|
54
|
+
opts.on('-U', '--unk-format ARG') { |arg| h[:unk_format] = arg.strip }
|
|
55
|
+
opts.on('-B', '--bos-format ARG') { |arg| h[:bos_format] = arg.strip }
|
|
56
|
+
opts.on('-E', '--eos-format ARG') { |arg| h[:eos_format] = arg.strip }
|
|
57
|
+
opts.on('-S', '--eon-format ARG') { |arg| h[:eon_format] = arg.strip }
|
|
58
|
+
opts.on('-x', '--unk-feature ARG') { |arg| h[:unk_feature] = arg.strip }
|
|
59
|
+
opts.on('-b', '--input-buffer-size ARG') { |arg| h[:input_buffer_size] = arg.strip.to_i }
|
|
60
|
+
#opts.on('-M', '--open-mutable-dictionary') { |arg| h[:open_mutable_dictionary] = true }
|
|
61
|
+
opts.on('-C', '--allocate-sentence') { |arg| h[:allocate_sentence] = true }
|
|
62
|
+
opts.on('-t', '--theta ARG') { |arg| h[:theta] = arg.strip.to_f }
|
|
63
|
+
opts.on('-c', '--cost-factor ARG') { |arg| h[:cost_factor] = arg.strip.to_i }
|
|
64
|
+
end
|
|
65
|
+
opts.parse!(options.split)
|
|
66
|
+
else
|
|
67
|
+
SUPPORTED_OPTS.values.each do |k|
|
|
68
|
+
if options.has_key?(k)
|
|
69
|
+
if [ :all_morphs, :allocate_sentence ].include?(k)
|
|
70
|
+
h[k] = true
|
|
71
|
+
else
|
|
72
|
+
v = options[k]
|
|
73
|
+
if [ :lattice_level, :input_buffer_size, :nbest, :cost_factor ].include?(k)
|
|
74
|
+
h[k] = v.to_i
|
|
75
|
+
elsif k == :theta
|
|
76
|
+
h[k] = v.to_f
|
|
77
|
+
else
|
|
78
|
+
h[k] = v
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
raise MeCabError.new("Invalid N value") if h[:nbest] && (h[:nbest] < 1 || h[:nbest] > 512)
|
|
85
|
+
h
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
# Returns a string-representation of the options to
|
|
89
|
+
# be passed in the construction of the <tt>mecab</tt> tagger.
|
|
90
|
+
#
|
|
91
|
+
# @param [Hash] options
|
|
92
|
+
# @return [String] representation of the options to the <tt>mecab</tt> tagger
|
|
93
|
+
def build_options_str(options={})
|
|
94
|
+
opt = []
|
|
95
|
+
SUPPORTED_OPTS.values.each do |k|
|
|
96
|
+
if options.has_key? k
|
|
97
|
+
key = k.to_s.gsub('_', '-')
|
|
98
|
+
if %w( all-morphs allocate-sentence ).include? key
|
|
99
|
+
opt << "--#{key}" if options[k]==true
|
|
100
|
+
else
|
|
101
|
+
opt << "--#{key}=#{options[k]}"
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
opt.empty? ? "" : opt.join(" ")
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
end
|
|
109
|
+
end
|
data/lib/natto/version.rb
CHANGED
|
@@ -21,7 +21,11 @@
|
|
|
21
21
|
#
|
|
22
22
|
# Module <tt>Natto::Binding</tt> encapsulates methods and behavior
|
|
23
23
|
# which are made available via <tt>FFI</tt> bindings to <tt>mecab</tt>.
|
|
24
|
+
#
|
|
25
|
+
# Module <tt>OptionParse</tt> encapsulates methods and behavior
|
|
26
|
+
# for parsing the various <tt>mecab</tt> options supported by
|
|
27
|
+
# <tt>Natto</tt>.
|
|
24
28
|
module Natto
|
|
25
29
|
# Version string for this Rubygem.
|
|
26
|
-
VERSION = "0.9.
|
|
30
|
+
VERSION = "0.9.5"
|
|
27
31
|
end
|
metadata
CHANGED
|
@@ -1,34 +1,34 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: natto
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
prerelease:
|
|
5
|
+
version: 0.9.5
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
8
8
|
- Brooke M. Fujita
|
|
9
|
-
autorequire:
|
|
9
|
+
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2012-
|
|
12
|
+
date: 2012-09-16 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: ffi
|
|
16
|
-
|
|
17
|
-
none: false
|
|
16
|
+
version_requirements: &2056 !ruby/object:Gem::Requirement
|
|
18
17
|
requirements:
|
|
19
18
|
- - ! '>='
|
|
20
19
|
- !ruby/object:Gem::Version
|
|
21
20
|
version: 0.6.3
|
|
22
|
-
|
|
21
|
+
none: false
|
|
22
|
+
requirement: *2056
|
|
23
23
|
prerelease: false
|
|
24
|
-
|
|
24
|
+
type: :runtime
|
|
25
25
|
description: ! 'natto is a gem bridging Ruby and MeCab using FFI (foreign function
|
|
26
26
|
interface). No compilation is necessary, and natto will run on CRuby (mri/yarv)
|
|
27
27
|
and JRuby (jvm) equally well, on any OS.
|
|
28
28
|
|
|
29
|
-
ruby
|
|
29
|
+
ruby ? mecab ????????? natto ???????????
|
|
30
30
|
|
|
31
|
-
'
|
|
31
|
+
'
|
|
32
32
|
email: buruzaemon@gmail.com
|
|
33
33
|
executables: []
|
|
34
34
|
extensions: []
|
|
@@ -36,8 +36,9 @@ extra_rdoc_files: []
|
|
|
36
36
|
files:
|
|
37
37
|
- lib/natto.rb
|
|
38
38
|
- lib/natto/binding.rb
|
|
39
|
-
- lib/natto/
|
|
39
|
+
- lib/natto/option_parse.rb
|
|
40
40
|
- lib/natto/utils.rb
|
|
41
|
+
- lib/natto/version.rb
|
|
41
42
|
- README.md
|
|
42
43
|
- LICENSE
|
|
43
44
|
- CHANGELOG
|
|
@@ -45,30 +46,29 @@ files:
|
|
|
45
46
|
homepage: https://bitbucket.org/buruzaemon/natto/overview
|
|
46
47
|
licenses:
|
|
47
48
|
- BSD
|
|
48
|
-
post_install_message:
|
|
49
|
+
post_install_message:
|
|
49
50
|
rdoc_options: []
|
|
50
51
|
require_paths:
|
|
51
52
|
- lib
|
|
52
53
|
required_ruby_version: !ruby/object:Gem::Requirement
|
|
53
|
-
none: false
|
|
54
54
|
requirements:
|
|
55
55
|
- - ! '>='
|
|
56
56
|
- !ruby/object:Gem::Version
|
|
57
57
|
version: 1.8.7
|
|
58
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
59
58
|
none: false
|
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
60
60
|
requirements:
|
|
61
61
|
- - ! '>='
|
|
62
62
|
- !ruby/object:Gem::Version
|
|
63
63
|
version: '0'
|
|
64
|
+
none: false
|
|
64
65
|
requirements:
|
|
65
|
-
- MeCab, 0.
|
|
66
|
+
- MeCab, 0.994 or greater
|
|
66
67
|
- FFI, 0.6.3 or greater
|
|
67
|
-
rubyforge_project:
|
|
68
|
-
rubygems_version: 1.8.
|
|
69
|
-
signing_key:
|
|
68
|
+
rubyforge_project:
|
|
69
|
+
rubygems_version: 1.8.15
|
|
70
|
+
signing_key:
|
|
70
71
|
specification_version: 3
|
|
71
|
-
summary: natto combines the Ruby programming language with MeCab, the part-of-speech
|
|
72
|
-
and morphological analyzer for the Japanese language.
|
|
72
|
+
summary: natto combines the Ruby programming language with MeCab, the part-of-speech and morphological analyzer for the Japanese language.
|
|
73
73
|
test_files: []
|
|
74
|
-
|
|
74
|
+
...
|