natto 0.4.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,5 +1,10 @@
1
1
  ## CHANGELOG
2
2
 
3
+ - __2011/02/26__: 0.5.0 release.
4
+ - Added support for node parsing using blocks
5
+ - Added support for mecab options nbest, all-morphs
6
+ - Pulling support for mecab option partial, since it is more of a command-line feature
7
+
3
8
  - __2011/01/27__: 0.4.1 release.
4
9
  - Tweaking the description in natto.gemspec a bit
5
10
 
data/README.md CHANGED
@@ -4,7 +4,7 @@ A Tasty Ruby Binding with MeCab
4
4
  ## What is natto?
5
5
  natto combines the [Ruby programming language](http://www.ruby-lang.org/) with [MeCab](http://mecab.sourceforge.net/), the part-of-speech and morphological analyzer for the Japanese language.
6
6
 
7
- natto is a gem bridging Ruby and MeCab using FFI (foreign function interface). No compilation is necessary, and natto will run on CRuby (mri/yarv) and JRuby (jvm) equally well, on any OS.
7
+ natto is a gem bridging Ruby and MeCab using FFI (foreign function interface). No compilation is necessary, as natto is _not_ a C extension. natto will run on CRuby (mri/yarv) and JRuby (jvm) equally well. natto will also run on Windows, Unix/Linux, and Mac.
8
8
 
9
9
  You can learn more about [natto at Google Code Projects](http://code.google.com/p/natto/).
10
10
 
@@ -31,29 +31,36 @@ e.g., on Windows
31
31
  set MECAB_PATH=C:\Program Files\MeCab\bin\libmecab.dll
32
32
  e.g., for Cygwin
33
33
  export MECAB_PATH=cygmecab-1
34
+ e.g., from within a Ruby program
35
+ ENV['MECAB_PATH']='/usr/local/lib/libmecab.so'
34
36
 
35
37
  ## Usage
36
38
  require 'rubygems' if RUBY_VERSION.to_f < 1.9
37
39
  require 'natto'
38
40
 
39
- mecab = Natto::MeCab.new
40
- => #<Natto::MeCab:0x289b88e0 @ptr=#<FFI::Pointer address=0x288865c8>, \
41
- @options={}, \
42
- @version="0.98", \
43
- @dicts=[/usr/local/lib/mecab/dic/ipadic/sys.dic]>
41
+ nm = Natto::MeCab.new
42
+ => #<Natto::MeCab:0x28d30748
43
+ @ptr=#<FFI::Pointer address=0x28a97d50>, \
44
+ @options={}, \
45
+ @dicts=[#<Natto::DictionaryInfo:0x28d3061c
46
+ filename="/usr/local/lib/mecab/dic/ipadic/sys.dic",
47
+ charset="utf8">],
48
+ @version="0.98">
44
49
 
45
- puts mecab.version
46
- => 0.98
50
+ puts nm.version
51
+ => "0.98"
47
52
 
48
- sysdic = mecab.dicts.first
53
+ sysdic = nm.dicts.first
49
54
 
50
55
  puts sysdic.filename
51
- => /usr/local/lib/mecab/dic/ipadic/sys.dic
56
+ => "/usr/local/lib/mecab/dic/ipadic/sys.dic"
52
57
 
53
58
  puts sysdic.charset
54
- => utf8
59
+ => "utf8"
55
60
 
56
- puts mecab.parse('暑い日にはもってこいの一品ですね。')
61
+ nm.parse('暑い日にはもってこいの一品ですね。') do |n|
62
+ puts "#{n.surface}\t#{n.feature}"
63
+ end
57
64
  暑い 形容詞,自立,*,*,形容詞・アウオ段,基本形,暑い,アツイ,アツイ
58
65
  日 名詞,非自立,副詞可能,*,*,*,日,ヒ,ヒ
59
66
  に 助詞,格助詞,一般,*,*,*,に,ニ,ニ
@@ -64,7 +71,7 @@ e.g., for Cygwin
64
71
  です 助動詞,*,*,*,特殊・デス,基本形,です,デス,デス
65
72
  ね 助詞,終助詞,*,*,*,*,ね,ネ,ネ
66
73
  。 終助詞記号,句点,*,*,*,*,。,。,。
67
- EOS
74
+ BOS/EOS,*,*,*,*,*,*,*,*
68
75
  => nil
69
76
 
70
77
  ## Contributing to natto
data/lib/natto/binding.rb CHANGED
@@ -13,7 +13,7 @@ module Natto
13
13
  # String name for the environment variable used by
14
14
  # <tt>Natto</tt> to indicate the exact name / full path
15
15
  # to the <tt>mecab</tt> library.
16
- MECAB_PATH = 'MECAB_PATH'
16
+ MECAB_PATH = 'MECAB_PATH'.freeze
17
17
 
18
18
  # @private
19
19
  def self.included(base)
@@ -31,11 +31,13 @@ module Natto
31
31
  # @raise [LoadError] if MECAB_PATH environment variable is not set in Windows
32
32
  # <br/>
33
33
  # e.g., for bash on UNIX/Linux
34
- # export MECAB_PATH=mecab.so
34
+ # export MECAB_PATH=/usr/local/lib/libmecab.so
35
35
  # e.g., on Windows
36
36
  # set MECAB_PATH=C:\Program Files\MeCab\bin\libmecab.dll
37
37
  # e.g., for Cygwin
38
38
  # export MECAB_PATH=cygmecab-1
39
+ # e.g., from within a Ruby program
40
+ # ENV['MECAB_PATH']='/usr/local/lib/libmecab.so'
39
41
  def self.find_library
40
42
  host_os = RbConfig::CONFIG['host_os']
41
43
 
@@ -50,33 +52,72 @@ module Natto
50
52
 
51
53
  ffi_lib(ENV[MECAB_PATH] || find_library)
52
54
 
53
- attach_function :mecab_version, [], :string
54
55
  attach_function :mecab_new2, [:string], :pointer
56
+ attach_function :mecab_version, [], :string
57
+ attach_function :mecab_strerror, [:pointer],:string
55
58
  attach_function :mecab_destroy, [:pointer], :void
59
+
60
+ attach_function :mecab_set_theta, [:pointer, :float], :void
61
+ attach_function :mecab_set_lattice_level, [:pointer, :int], :void
62
+ attach_function :mecab_set_all_morphs, [:pointer, :int], :void
63
+
56
64
  attach_function :mecab_sparse_tostr, [:pointer, :string], :string
57
- attach_function :mecab_strerror, [:pointer],:string
65
+ attach_function :mecab_sparse_tonode, [:pointer, :string], :pointer
66
+
67
+ attach_function :mecab_nbest_init, [:pointer, :string], :int
68
+ attach_function :mecab_nbest_sparse_tostr, [:pointer, :int, :string], :string
69
+ attach_function :mecab_nbest_next_tonode, [:pointer], :pointer
70
+
58
71
  attach_function :mecab_dictionary_info, [:pointer], :pointer
59
72
 
60
73
  # @private
61
74
  module ClassMethods
75
+ def mecab_new2(options_str)
76
+ Natto::Binding.mecab_new2(options_str)
77
+ end
78
+
62
79
  def mecab_version
63
80
  Natto::Binding.mecab_version
64
81
  end
65
82
 
66
- def mecab_new2(options_str)
67
- Natto::Binding.mecab_new2(options_str)
83
+ def mecab_strerror(ptr)
84
+ Natto::Binding.mecab_strerror(ptr)
68
85
  end
69
86
 
70
87
  def mecab_destroy(ptr)
71
88
  Natto::Binding.mecab_destroy(ptr)
72
89
  end
73
90
 
91
+ def mecab_set_theta(ptr, t)
92
+ Natto::Binding.mecab_set_theta(ptr, t)
93
+ end
94
+
95
+ def mecab_set_lattice_level(ptr, ll)
96
+ Natto::Binding.mecab_set_lattice_level(ptr, ll)
97
+ end
98
+
99
+ def mecab_set_all_morphs(ptr, am)
100
+ Natto::Binding.mecab_set_all_morphs(ptr, am)
101
+ end
102
+
74
103
  def mecab_sparse_tostr(ptr, str)
75
104
  Natto::Binding.mecab_sparse_tostr(ptr, str)
76
105
  end
106
+
107
+ def mecab_sparse_tonode(ptr, str)
108
+ Natto::Binding.mecab_sparse_tonode(ptr, str)
109
+ end
77
110
 
78
- def mecab_strerror(ptr)
79
- Natto::Binding.mecab_strerror(ptr)
111
+ def mecab_nbest_next_tonode(ptr)
112
+ Natto::Binding.mecab_nbest_next_tonode(ptr)
113
+ end
114
+
115
+ def mecab_nbest_init(ptr, str)
116
+ Natto::Binding.mecab_nbest_init(ptr, str)
117
+ end
118
+
119
+ def mecab_nbest_sparse_tostr(ptr, n, str)
120
+ Natto::Binding.mecab_nbest_sparse_tostr(ptr, n, str)
80
121
  end
81
122
 
82
123
  def mecab_dictionary_info(ptr)
data/lib/natto/version.rb CHANGED
@@ -6,8 +6,15 @@
6
6
  # <tt>Natto::MeCab</tt> is a wrapper class for the <tt>mecab</tt>
7
7
  # parser.
8
8
  #
9
- # <tt>Natto::DictionaryInfo</tt> is a wrapper for a <tt>Natto::MeCab</tt>
10
- # instance's related dictionary information.
9
+ # <tt>Natto::MeCabStruct</tt> is a base class for a <tt>mecab</tt>
10
+ # struct.
11
+ #
12
+ # <tt>Natto::MeCabNode</tt> is a wrapper for the struct representing
13
+ # a <tt>mecab</tt>-parsed node.
14
+ #
15
+ # <tt>Natto::DictionaryInfo</tt> is a wrapper for the struct
16
+ # representing a <tt>Natto::MeCab</tt> instance's related
17
+ # dictionary information.
11
18
  #
12
19
  # <tt>Natto::MeCabError</tt> is a general error class for the
13
20
  # <tt>Natto</tt> module.
@@ -16,5 +23,5 @@
16
23
  # which are made available via <tt>FFI</tt> bindings to <tt>mecab</tt>.
17
24
  module Natto
18
25
  # Version string for this Rubygem.
19
- VERSION = "0.4.1"
26
+ VERSION = "0.5.0"
20
27
  end
data/lib/natto.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+
2
3
  require 'rubygems' if RUBY_VERSION.to_f < 1.9
3
4
  require 'natto/binding'
4
5
 
@@ -14,23 +15,25 @@ module Natto
14
15
  # require 'rubygems' if RUBY_VERSION.to_f < 1.9
15
16
  # require 'natto'
16
17
  #
17
- # mecab = Natto::MeCab.new(:output_format_type=>'wakati')
18
- # => #<Natto::MeCab:0x28dd471c @ptr=#<FFI::Pointer address=0x28a027d8>, \
19
- # @options={:output_format_type=>"wakati"}, \
20
- # @version="0.98", \
21
- # @dicts=[/usr/local/lib/mecab/dic/ipadic/sys.dic]>
22
- #
23
- # output = mecab.parse('ネバネバの組み合わせ美味しいです。').split
18
+ # nm = Natto::MeCab.new(:output_format_type=>'chasen2')
19
+ # => #<Natto::MeCab:0x28d3bdc8 \
20
+ # @ptr=#<FFI::Pointer address=0x28afb980>, \
21
+ # @options={:output_format_type=>"chasen2"}, \
22
+ # @dicts=[#<Natto::DictionaryInfo:0x289a1f14 \
23
+ # filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
24
+ # charset="utf8">], \
25
+ # @version="0.98">
24
26
  #
25
- # output.each do |token|
26
- # puts token
27
+ # nm.parse('ネバネバの組み合わせ美味しいです。') do |n|
28
+ # puts "#{n.surface}\t#{n.feature}"
27
29
  # end
28
- # => ネバネバ
29
- #
30
- # 組み合わせ
31
- # 美味しい
32
- # です
33
- #
30
+ #
31
+ # ネバネバ 名詞,サ変接続,*,*,*,*,ネバネバ,ネバネバ,ネバネバ
32
+ # の 助詞,連体化,*,*,*,*,の,ノ,ノ
33
+ # 組み合わせ 名詞,一般,*,*,*,*,組み合わせ,クミアワセ,クミアワセ
34
+ # 美味しい 形容詞,自立,*,*,形容詞・イ段,基本形,美味しい,オイシイ,オイシイ
35
+ # です 助動詞,*,*,*,特殊・デス,基本形,です,デス,デス
36
+ # 。 記号,句点,*,*,*,*,。,。,。
34
37
  #
35
38
  class MeCab
36
39
  include Natto::Binding
@@ -39,11 +42,11 @@ module Natto
39
42
 
40
43
  # Supported options to the <tt>mecab</tt> parser.
41
44
  # See the <tt>mecab</tt> help for more details.
42
- SUPPORTED_OPTS = [ :rcfile, :dicdir, :userdic, :lattice_level, :all_morphs,
43
- :output_format_type, :partial, :node_format, :unk_format,
44
- :bos_format, :eos_format, :eon_format, :unk_feature,
45
- :input_buffer_size, :allocate_sentence, :nbest, :theta,
46
- :cost_factor ].freeze
45
+ SUPPORTED_OPTS = [ :rcfile, :dicdir, :userdic, :lattice_level, :all_morphs,
46
+ :output_format_type, :node_format, :unk_format,
47
+ :bos_format, :eos_format, :eon_format, :unk_feature,
48
+ :input_buffer_size, :allocate_sentence, :nbest, :theta,
49
+ :cost_factor, :output ].freeze
47
50
 
48
51
  # Initializes the wrapped <tt>mecab</tt> instance with the
49
52
  # given <tt>options</tt> hash.
@@ -56,7 +59,6 @@ module Natto
56
59
  # - :lattice_level -- lattice information level (integer, default 0)
57
60
  # - :all_morphs -- output all morphs (default false)
58
61
  # - :output_format_type -- output format type (wakati, chasen, yomi, etc.)
59
- # - :partial -- partial parsing mode
60
62
  # - :node_format -- user-defined node format
61
63
  # - :unk_format -- user-defined unknown node format
62
64
  # - :bos_format -- user-defined beginning-of-sentence format
@@ -65,20 +67,23 @@ module Natto
65
67
  # - :unk_feature -- feature for unknown word
66
68
  # - :input_buffer_size -- set input buffer size (default 8192)
67
69
  # - :allocate_sentence -- allocate new memory for input sentence
68
- # - :nbest -- output N best results (integer, default 1)
70
+ # - :nbest -- output N best results (integer, default 1), requires lattice level >= 1
69
71
  # - :theta -- temperature parameter theta (float, default 0.75)
70
72
  # - :cost_factor -- cost factor (integer, default 700)
73
+ # - :output -- set the output file name
71
74
  #
72
75
  # <i>Use single-quotes to preserve format options that contain escape chars.</i><br/>
73
76
  # e.g.<br/>
74
77
  #
75
- # mecab = Natto::MeCab.new(:node_format=>'%m\t%f[7]\n')
76
- # => #<Natto::MeCab:0x289b88e0 @ptr=#<FFI::Pointer address=0x288865c8>, \
77
- # @options={:node_format=>"%m\\t%f[7]\\n"}, \
78
- # @version="0.98", \
79
- # @dicts=[/usr/local/lib/mecab/dic/ipadic/sys.dic]>
78
+ # nm = Natto::MeCab.new(:node_format=>'%m\t%f[7]\n')
79
+ # => #<Natto::MeCab:0x28d2ae10 @ptr=#<FFI::Pointer address=0x28a97980>, \
80
+ # @options={:node_format=>"%m\\t%f[7]\\n"}, \
81
+ # @dicts=[#<Natto::DictionaryInfo:0x28d2a85c \
82
+ # filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
83
+ # charset="utf8">], \
84
+ # @version="0.98">
80
85
  #
81
- # puts mecab.parse('簡単で美味しくて良いですよね。')
86
+ # puts nm.parse('簡単で美味しくて良いですよね。')
82
87
  # 簡単 カンタン
83
88
  # で デ
84
89
  # 美味しくて オイシクテ
@@ -101,23 +106,92 @@ module Natto
101
106
  @ptr = self.mecab_new2(opt_str)
102
107
  raise MeCabError.new("Could not initialize MeCab with options: '#{opt_str}'") if @ptr.address == 0x0
103
108
 
109
+ # set mecab parsing options
110
+ self.mecab_set_theta(@ptr, @options[:theta].to_f) if @options[:theta]
111
+ self.mecab_set_lattice_level(@ptr, @options[:lattice_level].to_i) if @options[:lattice_level]
112
+ self.mecab_set_all_morphs(@ptr, 1) if @options[:all_morphs]
113
+
114
+ # set mecab parsing implementations
115
+ if @options[:nbest] && @options[:nbest] > 1
116
+ # N-Best parsing implementations
117
+ self.mecab_nbest_init(@ptr, str)
118
+ # nbest parsing requires lattice level >= 1
119
+ self.mecab_set_lattice_level(@ptr, (@options[:lattice_level] || 1))
120
+ @parse_tostr = lambda { |str|
121
+ return self.mecab_nbest_sparse_tostr(@ptr, @options[:nbest], str) ||
122
+ raise(MeCabError.new(self.mecab_strerror(@ptr)))
123
+ }
124
+ @parse_tonode = lambda { |str| return self.mecab_nbest_next_tonode(@ptr) }
125
+ else
126
+ # default parsing implementations
127
+ @parse_tostr = lambda { |str|
128
+ return self.mecab_sparse_tostr(@ptr, str) || raise(MeCabError.new(self.mecab_strerror(@ptr)))
129
+ }
130
+ @parse_tonode = lambda { |str| return self.mecab_sparse_tonode(@ptr, str) }
131
+ end
132
+
133
+ # set ref to dictionaries
104
134
  @dicts << Natto::DictionaryInfo.new(Natto::Binding.mecab_dictionary_info(@ptr))
105
135
  while @dicts.last.next.address != 0x0
106
136
  @dicts << Natto::DictionaryInfo.new(@dicts.last.next)
107
137
  end
108
138
 
139
+ # set ref to mecab version string
109
140
  @version = self.mecab_version
110
141
 
142
+ # set Proc for freeing mecab pointer
111
143
  ObjectSpace.define_finalizer(self, self.class.create_free_proc(@ptr))
112
144
  end
113
-
114
- # Parses the given string <tt>str</tt>.
145
+
146
+ # Parses the given string <tt>str</tt>. If a block is passed to this method,
147
+ # then node parsing will be used and each node yielded to the given block.
115
148
  #
116
149
  # @param [String] str
117
150
  # @return parsing result from <tt>mecab</tt>
118
151
  # @raise [MeCabError] if the <tt>mecab</tt> parser cannot parse the given string <tt>str</tt>
152
+ # @see MeCabNode
119
153
  def parse(str)
120
- self.mecab_sparse_tostr(@ptr, str) || raise(MeCabError.new(self.mecab_strerror(@ptr)))
154
+ if block_given?
155
+ m_node_ptr = @parse_tonode.call(str)
156
+ head = Natto::MeCabNode.new(m_node_ptr)
157
+ if head && head[:next].address != 0x0
158
+ node = Natto::MeCabNode.new(head[:next])
159
+ while (node.nil? == false)
160
+ yield node
161
+ if node[:next].address != 0x0
162
+ node = Natto::MeCabNode.new(node[:next])
163
+ else
164
+ break
165
+ end
166
+ end
167
+ end
168
+ else
169
+ result = @parse_tostr.call(str)
170
+ result.force_encoding(Encoding.default_external) if result.respond_to?(:encoding) && result.encoding!=Encoding.default_external
171
+ result
172
+ end
173
+ end
174
+
175
+ # Returns human-readable details for the wrapped <tt>mecab</tt> parser.
176
+ # Overrides <tt>Object#to_s</tt>.
177
+ #
178
+ # - encoded object id
179
+ # - FFI pointer to <tt>mecab</tt> object
180
+ # - options hash
181
+ # - list of dictionaries
182
+ # - MeCab version
183
+ #
184
+ # @return [String] encoded object id, FFI pointer, options hash, list of dictionaries, and MeCab version
185
+ def to_s
186
+ %(#{super.chop} @ptr=#{@ptr.to_s}, @options=#{@options.to_s}, @dicts=#{@dicts.to_s}, @version="#{@version.to_s}">)
187
+ end
188
+
189
+ # Overrides <tt>Object#inspect</tt>.
190
+ #
191
+ # @return [String] encoded object id, FFI pointer, options hash, list of dictionaries, and MeCab version
192
+ # @see #to_s
193
+ def inspect
194
+ self.to_s
121
195
  end
122
196
 
123
197
  # Returns a <tt>Proc</tt> that will properly free resources
@@ -138,14 +212,14 @@ module Natto
138
212
  # be passed in the construction of <tt>mecab</tt>.
139
213
  #
140
214
  # @param [Hash] options
141
- # @return string-representation of the options to the <tt>mecab</tt> parser
215
+ # @return [String] representation of the options to the <tt>mecab</tt> parser
142
216
  def self.build_options_str(options={})
143
217
  opt = []
144
218
  SUPPORTED_OPTS.each do |k|
145
219
  if options.has_key? k
146
220
  key = k.to_s.gsub('_', '-')
147
- # all-morphs, partial, and allocate-sentence are just flags
148
- if %w( all-morphs partial allocate-sentence ).include? key
221
+ # all-morphs and allocate-sentence are just flags
222
+ if %w( all-morphs allocate-sentence ).include? key
149
223
  opt << "--#{key}" if options[k]==true
150
224
  else
151
225
  opt << "--#{key}=#{options[k]}"
@@ -160,6 +234,21 @@ module Natto
160
234
  # for the <tt>Natto</tt> module.
161
235
  class MeCabError < RuntimeError; end
162
236
 
237
+ # <tt>MeCabStruct</tt> is a general base class
238
+ # for <tt>FFI::Struct</tt> objects in the <tt>Natto</tt> module.
239
+ class MeCabStruct < FFI::Struct
240
+ # Provides accessor methods for the members of the <tt>mecab</tt> struct.
241
+ #
242
+ # @param [String] attr_name
243
+ # @return member values for the <tt>mecab</tt> struct
244
+ # @raise [NoMethodError] if <tt>attr_name</tt> is not a member of this <tt>mecab</tt> struct
245
+ def method_missing(attr_name)
246
+ member_sym = attr_name.id2name.to_sym
247
+ return self[member_sym] if self.members.include?(member_sym)
248
+ raise(NoMethodError.new("undefined method '#{attr_name}' for #{self}"))
249
+ end
250
+ end
251
+
163
252
  # <tt>DictionaryInfo</tt> is a wrapper for the structure holding
164
253
  # the <tt>MeCab</tt> instance's related dictionary information.
165
254
  #
@@ -180,27 +269,16 @@ module Natto
180
269
  # <tt>mecab</tt> dictionary attributes can be obtained by
181
270
  # using their corresponding accessor.
182
271
  #
183
- # mecab = Natto::MeCab.new
272
+ # nm = Natto::MeCab.new
184
273
  #
185
- # sysdic = m.dicts.first
274
+ # sysdic = nm.dicts.first
186
275
  #
187
276
  # puts sysdic.filename
188
- # => /usr/local/lib/mecab/dic/ipadic/sys.dic
277
+ # => "/usr/local/lib/mecab/dic/ipadic/sys.dic"
189
278
  #
190
279
  # puts sysdic.charset
191
- # => utf8
192
- #
193
- # It is also possible to use the <tt>Symbol</tt> for the
194
- # <tt>mecab</tt> dictionary member to index into the
195
- # <tt>FFI::Struct</tt> layout associative array like so:
196
- #
197
- # puts sysdic[:filename]
198
- # => /usr/local/lib/mecab/dic/ipadic/sys.dic
199
- #
200
- # puts sysdic[:charset]
201
- # => utf8
202
- #
203
- class DictionaryInfo < FFI::Struct
280
+ # => "utf8"
281
+ class DictionaryInfo < MeCabStruct
204
282
 
205
283
  layout :filename, :string,
206
284
  :charset, :string,
@@ -211,7 +289,6 @@ module Natto
211
289
  :version, :ushort,
212
290
  :next, :pointer
213
291
 
214
- # Hack to avoid that deprecation message Object#type thrown in Ruby 1.8.7.
215
292
  if RUBY_VERSION.to_f < 1.9
216
293
  alias_method :deprecated_type, :type
217
294
  # <tt>Object#type</tt> override defined when <tt>RUBY_VERSION</tt> is
@@ -226,22 +303,199 @@ module Natto
226
303
  end
227
304
  end
228
305
 
229
- # Provides accessor methods for the members of the <tt>DictionaryInfo</tt> structure.
306
+ # Returns human-readable details for this <tt>mecab</tt> dictionary.
307
+ # Overrides <tt>Object#to_s</tt>.
230
308
  #
231
- # @param [String] attr_name
232
- # @return member values for the <tt>mecab</tt> dictionary
233
- # @raise [NoMethodError] if <tt>attr_name</tt> is not a member of this <tt>mecab</tt> dictionary <tt>FFI::Struct</tt>
234
- def method_missing(attr_name)
235
- member_sym = attr_name.id2name.to_sym
236
- return self[member_sym] if self.members.include?(member_sym)
237
- raise(NoMethodError.new("undefined method '#{attr_name}' for #{self}"))
309
+ # - encoded object id
310
+ # - full-path dictionary filename
311
+ # - dictionary charset
312
+ #
313
+ # @return [String] encoded object id, dictionary filename, and charset
314
+ def to_s
315
+ %(#{super.chop} filename="#{self.filename}", charset="#{self.charset}">)
316
+ end
317
+
318
+ # Overrides <tt>Object#inspect</tt>.
319
+ #
320
+ # @return [String] encoded object id, dictionary filename, and charset
321
+ # @see #to_s
322
+ def inspect
323
+ self.to_s
238
324
  end
325
+ end
326
+
327
+ # <tt>MeCabNode</tt> is a wrapper for the structure holding
328
+ # the parsed <tt>node</tt>.
329
+ #
330
+ # Values for the <tt>mecab</tt> node attributes may be
331
+ # obtained by using the following <tt>Symbol</tt>s as keys
332
+ # to the layout associative array of <tt>FFI::Struct</tt> members.
333
+ #
334
+ # - :prev
335
+ # - :next
336
+ # - :enext
337
+ # - :bnext
338
+ # - :rpath
339
+ # - :lpath
340
+ # - :begin_node_list
341
+ # - :end_node_list
342
+ # - :surface
343
+ # - :feature
344
+ # - :id
345
+ # - :length
346
+ # - :rlength
347
+ # - :rcAttr
348
+ # - :lcAttr
349
+ # - :posid
350
+ # - :char_type
351
+ # - :stat
352
+ # - :isbest
353
+ # - :sentence_length
354
+ # - :alpha
355
+ # - :beta
356
+ # - :beta
357
+ # - :prob
358
+ # - :wcost
359
+ # - :cost
360
+ # - :token
361
+ #
362
+ # <h2>Usage</h2>
363
+ # An instance of <tt>MeCabNode</tt> is yielded to a block
364
+ # used with <tt>MeCab#parse</tt>. Each resulting node is
365
+ # yielded to the block passed in, where the above-mentioned
366
+ # node attributes may be accessed.
367
+ #
368
+ # nm = Natto::MeCab.new
369
+ #
370
+ # nm.parse('めかぶの使い方がわからなくて困ってました。') do |n|
371
+ # puts "#{n.surface}\t#{n.cost}"
372
+ # end
373
+ #
374
+ # め 7961
375
+ # かぶ 19303
376
+ # の 25995
377
+ # 使い方 29182
378
+ # が 28327
379
+ # わから 33625
380
+ # なく 34256
381
+ # て 36454
382
+ # 困っ 43797
383
+ # て 42178
384
+ # まし 46708
385
+ # た 46111
386
+ # 。 42677
387
+ # 41141
388
+ # => nil
389
+ #
390
+ # It is also possible to use the <tt>Symbol</tt> for the
391
+ # <tt>mecab</tt> node member to index into the
392
+ # <tt>FFI::Struct</tt> layout associative array like so:
393
+ #
394
+ # nm.parse('納豆に乗っけて頂きます!') {|n| puts n[:feature] }
395
+ #
396
+ # 名詞,一般,*,*,*,*,納豆,ナットウ,ナットー
397
+ # 助詞,格助詞,一般,*,*,*,に,ニ,ニ
398
+ # 動詞,自立,*,*,一段,連用形,乗っける,ノッケ,ノッケ
399
+ # 助詞,接続助詞,*,*,*,*,て,テ,テ
400
+ # 動詞,非自立,*,*,五段・カ行イ音便,連用形,頂く,イタダキ,イタダキ
401
+ # 助動詞,*,*,*,特殊・マス,基本形,ます,マス,マス
402
+ # 記号,一般,*,*,*,*,!,!,!
403
+ # BOS/EOS,*,*,*,*,*,*,*,*
404
+ # => nil
405
+ #
406
+ class MeCabNode < MeCabStruct
407
+
408
+ # Normal <tt>mecab</tt> node.
409
+ NOR_NODE = 0
410
+ # Unknown <tt>mecab</tt> node.
411
+ UNK_NODE = 1
412
+ # Beginning-of-string <tt>mecab</tt> node.
413
+ BOS_NODE = 2
414
+ # End-of-string <tt>mecab</tt> node.
415
+ EOS_NODE = 3
416
+ # End-of-NBest <tt>mecab</tt> node list.
417
+ EON_NODE = 4
239
418
 
240
- # Returns the full-path file name for this dictionary. Overrides <tt>Object#to_s</tt>.
419
+ layout :prev, :pointer,
420
+ :next, :pointer,
421
+ :enext, :pointer,
422
+ :bnext, :pointer,
423
+ :rpath, :pointer,
424
+ :lpath, :pointer,
425
+ :begin_node_list, :pointer,
426
+ :end_node_list, :pointer,
427
+ :surface, :string,
428
+ :feature, :string,
429
+ :id, :uint,
430
+ :length, :ushort,
431
+ :rlength, :ushort,
432
+ :rcAttr, :ushort,
433
+ :lcAttr, :ushort,
434
+ :posid, :ushort,
435
+ :char_type, :uchar,
436
+ :stat, :uchar,
437
+ :isbest, :uchar,
438
+ :sentence_length, :uint,
439
+ :alpha, :float,
440
+ :beta, :float,
441
+ :prob, :float,
442
+ :wcost, :short,
443
+ :cost, :long,
444
+ :token, :pointer
445
+
446
+ if RUBY_VERSION.to_f < 1.9
447
+ alias_method :deprecated_id, :id
448
+ # <tt>Object#id</tt> override defined when <tt>RUBY_VERSION</tt> is
449
+ # older than 1.9. This is a hack to avoid the <tt>Object#id</tt>
450
+ # deprecation warning thrown up in Ruby 1.8.7.
451
+ #
452
+ # <i>This method override is not defined when the Ruby interpreter
453
+ # is 1.9 or greater.</i>
454
+ # @return [Fixnum] <tt>mecab</tt> node id
455
+ def id
456
+ self[:id]
457
+ end
458
+ end
459
+
460
+ # Returns the <tt>surface</tt> value for this node.
461
+ #
462
+ # @return [String] <tt>mecab</tt> node surface value
463
+ def surface
464
+ if self[:surface] && self[:length] > 0
465
+ @surface ||= self[:surface].bytes.to_a()[0,self[:length]].pack('C*')
466
+ @surface.force_encoding(Encoding.default_external) if @surface.respond_to?(:encoding) && @surface.encoding!=Encoding.default_external
467
+ end
468
+ @surface
469
+ end
470
+
471
+ # Returns the <tt>feature</tt> value for this node.
472
+ #
473
+ # @return [String] <tt>mecab</tt> node feature value
474
+ def feature
475
+ @feature ||= self[:feature]
476
+ @feature.force_encoding(Encoding.default_external) if @feature.respond_to?(:encoding) && @feature.encoding!=Encoding.default_external
477
+ @feature
478
+ end
479
+
480
+ # Returns human-readable details for the <tt>mecab</tt> node.
481
+ # Overrides <tt>Object#to_s</tt>.
482
+ #
483
+ # - encoded object id
484
+ # - stat
485
+ # - surface
486
+ # - feature
241
487
  #
242
- # @return [String] full-path filename for this dictionary
488
+ # @return [String] encoded object id, stat, surface, and feature
243
489
  def to_s
244
- self[:filename]
490
+ %(#{super.chop} stat=#{self[:stat]}, surface="#{self.surface}", feature="#{self.feature}">)
491
+ end
492
+
493
+ # Overrides <tt>Object#inspect</tt>.
494
+ #
495
+ # @return [String] encoded object id, stat, surface, and feature
496
+ # @see #to_s
497
+ def inspect
498
+ self.to_s
245
499
  end
246
500
  end
247
501
  end
@@ -21,11 +21,18 @@ class TestNattoBinding < Test::Unit::TestCase
21
21
  # Tests for the inclusion of mecab methods made available
22
22
  # to any classes including the Natto::Binding module.
23
23
  def test_functions_included
24
- [ :mecab_version,
25
- :mecab_new2,
24
+ [ :mecab_new2,
25
+ :mecab_version,
26
+ :mecab_strerror,
26
27
  :mecab_destroy,
28
+ :mecab_set_theta,
29
+ :mecab_set_lattice_level,
30
+ :mecab_set_all_morphs,
27
31
  :mecab_sparse_tostr,
28
- :mecab_strerror,
32
+ :mecab_nbest_sparse_tostr,
33
+ :mecab_nbest_init,
34
+ :mecab_nbest_sparse_tostr,
35
+ :mecab_nbest_next_tonode,
29
36
  :mecab_dictionary_info ].each do |f|
30
37
  assert(@klass.respond_to? f)
31
38
  end
@@ -4,11 +4,19 @@
4
4
  # behavior of Natto::DictionaryInfo
5
5
  class TestDictionaryInfo < Test::Unit::TestCase
6
6
  def setup
7
- @m = Natto::MeCab.new
7
+ m = Natto::MeCab.new
8
+ @dicts = m.dicts
9
+
10
+ out = `mecab -D`.lines.to_a
11
+ out.each do |l|
12
+ tokens = l.split("\t")
13
+ @sysdic_filename = tokens[1].strip if tokens[0] =~ /filename:/i
14
+ @sysdic_charset = tokens[1].strip if tokens[0] =~ /charset:/i
15
+ end
8
16
  end
9
17
 
10
18
  def teardown
11
- @m = nil
19
+ @dicts = nil
12
20
  end
13
21
 
14
22
  # Tests the dictionaries accessor method of Natto::MeCab.
@@ -17,28 +25,36 @@ class TestDictionaryInfo < Test::Unit::TestCase
17
25
  # b) system dictionary encoding is utf-8
18
26
  # c) only dealing w/ case of 1 dictionary being used
19
27
  def test_dictionaries_accessor
20
- dicts = @m.dicts
21
- assert dicts.empty? == false
22
- sysdic = dicts.first
23
- assert_equal('/usr/local/lib/mecab/dic/ipadic/sys.dic', sysdic[:filename])
24
- assert_equal('utf8', sysdic[:charset])
28
+ assert @dicts.empty? == false
29
+ sysdic = @dicts.first
30
+ assert_equal(@sysdic_filename, sysdic[:filename])
31
+ assert_equal(@sysdic_charset, sysdic[:charset])
25
32
  assert_equal(0x0, sysdic[:next].address)
26
- #assert_nil(sysdic.next)
27
33
  end
28
34
 
29
35
  # Tests the to_s method.
30
36
  def test_to_s
31
- assert_equal('/usr/local/lib/mecab/dic/ipadic/sys.dic', @m.dicts.first.to_s)
37
+ #<Natto::DictionaryInfo:0x288879bc filename=\"/usr/local/lib/mecab/dic/ipadic/sys.dic\", charset=\"utf8\">
38
+ assert(@dicts.first.to_s.include?("filename=\"#{@sysdic_filename}\", charset=\"#{@sysdic_charset}\""))
32
39
  end
33
40
 
34
41
  # Tests the accessors of Natto::DictionaryInfo.
35
42
  # Note: Object#type is deprecated in 1.9.n, but comes with a warning
36
43
  # in 1.8.n
37
44
  def test_dictionary_info_member_accessors
38
- sysdic = @m.dicts.first
39
- members = %w( filename charset type size lsize rsize version next )
45
+ sysdic = @dicts.first
46
+ members = [
47
+ :filename,
48
+ :charset,
49
+ :type,
50
+ :size,
51
+ :lsize,
52
+ :rsize,
53
+ :version,
54
+ :next
55
+ ]
40
56
  members.each do |nomme|
41
- assert_not_nil(sysdic.send nomme.to_sym )
57
+ assert_not_nil(sysdic.send nomme )
42
58
  end
43
59
 
44
60
  # NoMethodError will be raised for anything else!
@@ -1,8 +1,19 @@
1
1
  # coding: utf-8
2
+ require 'rbconfig'
3
+ require 'nkf'
2
4
 
3
5
  # TestMeCab encapsulates tests for the basic
4
6
  # behavior of Natto::MeCab.
5
7
  class TestMeCab < Test::Unit::TestCase
8
+
9
+ host_os = RbConfig::CONFIG['host_os']
10
+ # we need to transform from UTF-8 to SJIS if we are on Windows!
11
+ if host_os =~ /mswin|mingw/i
12
+ TEST_STR = NKF.nkf("-Ws", '試験ですよ、これが。')
13
+ else
14
+ TEST_STR = '試験ですよ、これが。'
15
+ end
16
+
6
17
  def setup
7
18
  @m = Natto::MeCab.new
8
19
  end
@@ -37,9 +48,6 @@ class TestMeCab < Test::Unit::TestCase
37
48
  res = Natto::MeCab.build_options_str(:output_format_type=>"natto")
38
49
  assert_equal('--output-format-type=natto', res)
39
50
 
40
- res = Natto::MeCab.build_options_str(:partial=>true)
41
- assert_equal('--partial', res)
42
-
43
51
  res = Natto::MeCab.build_options_str(:node_format=>'%m\t%f[7]\n')
44
52
  assert_equal('--node-format=%m\t%f[7]\n', res)
45
53
 
@@ -76,10 +84,8 @@ class TestMeCab < Test::Unit::TestCase
76
84
  res = Natto::MeCab.build_options_str(:output_format_type=>"natto",
77
85
  :userdic=>"/some/file",
78
86
  :dicdir=>"/some/other/file",
79
- :partial=>true,
80
87
  :all_morphs=>true)
81
- assert_equal('--dicdir=/some/other/file --userdic=/some/file --all-morphs --output-format-type=natto --partial', res)
82
-
88
+ assert_equal('--dicdir=/some/other/file --userdic=/some/file --all-morphs --output-format-type=natto', res)
83
89
  end
84
90
 
85
91
  # Tests the construction and initial state of a Natto::MeCab instance.
@@ -96,7 +102,13 @@ class TestMeCab < Test::Unit::TestCase
96
102
  end
97
103
  assert_equal(opts, m.options)
98
104
 
99
- opts = {:all_morphs=>true, :partial=>true, :allocate_sentence=>true}
105
+ opts = {:all_morphs=>true, :allocate_sentence=>true}
106
+ assert_nothing_raised do
107
+ m = Natto::MeCab.new(opts)
108
+ end
109
+ assert_equal(opts, m.options)
110
+
111
+ opts = {:lattice_level=>999}
100
112
  assert_nothing_raised do
101
113
  m = Natto::MeCab.new(opts)
102
114
  end
@@ -126,4 +138,41 @@ class TestMeCab < Test::Unit::TestCase
126
138
  def test_version_accessor
127
139
  assert_equal('0.98', @m.version)
128
140
  end
141
+
142
+ # Tests Natto::MeCab parsing using the --all-morphs option.
143
+ def test_all_morphs
144
+ m = Natto::MeCab.new(:all_morphs=>true)
145
+ expected = `echo #{TEST_STR} | mecab --all-morphs`.lines.to_a
146
+ expected.delete_if {|e| e =~ /^(EOS|BOS)/ }
147
+
148
+ actual = m.parse(TEST_STR).lines.to_a
149
+ actual.delete_if {|e| e =~ /^(EOS|BOS|\t)/ }
150
+
151
+ assert_equal(expected, actual)
152
+ end
153
+
154
+ # Tests Natto::MeCab parsing (default parse_tostr).
155
+ def test_parse_tostr_default
156
+ expected = `echo #{TEST_STR} | mecab`.lines.to_a
157
+ expected.delete_if {|e| e =~ /^(EOS|BOS|\t)/ }
158
+
159
+ actual = @m.parse(TEST_STR).lines.to_a
160
+ actual.delete_if {|e| e =~ /^(EOS|BOS|\t)/ }
161
+
162
+ assert_equal(expected, actual)
163
+ end
164
+
165
+ # Tests Natto::MeCab parsing (default parse_tonode).
166
+ def test_parse_tonode_default
167
+ expected = `echo #{TEST_STR} | mecab`.lines.to_a
168
+ expected.delete_if {|e| e =~ /^(EOS|BOS|\t)/ }
169
+
170
+ actual = []
171
+ @m.parse(TEST_STR) do |node|
172
+ actual << "#{node.surface}\t#{node.feature}\n"
173
+ end
174
+ actual.delete_if {|e| e =~ /^(EOS|BOS|\t)/ }
175
+
176
+ assert_equal(expected, actual)
177
+ end
129
178
  end
@@ -0,0 +1,106 @@
1
+ # coding: utf-8
2
+ require 'rbconfig'
3
+ require 'nkf'
4
+
5
+ # TestMeCabNode encapsulates tests for the basic
6
+ # behavior of Natto::MeCabNode
7
+ class TestMeCabNode < Test::Unit::TestCase
8
+
9
+ host_os = RbConfig::CONFIG['host_os']
10
+ # we need to transform from UTF-8 to SJIS if we are on Windows!
11
+ if host_os =~ /mswin|mingw/i
12
+ TEST_STR = NKF.nkf("-Ws", '試験ですよ、これが。')
13
+ else
14
+ TEST_STR = '試験ですよ、これが。'
15
+ end
16
+
17
+ def setup
18
+ nm = Natto::MeCab.new
19
+ @nodes = []
20
+ nm.parse(TEST_STR) { |n| @nodes << n }
21
+ end
22
+
23
+ def teardown
24
+ @nodes = nil
25
+ end
26
+
27
+ # Tests the surface and feature accessors methods.
28
+ def test_surface_and_feature_accessors
29
+ raw = `echo #{TEST_STR} | mecab`.lines.to_a
30
+ raw.delete_if {|e| e =~ /^(EOS|BOS|\t)/ }
31
+ expected = {}
32
+ raw.each do |l|
33
+ tokens = l.split("\t")
34
+ expected[tokens[0]]=tokens[1].strip
35
+ end
36
+
37
+ actual = {}
38
+ @nodes.each do |n|
39
+ actual[n.surface]=n.feature if (n.stat==Natto::MeCabNode::NOR_NODE ||
40
+ n.stat==Natto::MeCabNode::UNK_NODE)
41
+ end
42
+
43
+ assert_equal(expected, actual)
44
+ end
45
+
46
+ # Tests MeCabNode#surface to show that it is consistent
47
+ # no matter how many times it is invoked.
48
+ def test_manysurfaces
49
+ @nodes.each do |n|
50
+ expected = n.surface
51
+ 5.times { assert_equal(expected, n.surface) }
52
+ end
53
+ end
54
+
55
+ # Tests MeCabNode#feature to show that it is consistent
56
+ # no matter how many times it is invoked.
57
+ def test_manyfeature
58
+ @nodes.each do |n|
59
+ expected = n.feature
60
+ 5.times { assert_equal(expected, n.feature) }
61
+ end
62
+ end
63
+
64
+ # Tests that the accessors of Natto::MeCabNode exist.
65
+ # Note: Object#id is deprecated in 1.9.n, but comes with a warning
66
+ # in 1.8.n
67
+ def test_mecabnode_accessors
68
+ node = @nodes[0]
69
+ members = [
70
+ :prev,
71
+ :next,
72
+ :enext,
73
+ :bnext,
74
+ :rpath,
75
+ :lpath,
76
+ :begin_node_list,
77
+ :end_node_list,
78
+ :surface,
79
+ :feature,
80
+ :id,
81
+ :length,
82
+ :rlength,
83
+ :rcAttr,
84
+ :lcAttr,
85
+ :posid,
86
+ :char_type,
87
+ :stat,
88
+ :isbest,
89
+ :sentence_length,
90
+ :alpha,
91
+ :beta,
92
+ :prob,
93
+ :wcost,
94
+ :cost,
95
+ :token
96
+ ]
97
+ members.each do |nomme|
98
+ assert_not_nil(node.respond_to? nomme )
99
+ end
100
+
101
+ # NoMethodError will be raised for anything else!
102
+ assert_raise NoMethodError do
103
+ node.send :unknown_attr
104
+ end
105
+ end
106
+ end
data/test/test_natto.rb CHANGED
@@ -5,6 +5,7 @@ require 'test/unit'
5
5
  require 'natto'
6
6
 
7
7
  [ '/test/natto/tc_mecab.rb',
8
+ '/test/natto/tc_mecabnode.rb',
8
9
  '/test/natto/tc_dictionaryinfo.rb',
9
10
  '/test/natto/tc_binding.rb' ].each do |tc|
10
11
  require File.join(File.expand_path('.'), tc)
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: natto
3
3
  version: !ruby/object:Gem::Version
4
- hash: 13
4
+ hash: 11
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 4
9
- - 1
10
- version: 0.4.1
8
+ - 5
9
+ - 0
10
+ version: 0.5.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brooke M. Fujita
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-01-27 00:00:00 +09:00
18
+ date: 2011-02-26 00:00:00 +09:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -51,6 +51,7 @@ files:
51
51
  - test/natto/tc_binding.rb
52
52
  - test/natto/tc_dictionaryinfo.rb
53
53
  - test/natto/tc_mecab.rb
54
+ - test/natto/tc_mecabnode.rb
54
55
  - README.md
55
56
  - LICENSE
56
57
  - CHANGELOG