natto 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,5 +1,10 @@
1
1
  ## CHANGELOG
2
2
 
3
+ - __2011/02/26__: 0.5.0 release.
4
+ - Added support for node parsing using blocks
5
+ - Added support for mecab options nbest, all-morphs
6
+ - Pulling support for mecab option partial, since it is more of a command-line feature
7
+
3
8
  - __2011/01/27__: 0.4.1 release.
4
9
  - Tweaking the description in natto.gemspec a bit
5
10
 
data/README.md CHANGED
@@ -4,7 +4,7 @@ A Tasty Ruby Binding with MeCab
4
4
  ## What is natto?
5
5
  natto combines the [Ruby programming language](http://www.ruby-lang.org/) with [MeCab](http://mecab.sourceforge.net/), the part-of-speech and morphological analyzer for the Japanese language.
6
6
 
7
- natto is a gem bridging Ruby and MeCab using FFI (foreign function interface). No compilation is necessary, and natto will run on CRuby (mri/yarv) and JRuby (jvm) equally well, on any OS.
7
+ natto is a gem bridging Ruby and MeCab using FFI (foreign function interface). No compilation is necessary, as natto is _not_ a C extension. natto will run on CRuby (mri/yarv) and JRuby (jvm) equally well. natto will also run on Windows, Unix/Linux, and Mac.
8
8
 
9
9
  You can learn more about [natto at Google Code Projects](http://code.google.com/p/natto/).
10
10
 
@@ -31,29 +31,36 @@ e.g., on Windows
31
31
  set MECAB_PATH=C:\Program Files\MeCab\bin\libmecab.dll
32
32
  e.g., for Cygwin
33
33
  export MECAB_PATH=cygmecab-1
34
+ e.g., from within a Ruby program
35
+ ENV['MECAB_PATH']='/usr/local/lib/libmecab.so'
34
36
 
35
37
  ## Usage
36
38
  require 'rubygems' if RUBY_VERSION.to_f < 1.9
37
39
  require 'natto'
38
40
 
39
- mecab = Natto::MeCab.new
40
- => #<Natto::MeCab:0x289b88e0 @ptr=#<FFI::Pointer address=0x288865c8>, \
41
- @options={}, \
42
- @version="0.98", \
43
- @dicts=[/usr/local/lib/mecab/dic/ipadic/sys.dic]>
41
+ nm = Natto::MeCab.new
42
+ => #<Natto::MeCab:0x28d30748
43
+ @ptr=#<FFI::Pointer address=0x28a97d50>, \
44
+ @options={}, \
45
+ @dicts=[#<Natto::DictionaryInfo:0x28d3061c
46
+ filename="/usr/local/lib/mecab/dic/ipadic/sys.dic",
47
+ charset="utf8">],
48
+ @version="0.98">
44
49
 
45
- puts mecab.version
46
- => 0.98
50
+ puts nm.version
51
+ => "0.98"
47
52
 
48
- sysdic = mecab.dicts.first
53
+ sysdic = nm.dicts.first
49
54
 
50
55
  puts sysdic.filename
51
- => /usr/local/lib/mecab/dic/ipadic/sys.dic
56
+ => "/usr/local/lib/mecab/dic/ipadic/sys.dic"
52
57
 
53
58
  puts sysdic.charset
54
- => utf8
59
+ => "utf8"
55
60
 
56
- puts mecab.parse('暑い日にはもってこいの一品ですね。')
61
+ nm.parse('暑い日にはもってこいの一品ですね。') do |n|
62
+ puts "#{n.surface}\t#{n.feature}"
63
+ end
57
64
  暑い 形容詞,自立,*,*,形容詞・アウオ段,基本形,暑い,アツイ,アツイ
58
65
  日 名詞,非自立,副詞可能,*,*,*,日,ヒ,ヒ
59
66
  に 助詞,格助詞,一般,*,*,*,に,ニ,ニ
@@ -64,7 +71,7 @@ e.g., for Cygwin
64
71
  です 助動詞,*,*,*,特殊・デス,基本形,です,デス,デス
65
72
  ね 助詞,終助詞,*,*,*,*,ね,ネ,ネ
66
73
  。 終助詞記号,句点,*,*,*,*,。,。,。
67
- EOS
74
+ BOS/EOS,*,*,*,*,*,*,*,*
68
75
  => nil
69
76
 
70
77
  ## Contributing to natto
data/lib/natto/binding.rb CHANGED
@@ -13,7 +13,7 @@ module Natto
13
13
  # String name for the environment variable used by
14
14
  # <tt>Natto</tt> to indicate the exact name / full path
15
15
  # to the <tt>mecab</tt> library.
16
- MECAB_PATH = 'MECAB_PATH'
16
+ MECAB_PATH = 'MECAB_PATH'.freeze
17
17
 
18
18
  # @private
19
19
  def self.included(base)
@@ -31,11 +31,13 @@ module Natto
31
31
  # @raise [LoadError] if MECAB_PATH environment variable is not set in Windows
32
32
  # <br/>
33
33
  # e.g., for bash on UNIX/Linux
34
- # export MECAB_PATH=mecab.so
34
+ # export MECAB_PATH=/usr/local/lib/libmecab.so
35
35
  # e.g., on Windows
36
36
  # set MECAB_PATH=C:\Program Files\MeCab\bin\libmecab.dll
37
37
  # e.g., for Cygwin
38
38
  # export MECAB_PATH=cygmecab-1
39
+ # e.g., from within a Ruby program
40
+ # ENV['MECAB_PATH']='/usr/local/lib/libmecab.so'
39
41
  def self.find_library
40
42
  host_os = RbConfig::CONFIG['host_os']
41
43
 
@@ -50,33 +52,72 @@ module Natto
50
52
 
51
53
  ffi_lib(ENV[MECAB_PATH] || find_library)
52
54
 
53
- attach_function :mecab_version, [], :string
54
55
  attach_function :mecab_new2, [:string], :pointer
56
+ attach_function :mecab_version, [], :string
57
+ attach_function :mecab_strerror, [:pointer],:string
55
58
  attach_function :mecab_destroy, [:pointer], :void
59
+
60
+ attach_function :mecab_set_theta, [:pointer, :float], :void
61
+ attach_function :mecab_set_lattice_level, [:pointer, :int], :void
62
+ attach_function :mecab_set_all_morphs, [:pointer, :int], :void
63
+
56
64
  attach_function :mecab_sparse_tostr, [:pointer, :string], :string
57
- attach_function :mecab_strerror, [:pointer],:string
65
+ attach_function :mecab_sparse_tonode, [:pointer, :string], :pointer
66
+
67
+ attach_function :mecab_nbest_init, [:pointer, :string], :int
68
+ attach_function :mecab_nbest_sparse_tostr, [:pointer, :int, :string], :string
69
+ attach_function :mecab_nbest_next_tonode, [:pointer], :pointer
70
+
58
71
  attach_function :mecab_dictionary_info, [:pointer], :pointer
59
72
 
60
73
  # @private
61
74
  module ClassMethods
75
+ def mecab_new2(options_str)
76
+ Natto::Binding.mecab_new2(options_str)
77
+ end
78
+
62
79
  def mecab_version
63
80
  Natto::Binding.mecab_version
64
81
  end
65
82
 
66
- def mecab_new2(options_str)
67
- Natto::Binding.mecab_new2(options_str)
83
+ def mecab_strerror(ptr)
84
+ Natto::Binding.mecab_strerror(ptr)
68
85
  end
69
86
 
70
87
  def mecab_destroy(ptr)
71
88
  Natto::Binding.mecab_destroy(ptr)
72
89
  end
73
90
 
91
+ def mecab_set_theta(ptr, t)
92
+ Natto::Binding.mecab_set_theta(ptr, t)
93
+ end
94
+
95
+ def mecab_set_lattice_level(ptr, ll)
96
+ Natto::Binding.mecab_set_lattice_level(ptr, ll)
97
+ end
98
+
99
+ def mecab_set_all_morphs(ptr, am)
100
+ Natto::Binding.mecab_set_all_morphs(ptr, am)
101
+ end
102
+
74
103
  def mecab_sparse_tostr(ptr, str)
75
104
  Natto::Binding.mecab_sparse_tostr(ptr, str)
76
105
  end
106
+
107
+ def mecab_sparse_tonode(ptr, str)
108
+ Natto::Binding.mecab_sparse_tonode(ptr, str)
109
+ end
77
110
 
78
- def mecab_strerror(ptr)
79
- Natto::Binding.mecab_strerror(ptr)
111
+ def mecab_nbest_next_tonode(ptr)
112
+ Natto::Binding.mecab_nbest_next_tonode(ptr)
113
+ end
114
+
115
+ def mecab_nbest_init(ptr, str)
116
+ Natto::Binding.mecab_nbest_init(ptr, str)
117
+ end
118
+
119
+ def mecab_nbest_sparse_tostr(ptr, n, str)
120
+ Natto::Binding.mecab_nbest_sparse_tostr(ptr, n, str)
80
121
  end
81
122
 
82
123
  def mecab_dictionary_info(ptr)
data/lib/natto/version.rb CHANGED
@@ -6,8 +6,15 @@
6
6
  # <tt>Natto::MeCab</tt> is a wrapper class for the <tt>mecab</tt>
7
7
  # parser.
8
8
  #
9
- # <tt>Natto::DictionaryInfo</tt> is a wrapper for a <tt>Natto::MeCab</tt>
10
- # instance's related dictionary information.
9
+ # <tt>Natto::MeCabStruct</tt> is a base class for a <tt>mecab</tt>
10
+ # struct.
11
+ #
12
+ # <tt>Natto::MeCabNode</tt> is a wrapper for the struct representing
13
+ # a <tt>mecab</tt>-parsed node.
14
+ #
15
+ # <tt>Natto::DictionaryInfo</tt> is a wrapper for the struct
16
+ # representing a <tt>Natto::MeCab</tt> instance's related
17
+ # dictionary information.
11
18
  #
12
19
  # <tt>Natto::MeCabError</tt> is a general error class for the
13
20
  # <tt>Natto</tt> module.
@@ -16,5 +23,5 @@
16
23
  # which are made available via <tt>FFI</tt> bindings to <tt>mecab</tt>.
17
24
  module Natto
18
25
  # Version string for this Rubygem.
19
- VERSION = "0.4.1"
26
+ VERSION = "0.5.0"
20
27
  end
data/lib/natto.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+
2
3
  require 'rubygems' if RUBY_VERSION.to_f < 1.9
3
4
  require 'natto/binding'
4
5
 
@@ -14,23 +15,25 @@ module Natto
14
15
  # require 'rubygems' if RUBY_VERSION.to_f < 1.9
15
16
  # require 'natto'
16
17
  #
17
- # mecab = Natto::MeCab.new(:output_format_type=>'wakati')
18
- # => #<Natto::MeCab:0x28dd471c @ptr=#<FFI::Pointer address=0x28a027d8>, \
19
- # @options={:output_format_type=>"wakati"}, \
20
- # @version="0.98", \
21
- # @dicts=[/usr/local/lib/mecab/dic/ipadic/sys.dic]>
22
- #
23
- # output = mecab.parse('ネバネバの組み合わせ美味しいです。').split
18
+ # nm = Natto::MeCab.new(:output_format_type=>'chasen2')
19
+ # => #<Natto::MeCab:0x28d3bdc8 \
20
+ # @ptr=#<FFI::Pointer address=0x28afb980>, \
21
+ # @options={:output_format_type=>"chasen2"}, \
22
+ # @dicts=[#<Natto::DictionaryInfo:0x289a1f14 \
23
+ # filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
24
+ # charset="utf8">], \
25
+ # @version="0.98">
24
26
  #
25
- # output.each do |token|
26
- # puts token
27
+ # nm.parse('ネバネバの組み合わせ美味しいです。') do |n|
28
+ # puts "#{n.surface}\t#{n.feature}"
27
29
  # end
28
- # => ネバネバ
29
- #
30
- # 組み合わせ
31
- # 美味しい
32
- # です
33
- #
30
+ #
31
+ # ネバネバ 名詞,サ変接続,*,*,*,*,ネバネバ,ネバネバ,ネバネバ
32
+ # の 助詞,連体化,*,*,*,*,の,ノ,ノ
33
+ # 組み合わせ 名詞,一般,*,*,*,*,組み合わせ,クミアワセ,クミアワセ
34
+ # 美味しい 形容詞,自立,*,*,形容詞・イ段,基本形,美味しい,オイシイ,オイシイ
35
+ # です 助動詞,*,*,*,特殊・デス,基本形,です,デス,デス
36
+ # 。 記号,句点,*,*,*,*,。,。,。
34
37
  #
35
38
  class MeCab
36
39
  include Natto::Binding
@@ -39,11 +42,11 @@ module Natto
39
42
 
40
43
  # Supported options to the <tt>mecab</tt> parser.
41
44
  # See the <tt>mecab</tt> help for more details.
42
- SUPPORTED_OPTS = [ :rcfile, :dicdir, :userdic, :lattice_level, :all_morphs,
43
- :output_format_type, :partial, :node_format, :unk_format,
44
- :bos_format, :eos_format, :eon_format, :unk_feature,
45
- :input_buffer_size, :allocate_sentence, :nbest, :theta,
46
- :cost_factor ].freeze
45
+ SUPPORTED_OPTS = [ :rcfile, :dicdir, :userdic, :lattice_level, :all_morphs,
46
+ :output_format_type, :node_format, :unk_format,
47
+ :bos_format, :eos_format, :eon_format, :unk_feature,
48
+ :input_buffer_size, :allocate_sentence, :nbest, :theta,
49
+ :cost_factor, :output ].freeze
47
50
 
48
51
  # Initializes the wrapped <tt>mecab</tt> instance with the
49
52
  # given <tt>options</tt> hash.
@@ -56,7 +59,6 @@ module Natto
56
59
  # - :lattice_level -- lattice information level (integer, default 0)
57
60
  # - :all_morphs -- output all morphs (default false)
58
61
  # - :output_format_type -- output format type (wakati, chasen, yomi, etc.)
59
- # - :partial -- partial parsing mode
60
62
  # - :node_format -- user-defined node format
61
63
  # - :unk_format -- user-defined unknown node format
62
64
  # - :bos_format -- user-defined beginning-of-sentence format
@@ -65,20 +67,23 @@ module Natto
65
67
  # - :unk_feature -- feature for unknown word
66
68
  # - :input_buffer_size -- set input buffer size (default 8192)
67
69
  # - :allocate_sentence -- allocate new memory for input sentence
68
- # - :nbest -- output N best results (integer, default 1)
70
+ # - :nbest -- output N best results (integer, default 1), requires lattice level >= 1
69
71
  # - :theta -- temperature parameter theta (float, default 0.75)
70
72
  # - :cost_factor -- cost factor (integer, default 700)
73
+ # - :output -- set the output file name
71
74
  #
72
75
  # <i>Use single-quotes to preserve format options that contain escape chars.</i><br/>
73
76
  # e.g.<br/>
74
77
  #
75
- # mecab = Natto::MeCab.new(:node_format=>'%m\t%f[7]\n')
76
- # => #<Natto::MeCab:0x289b88e0 @ptr=#<FFI::Pointer address=0x288865c8>, \
77
- # @options={:node_format=>"%m\\t%f[7]\\n"}, \
78
- # @version="0.98", \
79
- # @dicts=[/usr/local/lib/mecab/dic/ipadic/sys.dic]>
78
+ # nm = Natto::MeCab.new(:node_format=>'%m\t%f[7]\n')
79
+ # => #<Natto::MeCab:0x28d2ae10 @ptr=#<FFI::Pointer address=0x28a97980>, \
80
+ # @options={:node_format=>"%m\\t%f[7]\\n"}, \
81
+ # @dicts=[#<Natto::DictionaryInfo:0x28d2a85c \
82
+ # filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
83
+ # charset="utf8">], \
84
+ # @version="0.98">
80
85
  #
81
- # puts mecab.parse('簡単で美味しくて良いですよね。')
86
+ # puts nm.parse('簡単で美味しくて良いですよね。')
82
87
  # 簡単 カンタン
83
88
  # で デ
84
89
  # 美味しくて オイシクテ
@@ -101,23 +106,92 @@ module Natto
101
106
  @ptr = self.mecab_new2(opt_str)
102
107
  raise MeCabError.new("Could not initialize MeCab with options: '#{opt_str}'") if @ptr.address == 0x0
103
108
 
109
+ # set mecab parsing options
110
+ self.mecab_set_theta(@ptr, @options[:theta].to_f) if @options[:theta]
111
+ self.mecab_set_lattice_level(@ptr, @options[:lattice_level].to_i) if @options[:lattice_level]
112
+ self.mecab_set_all_morphs(@ptr, 1) if @options[:all_morphs]
113
+
114
+ # set mecab parsing implementations
115
+ if @options[:nbest] && @options[:nbest] > 1
116
+ # N-Best parsing implementations
117
+ self.mecab_nbest_init(@ptr, str)
118
+ # nbest parsing requires lattice level >= 1
119
+ self.mecab_set_lattice_level(@ptr, (@options[:lattice_level] || 1))
120
+ @parse_tostr = lambda { |str|
121
+ return self.mecab_nbest_sparse_tostr(@ptr, @options[:nbest], str) ||
122
+ raise(MeCabError.new(self.mecab_strerror(@ptr)))
123
+ }
124
+ @parse_tonode = lambda { |str| return self.mecab_nbest_next_tonode(@ptr) }
125
+ else
126
+ # default parsing implementations
127
+ @parse_tostr = lambda { |str|
128
+ return self.mecab_sparse_tostr(@ptr, str) || raise(MeCabError.new(self.mecab_strerror(@ptr)))
129
+ }
130
+ @parse_tonode = lambda { |str| return self.mecab_sparse_tonode(@ptr, str) }
131
+ end
132
+
133
+ # set ref to dictionaries
104
134
  @dicts << Natto::DictionaryInfo.new(Natto::Binding.mecab_dictionary_info(@ptr))
105
135
  while @dicts.last.next.address != 0x0
106
136
  @dicts << Natto::DictionaryInfo.new(@dicts.last.next)
107
137
  end
108
138
 
139
+ # set ref to mecab version string
109
140
  @version = self.mecab_version
110
141
 
142
+ # set Proc for freeing mecab pointer
111
143
  ObjectSpace.define_finalizer(self, self.class.create_free_proc(@ptr))
112
144
  end
113
-
114
- # Parses the given string <tt>str</tt>.
145
+
146
+ # Parses the given string <tt>str</tt>. If a block is passed to this method,
147
+ # then node parsing will be used and each node yielded to the given block.
115
148
  #
116
149
  # @param [String] str
117
150
  # @return parsing result from <tt>mecab</tt>
118
151
  # @raise [MeCabError] if the <tt>mecab</tt> parser cannot parse the given string <tt>str</tt>
152
+ # @see MeCabNode
119
153
  def parse(str)
120
- self.mecab_sparse_tostr(@ptr, str) || raise(MeCabError.new(self.mecab_strerror(@ptr)))
154
+ if block_given?
155
+ m_node_ptr = @parse_tonode.call(str)
156
+ head = Natto::MeCabNode.new(m_node_ptr)
157
+ if head && head[:next].address != 0x0
158
+ node = Natto::MeCabNode.new(head[:next])
159
+ while (node.nil? == false)
160
+ yield node
161
+ if node[:next].address != 0x0
162
+ node = Natto::MeCabNode.new(node[:next])
163
+ else
164
+ break
165
+ end
166
+ end
167
+ end
168
+ else
169
+ result = @parse_tostr.call(str)
170
+ result.force_encoding(Encoding.default_external) if result.respond_to?(:encoding) && result.encoding!=Encoding.default_external
171
+ result
172
+ end
173
+ end
174
+
175
+ # Returns human-readable details for the wrapped <tt>mecab</tt> parser.
176
+ # Overrides <tt>Object#to_s</tt>.
177
+ #
178
+ # - encoded object id
179
+ # - FFI pointer to <tt>mecab</tt> object
180
+ # - options hash
181
+ # - list of dictionaries
182
+ # - MeCab version
183
+ #
184
+ # @return [String] encoded object id, FFI pointer, options hash, list of dictionaries, and MeCab version
185
+ def to_s
186
+ %(#{super.chop} @ptr=#{@ptr.to_s}, @options=#{@options.to_s}, @dicts=#{@dicts.to_s}, @version="#{@version.to_s}">)
187
+ end
188
+
189
+ # Overrides <tt>Object#inspect</tt>.
190
+ #
191
+ # @return [String] encoded object id, FFI pointer, options hash, list of dictionaries, and MeCab version
192
+ # @see #to_s
193
+ def inspect
194
+ self.to_s
121
195
  end
122
196
 
123
197
  # Returns a <tt>Proc</tt> that will properly free resources
@@ -138,14 +212,14 @@ module Natto
138
212
  # be passed in the construction of <tt>mecab</tt>.
139
213
  #
140
214
  # @param [Hash] options
141
- # @return string-representation of the options to the <tt>mecab</tt> parser
215
+ # @return [String] representation of the options to the <tt>mecab</tt> parser
142
216
  def self.build_options_str(options={})
143
217
  opt = []
144
218
  SUPPORTED_OPTS.each do |k|
145
219
  if options.has_key? k
146
220
  key = k.to_s.gsub('_', '-')
147
- # all-morphs, partial, and allocate-sentence are just flags
148
- if %w( all-morphs partial allocate-sentence ).include? key
221
+ # all-morphs and allocate-sentence are just flags
222
+ if %w( all-morphs allocate-sentence ).include? key
149
223
  opt << "--#{key}" if options[k]==true
150
224
  else
151
225
  opt << "--#{key}=#{options[k]}"
@@ -160,6 +234,21 @@ module Natto
160
234
  # for the <tt>Natto</tt> module.
161
235
  class MeCabError < RuntimeError; end
162
236
 
237
+ # <tt>MeCabStruct</tt> is a general base class
238
+ # for <tt>FFI::Struct</tt> objects in the <tt>Natto</tt> module.
239
+ class MeCabStruct < FFI::Struct
240
+ # Provides accessor methods for the members of the <tt>mecab</tt> struct.
241
+ #
242
+ # @param [String] attr_name
243
+ # @return member values for the <tt>mecab</tt> struct
244
+ # @raise [NoMethodError] if <tt>attr_name</tt> is not a member of this <tt>mecab</tt> struct
245
+ def method_missing(attr_name)
246
+ member_sym = attr_name.id2name.to_sym
247
+ return self[member_sym] if self.members.include?(member_sym)
248
+ raise(NoMethodError.new("undefined method '#{attr_name}' for #{self}"))
249
+ end
250
+ end
251
+
163
252
  # <tt>DictionaryInfo</tt> is a wrapper for the structure holding
164
253
  # the <tt>MeCab</tt> instance's related dictionary information.
165
254
  #
@@ -180,27 +269,16 @@ module Natto
180
269
  # <tt>mecab</tt> dictionary attributes can be obtained by
181
270
  # using their corresponding accessor.
182
271
  #
183
- # mecab = Natto::MeCab.new
272
+ # nm = Natto::MeCab.new
184
273
  #
185
- # sysdic = m.dicts.first
274
+ # sysdic = nm.dicts.first
186
275
  #
187
276
  # puts sysdic.filename
188
- # => /usr/local/lib/mecab/dic/ipadic/sys.dic
277
+ # => "/usr/local/lib/mecab/dic/ipadic/sys.dic"
189
278
  #
190
279
  # puts sysdic.charset
191
- # => utf8
192
- #
193
- # It is also possible to use the <tt>Symbol</tt> for the
194
- # <tt>mecab</tt> dictionary member to index into the
195
- # <tt>FFI::Struct</tt> layout associative array like so:
196
- #
197
- # puts sysdic[:filename]
198
- # => /usr/local/lib/mecab/dic/ipadic/sys.dic
199
- #
200
- # puts sysdic[:charset]
201
- # => utf8
202
- #
203
- class DictionaryInfo < FFI::Struct
280
+ # => "utf8"
281
+ class DictionaryInfo < MeCabStruct
204
282
 
205
283
  layout :filename, :string,
206
284
  :charset, :string,
@@ -211,7 +289,6 @@ module Natto
211
289
  :version, :ushort,
212
290
  :next, :pointer
213
291
 
214
- # Hack to avoid that deprecation message Object#type thrown in Ruby 1.8.7.
215
292
  if RUBY_VERSION.to_f < 1.9
216
293
  alias_method :deprecated_type, :type
217
294
  # <tt>Object#type</tt> override defined when <tt>RUBY_VERSION</tt> is
@@ -226,22 +303,199 @@ module Natto
226
303
  end
227
304
  end
228
305
 
229
- # Provides accessor methods for the members of the <tt>DictionaryInfo</tt> structure.
306
+ # Returns human-readable details for this <tt>mecab</tt> dictionary.
307
+ # Overrides <tt>Object#to_s</tt>.
230
308
  #
231
- # @param [String] attr_name
232
- # @return member values for the <tt>mecab</tt> dictionary
233
- # @raise [NoMethodError] if <tt>attr_name</tt> is not a member of this <tt>mecab</tt> dictionary <tt>FFI::Struct</tt>
234
- def method_missing(attr_name)
235
- member_sym = attr_name.id2name.to_sym
236
- return self[member_sym] if self.members.include?(member_sym)
237
- raise(NoMethodError.new("undefined method '#{attr_name}' for #{self}"))
309
+ # - encoded object id
310
+ # - full-path dictionary filename
311
+ # - dictionary charset
312
+ #
313
+ # @return [String] encoded object id, dictionary filename, and charset
314
+ def to_s
315
+ %(#{super.chop} filename="#{self.filename}", charset="#{self.charset}">)
316
+ end
317
+
318
+ # Overrides <tt>Object#inspect</tt>.
319
+ #
320
+ # @return [String] encoded object id, dictionary filename, and charset
321
+ # @see #to_s
322
+ def inspect
323
+ self.to_s
238
324
  end
325
+ end
326
+
327
+ # <tt>MeCabNode</tt> is a wrapper for the structure holding
328
+ # the parsed <tt>node</tt>.
329
+ #
330
+ # Values for the <tt>mecab</tt> node attributes may be
331
+ # obtained by using the following <tt>Symbol</tt>s as keys
332
+ # to the layout associative array of <tt>FFI::Struct</tt> members.
333
+ #
334
+ # - :prev
335
+ # - :next
336
+ # - :enext
337
+ # - :bnext
338
+ # - :rpath
339
+ # - :lpath
340
+ # - :begin_node_list
341
+ # - :end_node_list
342
+ # - :surface
343
+ # - :feature
344
+ # - :id
345
+ # - :length
346
+ # - :rlength
347
+ # - :rcAttr
348
+ # - :lcAttr
349
+ # - :posid
350
+ # - :char_type
351
+ # - :stat
352
+ # - :isbest
353
+ # - :sentence_length
354
+ # - :alpha
355
+ # - :beta
356
+ # - :beta
357
+ # - :prob
358
+ # - :wcost
359
+ # - :cost
360
+ # - :token
361
+ #
362
+ # <h2>Usage</h2>
363
+ # An instance of <tt>MeCabNode</tt> is yielded to a block
364
+ # used with <tt>MeCab#parse</tt>. Each resulting node is
365
+ # yielded to the block passed in, where the above-mentioned
366
+ # node attributes may be accessed.
367
+ #
368
+ # nm = Natto::MeCab.new
369
+ #
370
+ # nm.parse('めかぶの使い方がわからなくて困ってました。') do |n|
371
+ # puts "#{n.surface}\t#{n.cost}"
372
+ # end
373
+ #
374
+ # め 7961
375
+ # かぶ 19303
376
+ # の 25995
377
+ # 使い方 29182
378
+ # が 28327
379
+ # わから 33625
380
+ # なく 34256
381
+ # て 36454
382
+ # 困っ 43797
383
+ # て 42178
384
+ # まし 46708
385
+ # た 46111
386
+ # 。 42677
387
+ # 41141
388
+ # => nil
389
+ #
390
+ # It is also possible to use the <tt>Symbol</tt> for the
391
+ # <tt>mecab</tt> node member to index into the
392
+ # <tt>FFI::Struct</tt> layout associative array like so:
393
+ #
394
+ # nm.parse('納豆に乗っけて頂きます!') {|n| puts n[:feature] }
395
+ #
396
+ # 名詞,一般,*,*,*,*,納豆,ナットウ,ナットー
397
+ # 助詞,格助詞,一般,*,*,*,に,ニ,ニ
398
+ # 動詞,自立,*,*,一段,連用形,乗っける,ノッケ,ノッケ
399
+ # 助詞,接続助詞,*,*,*,*,て,テ,テ
400
+ # 動詞,非自立,*,*,五段・カ行イ音便,連用形,頂く,イタダキ,イタダキ
401
+ # 助動詞,*,*,*,特殊・マス,基本形,ます,マス,マス
402
+ # 記号,一般,*,*,*,*,!,!,!
403
+ # BOS/EOS,*,*,*,*,*,*,*,*
404
+ # => nil
405
+ #
406
+ class MeCabNode < MeCabStruct
407
+
408
+ # Normal <tt>mecab</tt> node.
409
+ NOR_NODE = 0
410
+ # Unknown <tt>mecab</tt> node.
411
+ UNK_NODE = 1
412
+ # Beginning-of-string <tt>mecab</tt> node.
413
+ BOS_NODE = 2
414
+ # End-of-string <tt>mecab</tt> node.
415
+ EOS_NODE = 3
416
+ # End-of-NBest <tt>mecab</tt> node list.
417
+ EON_NODE = 4
239
418
 
240
- # Returns the full-path file name for this dictionary. Overrides <tt>Object#to_s</tt>.
419
+ layout :prev, :pointer,
420
+ :next, :pointer,
421
+ :enext, :pointer,
422
+ :bnext, :pointer,
423
+ :rpath, :pointer,
424
+ :lpath, :pointer,
425
+ :begin_node_list, :pointer,
426
+ :end_node_list, :pointer,
427
+ :surface, :string,
428
+ :feature, :string,
429
+ :id, :uint,
430
+ :length, :ushort,
431
+ :rlength, :ushort,
432
+ :rcAttr, :ushort,
433
+ :lcAttr, :ushort,
434
+ :posid, :ushort,
435
+ :char_type, :uchar,
436
+ :stat, :uchar,
437
+ :isbest, :uchar,
438
+ :sentence_length, :uint,
439
+ :alpha, :float,
440
+ :beta, :float,
441
+ :prob, :float,
442
+ :wcost, :short,
443
+ :cost, :long,
444
+ :token, :pointer
445
+
446
+ if RUBY_VERSION.to_f < 1.9
447
+ alias_method :deprecated_id, :id
448
+ # <tt>Object#id</tt> override defined when <tt>RUBY_VERSION</tt> is
449
+ # older than 1.9. This is a hack to avoid the <tt>Object#id</tt>
450
+ # deprecation warning thrown up in Ruby 1.8.7.
451
+ #
452
+ # <i>This method override is not defined when the Ruby interpreter
453
+ # is 1.9 or greater.</i>
454
+ # @return [Fixnum] <tt>mecab</tt> node id
455
+ def id
456
+ self[:id]
457
+ end
458
+ end
459
+
460
+ # Returns the <tt>surface</tt> value for this node.
461
+ #
462
+ # @return [String] <tt>mecab</tt> node surface value
463
+ def surface
464
+ if self[:surface] && self[:length] > 0
465
+ @surface ||= self[:surface].bytes.to_a()[0,self[:length]].pack('C*')
466
+ @surface.force_encoding(Encoding.default_external) if @surface.respond_to?(:encoding) && @surface.encoding!=Encoding.default_external
467
+ end
468
+ @surface
469
+ end
470
+
471
+ # Returns the <tt>feature</tt> value for this node.
472
+ #
473
+ # @return [String] <tt>mecab</tt> node feature value
474
+ def feature
475
+ @feature ||= self[:feature]
476
+ @feature.force_encoding(Encoding.default_external) if @feature.respond_to?(:encoding) && @feature.encoding!=Encoding.default_external
477
+ @feature
478
+ end
479
+
480
+ # Returns human-readable details for the <tt>mecab</tt> node.
481
+ # Overrides <tt>Object#to_s</tt>.
482
+ #
483
+ # - encoded object id
484
+ # - stat
485
+ # - surface
486
+ # - feature
241
487
  #
242
- # @return [String] full-path filename for this dictionary
488
+ # @return [String] encoded object id, stat, surface, and feature
243
489
  def to_s
244
- self[:filename]
490
+ %(#{super.chop} stat=#{self[:stat]}, surface="#{self.surface}", feature="#{self.feature}">)
491
+ end
492
+
493
+ # Overrides <tt>Object#inspect</tt>.
494
+ #
495
+ # @return [String] encoded object id, stat, surface, and feature
496
+ # @see #to_s
497
+ def inspect
498
+ self.to_s
245
499
  end
246
500
  end
247
501
  end
@@ -21,11 +21,18 @@ class TestNattoBinding < Test::Unit::TestCase
21
21
  # Tests for the inclusion of mecab methods made available
22
22
  # to any classes including the Natto::Binding module.
23
23
  def test_functions_included
24
- [ :mecab_version,
25
- :mecab_new2,
24
+ [ :mecab_new2,
25
+ :mecab_version,
26
+ :mecab_strerror,
26
27
  :mecab_destroy,
28
+ :mecab_set_theta,
29
+ :mecab_set_lattice_level,
30
+ :mecab_set_all_morphs,
27
31
  :mecab_sparse_tostr,
28
- :mecab_strerror,
32
+ :mecab_nbest_sparse_tostr,
33
+ :mecab_nbest_init,
34
+ :mecab_nbest_sparse_tostr,
35
+ :mecab_nbest_next_tonode,
29
36
  :mecab_dictionary_info ].each do |f|
30
37
  assert(@klass.respond_to? f)
31
38
  end
@@ -4,11 +4,19 @@
4
4
  # behavior of Natto::DictionaryInfo
5
5
  class TestDictionaryInfo < Test::Unit::TestCase
6
6
  def setup
7
- @m = Natto::MeCab.new
7
+ m = Natto::MeCab.new
8
+ @dicts = m.dicts
9
+
10
+ out = `mecab -D`.lines.to_a
11
+ out.each do |l|
12
+ tokens = l.split("\t")
13
+ @sysdic_filename = tokens[1].strip if tokens[0] =~ /filename:/i
14
+ @sysdic_charset = tokens[1].strip if tokens[0] =~ /charset:/i
15
+ end
8
16
  end
9
17
 
10
18
  def teardown
11
- @m = nil
19
+ @dicts = nil
12
20
  end
13
21
 
14
22
  # Tests the dictionaries accessor method of Natto::MeCab.
@@ -17,28 +25,36 @@ class TestDictionaryInfo < Test::Unit::TestCase
17
25
  # b) system dictionary encoding is utf-8
18
26
  # c) only dealing w/ case of 1 dictionary being used
19
27
  def test_dictionaries_accessor
20
- dicts = @m.dicts
21
- assert dicts.empty? == false
22
- sysdic = dicts.first
23
- assert_equal('/usr/local/lib/mecab/dic/ipadic/sys.dic', sysdic[:filename])
24
- assert_equal('utf8', sysdic[:charset])
28
+ assert @dicts.empty? == false
29
+ sysdic = @dicts.first
30
+ assert_equal(@sysdic_filename, sysdic[:filename])
31
+ assert_equal(@sysdic_charset, sysdic[:charset])
25
32
  assert_equal(0x0, sysdic[:next].address)
26
- #assert_nil(sysdic.next)
27
33
  end
28
34
 
29
35
  # Tests the to_s method.
30
36
  def test_to_s
31
- assert_equal('/usr/local/lib/mecab/dic/ipadic/sys.dic', @m.dicts.first.to_s)
37
+ #<Natto::DictionaryInfo:0x288879bc filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", charset="utf8">
38
+ assert(@dicts.first.to_s.include?("filename=\"#{@sysdic_filename}\", charset=\"#{@sysdic_charset}\""))
32
39
  end
33
40
 
34
41
  # Tests the accessors of Natto::DictionaryInfo.
35
42
  # Note: Object#type is deprecated in 1.9.n, but comes with a warning
36
43
  # in 1.8.n
37
44
  def test_dictionary_info_member_accessors
38
- sysdic = @m.dicts.first
39
- members = %w( filename charset type size lsize rsize version next )
45
+ sysdic = @dicts.first
46
+ members = [
47
+ :filename,
48
+ :charset,
49
+ :type,
50
+ :size,
51
+ :lsize,
52
+ :rsize,
53
+ :version,
54
+ :next
55
+ ]
40
56
  members.each do |nomme|
41
- assert_not_nil(sysdic.send nomme.to_sym )
57
+ assert_not_nil(sysdic.send nomme )
42
58
  end
43
59
 
44
60
  # NoMethodError will be raised for anything else!
@@ -1,8 +1,19 @@
1
1
  # coding: utf-8
2
+ require 'rbconfig'
3
+ require 'nkf'
2
4
 
3
5
  # TestMeCab encapsulates tests for the basic
4
6
  # behavior of Natto::MeCab.
5
7
  class TestMeCab < Test::Unit::TestCase
8
+
9
+ host_os = RbConfig::CONFIG['host_os']
10
+ # we need to transform from UTF-8 to SJIS if we are on Windows!
11
+ if host_os =~ /mswin|mingw/i
12
+ TEST_STR = NKF.nkf("-Ws", '試験ですよ、これが。')
13
+ else
14
+ TEST_STR = '試験ですよ、これが。'
15
+ end
16
+
6
17
  def setup
7
18
  @m = Natto::MeCab.new
8
19
  end
@@ -37,9 +48,6 @@ class TestMeCab < Test::Unit::TestCase
37
48
  res = Natto::MeCab.build_options_str(:output_format_type=>"natto")
38
49
  assert_equal('--output-format-type=natto', res)
39
50
 
40
- res = Natto::MeCab.build_options_str(:partial=>true)
41
- assert_equal('--partial', res)
42
-
43
51
  res = Natto::MeCab.build_options_str(:node_format=>'%m\t%f[7]\n')
44
52
  assert_equal('--node-format=%m\t%f[7]\n', res)
45
53
 
@@ -76,10 +84,8 @@ class TestMeCab < Test::Unit::TestCase
76
84
  res = Natto::MeCab.build_options_str(:output_format_type=>"natto",
77
85
  :userdic=>"/some/file",
78
86
  :dicdir=>"/some/other/file",
79
- :partial=>true,
80
87
  :all_morphs=>true)
81
- assert_equal('--dicdir=/some/other/file --userdic=/some/file --all-morphs --output-format-type=natto --partial', res)
82
-
88
+ assert_equal('--dicdir=/some/other/file --userdic=/some/file --all-morphs --output-format-type=natto', res)
83
89
  end
84
90
 
85
91
  # Tests the construction and initial state of a Natto::MeCab instance.
@@ -96,7 +102,13 @@ class TestMeCab < Test::Unit::TestCase
96
102
  end
97
103
  assert_equal(opts, m.options)
98
104
 
99
- opts = {:all_morphs=>true, :partial=>true, :allocate_sentence=>true}
105
+ opts = {:all_morphs=>true, :allocate_sentence=>true}
106
+ assert_nothing_raised do
107
+ m = Natto::MeCab.new(opts)
108
+ end
109
+ assert_equal(opts, m.options)
110
+
111
+ opts = {:lattice_level=>999}
100
112
  assert_nothing_raised do
101
113
  m = Natto::MeCab.new(opts)
102
114
  end
@@ -126,4 +138,41 @@ class TestMeCab < Test::Unit::TestCase
126
138
  def test_version_accessor
127
139
  assert_equal('0.98', @m.version)
128
140
  end
141
+
142
+ # Tests Natto::MeCab parsing using the --all-morphs option.
143
+ def test_all_morphs
144
+ m = Natto::MeCab.new(:all_morphs=>true)
145
+ expected = `echo #{TEST_STR} | mecab --all-morphs`.lines.to_a
146
+ expected.delete_if {|e| e =~ /^(EOS|BOS)/ }
147
+
148
+ actual = m.parse(TEST_STR).lines.to_a
149
+ actual.delete_if {|e| e =~ /^(EOS|BOS|\t)/ }
150
+
151
+ assert_equal(expected, actual)
152
+ end
153
+
154
+ # Tests Natto::MeCab parsing (default parse_tostr).
155
+ def test_parse_tostr_default
156
+ expected = `echo #{TEST_STR} | mecab`.lines.to_a
157
+ expected.delete_if {|e| e =~ /^(EOS|BOS|\t)/ }
158
+
159
+ actual = @m.parse(TEST_STR).lines.to_a
160
+ actual.delete_if {|e| e =~ /^(EOS|BOS|\t)/ }
161
+
162
+ assert_equal(expected, actual)
163
+ end
164
+
165
+ # Tests Natto::MeCab parsing (default parse_tonode).
166
+ def test_parse_tonode_default
167
+ expected = `echo #{TEST_STR} | mecab`.lines.to_a
168
+ expected.delete_if {|e| e =~ /^(EOS|BOS|\t)/ }
169
+
170
+ actual = []
171
+ @m.parse(TEST_STR) do |node|
172
+ actual << "#{node.surface}\t#{node.feature}\n"
173
+ end
174
+ actual.delete_if {|e| e =~ /^(EOS|BOS|\t)/ }
175
+
176
+ assert_equal(expected, actual)
177
+ end
129
178
  end
@@ -0,0 +1,106 @@
1
+ # coding: utf-8
2
+ require 'rbconfig'
3
+ require 'nkf'
4
+
5
+ # TestMeCabNode encapsulates tests for the basic
6
+ # behavior of Natto::MeCabNode
7
+ class TestMeCabNode < Test::Unit::TestCase
8
+
9
+ host_os = RbConfig::CONFIG['host_os']
10
+ # we need to transform from UTF-8 to SJIS if we are on Windows!
11
+ if host_os =~ /mswin|mingw/i
12
+ TEST_STR = NKF.nkf("-Ws", '試験ですよ、これが。')
13
+ else
14
+ TEST_STR = '試験ですよ、これが。'
15
+ end
16
+
17
+ def setup
18
+ nm = Natto::MeCab.new
19
+ @nodes = []
20
+ nm.parse(TEST_STR) { |n| @nodes << n }
21
+ end
22
+
23
+ def teardown
24
+ @nodes = nil
25
+ end
26
+
27
+ # Tests the surface and feature accessors methods.
28
+ def test_surface_and_feature_accessors
29
+ raw = `echo #{TEST_STR} | mecab`.lines.to_a
30
+ raw.delete_if {|e| e =~ /^(EOS|BOS|\t)/ }
31
+ expected = {}
32
+ raw.each do |l|
33
+ tokens = l.split("\t")
34
+ expected[tokens[0]]=tokens[1].strip
35
+ end
36
+
37
+ actual = {}
38
+ @nodes.each do |n|
39
+ actual[n.surface]=n.feature if (n.stat==Natto::MeCabNode::NOR_NODE ||
40
+ n.stat==Natto::MeCabNode::UNK_NODE)
41
+ end
42
+
43
+ assert_equal(expected, actual)
44
+ end
45
+
46
+ # Tests MeCabNode#surface to show that it is consistent
47
+ # no matter how many times it is invoked.
48
+ def test_manysurfaces
49
+ @nodes.each do |n|
50
+ expected = n.surface
51
+ 5.times { assert_equal(expected, n.surface) }
52
+ end
53
+ end
54
+
55
+ # Tests MeCabNode#feature to show that it is consistent
56
+ # no matter how many times it is invoked.
57
+ def test_manyfeature
58
+ @nodes.each do |n|
59
+ expected = n.feature
60
+ 5.times { assert_equal(expected, n.feature) }
61
+ end
62
+ end
63
+
64
+ # Tests that the accessors of Natto::MeCabNode exist.
65
+ # Note: Object#id is deprecated in 1.9.n, but comes with a warning
66
+ # in 1.8.n
67
+ def test_mecabnode_accessors
68
+ node = @nodes[0]
69
+ members = [
70
+ :prev,
71
+ :next,
72
+ :enext,
73
+ :bnext,
74
+ :rpath,
75
+ :lpath,
76
+ :begin_node_list,
77
+ :end_node_list,
78
+ :surface,
79
+ :feature,
80
+ :id,
81
+ :length,
82
+ :rlength,
83
+ :rcAttr,
84
+ :lcAttr,
85
+ :posid,
86
+ :char_type,
87
+ :stat,
88
+ :isbest,
89
+ :sentence_length,
90
+ :alpha,
91
+ :beta,
92
+ :prob,
93
+ :wcost,
94
+ :cost,
95
+ :token
96
+ ]
97
+ members.each do |nomme|
98
+ assert_not_nil(node.respond_to? nomme )
99
+ end
100
+
101
+ # NoMethodError will be raised for anything else!
102
+ assert_raise NoMethodError do
103
+ node.send :unknown_attr
104
+ end
105
+ end
106
+ end
data/test/test_natto.rb CHANGED
@@ -5,6 +5,7 @@ require 'test/unit'
5
5
  require 'natto'
6
6
 
7
7
  [ '/test/natto/tc_mecab.rb',
8
+ '/test/natto/tc_mecabnode.rb',
8
9
  '/test/natto/tc_dictionaryinfo.rb',
9
10
  '/test/natto/tc_binding.rb' ].each do |tc|
10
11
  require File.join(File.expand_path('.'), tc)
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: natto
3
3
  version: !ruby/object:Gem::Version
4
- hash: 13
4
+ hash: 11
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 4
9
- - 1
10
- version: 0.4.1
8
+ - 5
9
+ - 0
10
+ version: 0.5.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brooke M. Fujita
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-01-27 00:00:00 +09:00
18
+ date: 2011-02-26 00:00:00 +09:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -51,6 +51,7 @@ files:
51
51
  - test/natto/tc_binding.rb
52
52
  - test/natto/tc_dictionaryinfo.rb
53
53
  - test/natto/tc_mecab.rb
54
+ - test/natto/tc_mecabnode.rb
54
55
  - README.md
55
56
  - LICENSE
56
57
  - CHANGELOG