natto 0.9.6 → 0.9.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -11,18 +11,18 @@ module Natto
11
11
  #
12
12
  # ## Usage
13
13
  #
14
- # require 'rubygems' if RUBY_VERSION.to_f < 1.9
15
14
  # require 'natto'
16
15
  #
17
16
  # nm = Natto::MeCab.new('-Ochasen')
18
17
  # => #<Natto::MeCab:0x28d3bdc8 \
19
18
  # @tagger=#<FFI::Pointer address=0x28afb980>, \
19
+ # @libpath="/usr/local/lib/libmecab.so", \
20
20
  # @options={:output_format_type=>"chasen"}, \
21
21
  # @dicts=[#<Natto::DictionaryInfo:0x289a1f14 \
22
- # type="0", \
23
- # filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
24
- # charset="utf8">], \
25
- # @version="0.996">
22
+ # @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
23
+ # charset=utf8, \
24
+ # type=0>], \
25
+ # @version=0.996>
26
26
  #
27
27
  # nm.parse('凡人にしか見えねえ風景ってのがあるんだよ。') do |n|
28
28
  # puts "#{n.surface}\t#{n.feature}"
@@ -46,8 +46,17 @@ module Natto
46
46
  class MeCab
47
47
  include Natto::Binding
48
48
  include Natto::OptionParse
49
-
50
- attr_reader :tagger, :options, :dicts, :version
49
+
50
+ # @return [FFI::Pointer] pointer to MeCab tagger.
51
+ attr_reader :tagger
52
+ # @return [String] absolute filepath to MeCab library.
53
+ attr_reader :libpath
54
+ # @return [Hash] MeCab options as key-value pairs.
55
+ attr_reader :options
56
+ # @return [Array] listing of all of dictionaries referenced.
57
+ attr_reader :dicts
58
+ # @return [String] `MeCab` version.
59
+ attr_reader :version
51
60
 
52
61
  # Initializes the wrapped `mecab` instance with the
53
62
  # given `options`.
@@ -83,12 +92,13 @@ module Natto
83
92
  # nm = Natto::MeCab.new(:node_format=>'%m¥t%f[7]¥n')
84
93
  # => #<Natto::MeCab:0x28d2ae10
85
94
  # @tagger=#<FFI::Pointer address=0x28a97980>, \
95
+ # @libpath="/usr/local/lib/libmecab.so", \
86
96
  # @options={:node_format=>"%m¥t%f[7]¥n"}, \
87
97
  # @dicts=[#<Natto::DictionaryInfo:0x28d2a85c \
88
- # type="0", \
89
- # filename="/usr/local/lib/mecab/dic/ipadic/sys.dic" \
90
- # charset="utf8">], \
91
- # @version="0.996">
98
+ # @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
99
+ # charset=utf8, \
100
+ # type=0>], \
101
+ # @version=0.996>
92
102
  #
93
103
  # puts nm.parse('才能とは求める人間に与えられるものではない。')
94
104
  # 才能 サイノウ
@@ -106,14 +116,15 @@ module Natto
106
116
  # 。 。
107
117
  # EOS
108
118
  #
109
- # @param [Hash or String]
119
+ # @param [Hash, String] options MeCab options for tagger
110
120
  # @raise [MeCabError] if `mecab` cannot be initialized with the given `options`
111
121
  def initialize(options={})
112
122
  @options = self.class.parse_mecab_options(options)
113
123
  @dicts = []
114
124
 
115
- opt_str = self.class.build_options_str(@options)
116
- @tagger = self.mecab_new2(opt_str)
125
+ opt_str = self.class.build_options_str(@options)
126
+ @tagger = self.class.mecab_new2(opt_str)
127
+ @libpath = self.class.find_library
117
128
  raise MeCabError.new("Could not initialize MeCab with options: '#{opt_str}'") if @tagger.address == 0x0
118
129
 
119
130
  self.mecab_set_theta(@tagger, @options[:theta]) if @options[:theta]
@@ -123,69 +134,79 @@ module Natto
123
134
 
124
135
  # Set mecab parsing implementations for N-best and regular parsing,
125
136
  # for both parsing as string and yielding a node object
126
- # N-Best parsing implementations
127
137
  if @options[:nbest] && @options[:nbest] > 1
138
+ # N-Best parsing implementations
128
139
  self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
129
- @parse_tostr = lambda do |str|
130
- return self.mecab_nbest_sparse_tostr(@tagger, @options[:nbest], str) ||
140
+
141
+ @parse_tostr = lambda do |text|
142
+ retval = self.mecab_nbest_sparse_tostr(@tagger, @options[:nbest], text) ||
131
143
  raise(MeCabError.new(self.mecab_strerror(@tagger)))
144
+ retval.force_encoding(Encoding.default_external)
132
145
  end
133
- @parse_tonodes = lambda do |str|
134
- nodes = []
135
- if @options[:nbest] && @options[:nbest] > 1
136
- self.mecab_nbest_init(@tagger, str)
146
+
147
+ @parse_tonodes = lambda do |text|
148
+ Enumerator.new do |y|
149
+ self.mecab_nbest_init(@tagger, text)
137
150
  n = self.mecab_nbest_next_tonode(@tagger)
138
151
  raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
152
+
139
153
  nlen = @options[:nbest]
140
154
  nlen.times do |i|
141
- s = str.bytes.to_a
155
+ s = text.bytes.to_a
142
156
  while n && n.address != 0x0
143
157
  mn = Natto::MeCabNode.new(n)
144
- s = s.drop_while {|e| (e==0xa || e==0x20)}
145
- if !s.empty?
146
- sarr = []
147
- mn.length.times { sarr << s.shift }
148
- surf = sarr.pack('C*')
149
- #mn.surface = self.class.force_enc(surf)
150
- mn.surface = surf.force_encoding(Encoding.default_external)
151
- end
152
- if @options[:output_format_type] || @options[:node_format]
153
- mn.feature = self.mecab_format_node(@tagger, n).force_encoding(Encoding.default_external)
158
+ # ignore BOS nodes, since mecab does so
159
+ if !mn.is_bos?
160
+ s = s.drop_while {|e| (e==0xa || e==0x20)}
161
+ if !s.empty?
162
+ sarr = []
163
+ mn.length.times { sarr << s.shift }
164
+ surf = sarr.pack('C*')
165
+ mn.surface = surf.force_encoding(Encoding.default_external)
166
+ end
167
+ if @options[:output_format_type] || @options[:node_format]
168
+ mn.feature = self.mecab_format_node(@tagger, n).force_encoding(Encoding.default_external)
169
+ end
170
+ y.yield mn
154
171
  end
155
- nodes << mn if !mn.is_bos?
156
172
  n = mn.next
157
173
  end
158
174
  n = self.mecab_nbest_next_tonode(@tagger)
159
175
  end
160
176
  end
161
- return nodes
162
177
  end
163
178
  else
164
179
  # default parsing implementations
165
- @parse_tostr = lambda do |str|
166
- return self.mecab_sparse_tostr(@tagger, str) ||
180
+ @parse_tostr = lambda do |text|
181
+ retval = self.mecab_sparse_tostr(@tagger, text) ||
167
182
  raise(MeCabError.new(self.mecab_strerror(@tagger)))
183
+ retval.force_encoding(Encoding.default_external)
168
184
  end
169
- @parse_tonodes = lambda do |str|
170
- nodes = []
171
- n = self.mecab_sparse_tonode(@tagger, str)
172
- raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
173
- mn = Natto::MeCabNode.new(n)
174
- n = mn.next if mn.next.address!=0x0
175
- s = str.bytes.to_a
176
- while n && n.address!=0x0
185
+
186
+ @parse_tonodes = lambda do |text|
187
+ Enumerator.new do |y|
188
+ n = self.mecab_sparse_tonode(@tagger, text)
189
+ raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
190
+
177
191
  mn = Natto::MeCabNode.new(n)
178
- s = s.drop_while {|e| (e==0xa || e==0x20)}
179
- if !s.empty?
180
- sarr = []
181
- mn.length.times { sarr << s.shift }
182
- surf = sarr.pack('C*')
183
- mn.surface = surf.force_encoding(Encoding.default_external)
192
+ n = mn.next if mn.next.address!=0x0
193
+ s = text.bytes.to_a
194
+ while n && n.address!=0x0
195
+ mn = Natto::MeCabNode.new(n)
196
+ s = s.drop_while {|e| (e==0xa || e==0x20)}
197
+ if !s.empty?
198
+ sarr = []
199
+ mn.length.times { sarr << s.shift }
200
+ surf = sarr.pack('C*')
201
+ mn.surface = surf.force_encoding(Encoding.default_external)
202
+ end
203
+ if @options[:output_format_type] || @options[:node_format]
204
+ mn.feature = self.mecab_format_node(@tagger, n).force_encoding(Encoding.default_external)
205
+ end
206
+ y.yield mn
207
+ n = mn.next
184
208
  end
185
- nodes << mn
186
- n = mn.next
187
209
  end
188
- return nodes
189
210
  end
190
211
  end
191
212
 
@@ -199,24 +220,48 @@ module Natto
199
220
  ObjectSpace.define_finalizer(self, self.class.create_free_proc(@tagger))
200
221
  end
201
222
 
202
- # Parses the given string `str`. If a block is passed to this method,
203
- # then node parsing will be used and each node yielded to the given block.
223
+ # Parses the given `text`, returning the MeCab output as a single string.
224
+ # If a block is passed to this method, then node parsing will be used
225
+ # and each node yielded to the given block.
204
226
  #
205
- # @param [String] str
206
- # @return parsing result from `mecab`
207
- # @raise [MeCabError] if the `mecab` tagger cannot parse the given string `str`
208
- # @raise [ArgumentError] if the given string `str` argument is `nil`
227
+ # @param [String] text
228
+ # @return [String] parsing result from `mecab`
229
+ # @raise [MeCabError] if the `mecab` tagger cannot parse the given `text`
230
+ # @raise [ArgumentError] if the given string `text` argument is `nil`
209
231
  # @see MeCabNode
210
- def parse(str)
211
- raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
232
+ def parse(text)
233
+ raise ArgumentError.new 'Text to parse cannot be nil' if text.nil?
212
234
  if block_given?
213
- nodes = @parse_tonodes.call(str)
214
- nodes.each {|n| yield n }
235
+ @parse_tonodes.call(text).each {|n| yield n }
215
236
  else
216
- @parse_tostr.call(str).force_encoding(Encoding.default_external)
237
+ @parse_tostr.call(text)
217
238
  end
218
239
  end
219
240
 
241
+ # Parses the given string `text`, returning an
242
+ # {http://www.ruby-doc.org/core-2.1.5/Enumerator.html Enumerator} that may be
243
+ # used to iterate over the resulting {MeCabNode} objects. This is more
244
+ # efficient than parsing to a simple string, since each node's
245
+ # information will not be materialized all at once as with it is with
246
+ # string output.
247
+ #
248
+ # MeCab nodes contain much more detailed information about
249
+ # the morpheme. Node-formatting may also be used to customize
250
+ # the resulting node's `feature` attribute.
251
+ #
252
+ # @param [String] text
253
+ # @return [Enumerator] of MeCabNode instances
254
+ # @raise [MeCabError] if the `mecab` tagger cannot parse the given `text`
255
+ # @raise [ArgumentError] if the given string `text` argument is `nil`
256
+ # @see MeCabNode
257
+ # @see http://www.ruby-doc.org/core-2.1.5/Enumerator.html
258
+ def enum_parse(text)
259
+ raise ArgumentError.new 'Text to parse cannot be nil' if text.nil?
260
+ @parse_tonodes.call(text)
261
+ end
262
+
263
+ # @deprecated
264
+ # DEPRECATED: use enum_parse instead, this convenience method is useless.
220
265
  # Parses the given string `str`, and returns
221
266
  # a list of `mecab` nodes.
222
267
  # @param [String] str
@@ -225,10 +270,14 @@ module Natto
225
270
  # @raise [ArgumentError] if the given string `str` argument is `nil`
226
271
  # @see MeCabNode
227
272
  def parse_as_nodes(str)
273
+ $stderr.puts 'DEPRECATED: use enum_parse instead'
274
+ $stderr.puts ' This method will be removed in the next release!'
228
275
  raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
229
276
  @parse_tonodes.call(str)
230
277
  end
231
278
 
279
+ # @deprecated
280
+ # DEPRECATED: use enum_parse instead, this convenience method is useless.
232
281
  # Parses the given string `str`, and returns
233
282
  # a list of `mecab` result strings.
234
283
  # @param [String] str
@@ -236,19 +285,29 @@ module Natto
236
285
  # @raise [MeCabError] if the `mecab` tagger cannot parse the given string `str`
237
286
  # @raise [ArgumentError] if the given string `str` argument is `nil`
238
287
  def parse_as_strings(str)
288
+ $stderr.puts 'DEPRECATED: use enum_parse instead'
289
+ $stderr.puts ' This method will be removed in the next release!'
239
290
  raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
240
- @parse_tostr.call(str).force_encoding(Encoding.default_external).lines.to_a
291
+ @parse_tostr.call(str).lines.to_a
241
292
  end
242
293
 
243
- # DEPRECATED: use parse_as_nodes instead.
294
+ # @deprecated
295
+ # DEPRECATED: use enum_parse instead, this convenience method is useless.
296
+ # @param [String] str
297
+ # @return [Array] of parsed `mecab` nodes.
244
298
  def readnodes(str)
245
- $stdout.puts 'DEPRECATED: use parse_as_nodes instead'
299
+ $stderr.puts 'DEPRECATED: use enum_parse instead'
300
+ $stderr.puts ' This method will be removed in the next release!'
246
301
  parse_as_nodes(str)
247
302
  end
248
303
 
249
- # DEPRECATED: use parse_as_strings instead.
304
+ # @deprecated
305
+ # DEPRECATED: use enum_parse instead, this convenience method is useless.
306
+ # @param [String] str
307
+ # @return [Array] of parsed `mecab` result strings.
250
308
  def readlines(str)
251
- $stdout.puts 'DEPRECATED: use parse_as_strings instead'
309
+ $stderr.puts 'DEPRECATED: use enum_parse instead'
310
+ $stderr.puts ' This method will be removed in the next release!'
252
311
  parse_as_strings(str)
253
312
  end
254
313
 
@@ -257,18 +316,27 @@ module Natto
257
316
  #
258
317
  # - encoded object id
259
318
  # - underlying FFI pointer to the `mecab` tagger
319
+ # - real file path to `mecab` library
260
320
  # - options hash
261
321
  # - list of dictionaries
262
322
  # - MeCab version
263
323
  #
264
- # @return [String] encoded object id, underlying FFI pointer, options hash, list of dictionaries, and MeCab version
324
+ # @return [String] encoded object id, underlying FFI pointer,
325
+ # file path to `mecab` library, options hash,
326
+ # list of dictionaries and MeCab version
265
327
  def to_s
266
- %(#{super.chop} @tagger=#{@tagger}, @options=#{@options.inspect}, @dicts=#{@dicts.to_s}, @version="#{@version.to_s}">)
328
+ [ super.chop,
329
+ "@tagger=#{@tagger},",
330
+ "@libpath=\"#{@libpath}\",",
331
+ "@options=#{@options.inspect},",
332
+ "@dicts=#{@dicts.to_s},",
333
+ "@version=#{@version.to_s}>" ].join(' ')
267
334
  end
268
335
 
269
336
  # Overrides `Object#inspect`.
270
337
  #
271
- # @return [String] encoded object id, FFI pointer, options hash, list of dictionaries, and MeCab version
338
+ # @return [String] encoded object id, FFI pointer, file path to `mecab` library, options hash,
339
+ # list of dictionaries, and MeCab version
272
340
  # @see #to_s
273
341
  def inspect
274
342
  self.to_s
@@ -293,3 +361,29 @@ module Natto
293
361
  # for the `Natto` module.
294
362
  class MeCabError < RuntimeError; end
295
363
  end
364
+
365
+ # Copyright (c) 2014-2015, Brooke M. Fujita.
366
+ # All rights reserved.
367
+ #
368
+ # Redistribution and use in source and binary forms, with or without
369
+ # modification, are permitted provided that the following conditions are met:
370
+ #
371
+ # * Redistributions of source code must retain the above
372
+ # copyright notice, this list of conditions and the
373
+ # following disclaimer.
374
+ #
375
+ # * Redistributions in binary form must reproduce the above
376
+ # copyright notice, this list of conditions and the
377
+ # following disclaimer in the documentation and/or other
378
+ # materials provided with the distribution.
379
+ #
380
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
381
+ # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
382
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
383
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
384
+ # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
385
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
386
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
387
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
388
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
389
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -115,3 +115,29 @@ module Natto
115
115
  end
116
116
  end
117
117
  end
118
+
119
+ # Copyright (c) 2014-2015, Brooke M. Fujita.
120
+ # All rights reserved.
121
+ #
122
+ # Redistribution and use in source and binary forms, with or without
123
+ # modification, are permitted provided that the following conditions are met:
124
+ #
125
+ # * Redistributions of source code must retain the above
126
+ # copyright notice, this list of conditions and the
127
+ # following disclaimer.
128
+ #
129
+ # * Redistributions in binary form must reproduce the above
130
+ # copyright notice, this list of conditions and the
131
+ # following disclaimer in the documentation and/or other
132
+ # materials provided with the distribution.
133
+ #
134
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
135
+ # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
136
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
137
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
138
+ # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
139
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
140
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
141
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
142
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
143
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -21,21 +21,21 @@ module Natto
21
21
  end
22
22
  end
23
23
 
24
- # `DictionaryInfo` is a wrapper for `struct mecab_dictionary_info_t`
25
- # that holds the `MeCab` instance's related dictionary information.
24
+ # `DictionaryInfo` is a wrapper for the `struct mecab_dictionary_info_t`
25
+ # structure holding the `MeCab` instance's related dictionary information.
26
26
  #
27
27
  # Values for the `mecab` dictionary attributes may be
28
28
  # obtained by using the following `Symbol`s as keys
29
29
  # to the layout associative array of `FFI::Struct` members.
30
30
  #
31
- # - :filename
32
- # - :charset
33
- # - :size
34
- # - :type
35
- # - :lsize
36
- # - :rsize
37
- # - :version
38
- # - :next
31
+ # - :filename - filename of dictionary; on Windows, filename is stored in UTF-8 encoding
32
+ # - :charset - character set of the dictionary
33
+ # - :size - number of words contained in dictionary
34
+ # - :type - dictionary type: 0 (system), 1 (user-defined), 2 (unknown)
35
+ # - :lsize - left attributes size
36
+ # - :rsize - right attributes size
37
+ # - :version - version of this dictionary
38
+ # - :next - pointer to next dictionary in list
39
39
  #
40
40
  # ## Usage
41
41
  # `mecab` dictionary attributes can be obtained by
@@ -44,16 +44,20 @@ module Natto
44
44
  # nm = Natto::MeCab.new
45
45
  #
46
46
  # sysdic = nm.dicts.first
47
- #
48
- # puts sysdic.filename
49
- # => "/usr/local/lib/mecab/dic/ipadic/sys.dic"
47
+ #
48
+ # # display the real path to the mecab dictionary
49
+ # puts sysdic.filepath
50
+ # => /usr/local/lib/mecab/dic/ipadic/sys.dic
50
51
  #
51
52
  # puts sysdic.charset
52
- # => "utf8"
53
+ # => utf8
53
54
  #
54
55
  # puts sysdic.is_sysdic?
55
56
  # => true
56
57
  class DictionaryInfo < MeCabStruct
58
+ # @return [String] Absolute filepath to MeCab dictionary.
59
+ attr_reader :filepath
60
+
57
61
  # System dictionary.
58
62
  SYS_DIC = 0
59
63
  # User dictionary.
@@ -83,17 +87,31 @@ module Natto
83
87
  end
84
88
  end
85
89
 
90
+ # Initializes this dictionary info instance.
91
+ # Sets the `DictionaryInfo` filepath value.
92
+ #
93
+ # @param [FFI::Pointer] ptr pointer to MeCab dictionary
94
+ def initialize(ptr)
95
+ super(ptr)
96
+
97
+ @filepath = File.absolute_path(self[:filename])
98
+ end
99
+
86
100
  # Returns human-readable details for this `mecab` dictionary.
87
101
  # Overrides `Object#to_s`.
88
102
  #
89
103
  # - encoded object id
90
- # - dictionary type
91
- # - full-path dictionary filename
104
+ # - real file path to this dictionary
92
105
  # - dictionary charset
106
+ # - dictionary type
93
107
  #
94
- # @return [String] encoded object id, type, dictionary filename, and charset
108
+ # @return [String] encoded object id, file path to dictionary, charset and
109
+ # type
95
110
  def to_s
96
- %(#{super.chop} type="#{self.type}", filename="#{self.filename}", charset="#{self.charset}">)
111
+ [ super.chop,
112
+ "@filepath=\"#{@filepath}\",",
113
+ "charset=#{self.charset},",
114
+ "type=#{self.type}>" ].join(' ')
97
115
  end
98
116
 
99
117
  # Overrides `Object#inspect`.
@@ -123,35 +141,35 @@ module Natto
123
141
  end
124
142
  end
125
143
 
126
- # `MeCabNode` is a wrapper for the structure holding
127
- # the parsed `node`.
144
+ # `MeCabNode` is a wrapper for the `struct mecab_node_t`
145
+ # structure holding the parsed `node`.
128
146
  #
129
147
  # Values for the `mecab` node attributes may be
130
148
  # obtained by using the following `Symbol`s as keys
131
149
  # to the layout associative array of `FFI::Struct` members.
132
150
  #
133
- # - :prev
134
- # - :next
135
- # - :enext
136
- # - :bnext
137
- # - :rpath
138
- # - :lpath
139
- # - :surface
140
- # - :feature
141
- # - :id
142
- # - :length
143
- # - :rlength
144
- # - :rcAttr
145
- # - :lcAttr
146
- # - :posid
147
- # - :char_type
148
- # - :stat
149
- # - :isbest
150
- # - :alpha
151
- # - :beta
152
- # - :prob
153
- # - :wcost
154
- # - :cost
151
+ # - :prev - pointer to previous node
152
+ # - :next - pointer to next node
153
+ # - :enext - pointer to the node which ends at the same position
154
+ # - :bnext - pointer to the node which starts at the same position
155
+ # - :rpath - pointer to the right path; nil if MECAB_ONE_BEST mode
156
+ # - :lpath - pointer to the left path; nil if MECAB_ONE_BEST mode
157
+ # - :surface - surface string; length may be obtained with length/rlength members
158
+ # - :feature - feature string
159
+ # - :id - unique node id
160
+ # - :length - length of surface form
161
+ # - :rlength - length of the surface form including white space before the morph
162
+ # - :rcAttr - right attribute id
163
+ # - :lcAttr - left attribute id
164
+ # - :posid - part-of-speech id
165
+ # - :char_type - character type
166
+ # - :stat - node status; 0 (NOR), 1 (UNK), 2 (BOS), 3 (EOS), 4 (EON)
167
+ # - :isbest - 1 if this node is best node
168
+ # - :alpha - forward accumulative log summation, only with marginal probability flag
169
+ # - :beta - backward accumulative log summation, only with marginal probability flag
170
+ # - :prob - marginal probability, only with marginal probability flag
171
+ # - :wcost - word cost
172
+ # - :cost - best accumulative cost from bos node to this node
155
173
  #
156
174
  # ## Usage
157
175
  # An instance of `MeCabNode` is yielded to the block
@@ -164,7 +182,7 @@ module Natto
164
182
  # puts "#{n.surface}\t#{n.cost}" if n.is_nor?
165
183
  # end
166
184
  # 卓球 2874
167
- # 4398
185
+ # なんて 4398
168
186
  # 死ぬ 9261
169
187
  # まで 9386
170
188
  # の 10007
@@ -173,36 +191,29 @@ module Natto
173
191
  # よ 14396
174
192
  # 。 10194
175
193
  #
176
- # It is also possible to use the `Symbol` for the
194
+ # While it is also possible to use the `Symbol` for the
177
195
  # `mecab` node member to index into the
178
- # `FFI::Struct` layout associative array like so:
179
- #
180
- # nm.parse('あいつ笑うと結構可愛い顔してんよ。') {|n| puts n[:feature] }
181
- # 名詞,代名詞,一般,*,*,*,あいつ,アイツ,アイツ
182
- # 動詞,自立,*,*,五段・ワ行促音便,基本形,笑う,ワラウ,ワラウ
183
- # 助詞,接続助詞,*,*,*,*,と,ト,ト
184
- # 副詞,一般,*,*,*,*,結構,ケッコウ,ケッコー
185
- # 形容詞,自立,*,*,形容詞・イ段,基本形,可愛い,カワイイ,カワイイ
186
- # 名詞,一般,*,*,*,*,顔,カオ,カオ
187
- # 動詞,自立,*,*,サ変・スル,連用形,する,シ,シ
188
- # 動詞,非自立,*,*,一段,体言接続特殊,てる,テン,テン
189
- # 助詞,終助詞,*,*,*,*,よ,ヨ,ヨ
190
- # 記号,句点,*,*,*,*,。,。,。
191
- # BOS/EOS,*,*,*,*,*,*,*,*
192
- #
196
+ # `FFI::Struct` layout associative array, please use the attribute
197
+ # accessors. In the case of `:surface` and `:feature`, `mecab`
198
+ # returns the raw bytes, so `natto` will convert that into
199
+ # a string using the default encoding.
193
200
  class MeCabNode < MeCabStruct
194
- attr_accessor :surface, :feature
201
+ # @return [String] morpheme surface value.
202
+ attr_accessor :surface
203
+ # @return [String] corresponding feature value.
204
+ attr_accessor :feature
205
+ # @return [FFI::Pointer] pointer to MeCab node struct.
195
206
  attr_reader :pointer
196
207
 
197
- # Normal `mecab` node defined in the dictionary.
208
+ # Normal `mecab` node defined in the dictionary, c.f. `stat`.
198
209
  NOR_NODE = 0
199
- # Unknown `mecab` node not defined in the dictionary.
210
+ # Unknown `mecab` node not defined in the dictionary, c.f. `stat`.
200
211
  UNK_NODE = 1
201
- # Virtual node representing the beginning of the sentence.
212
+ # Virtual node representing the beginning of the sentence, c.f. `stat`.
202
213
  BOS_NODE = 2
203
- # Virutual node representing the end of the sentence.
214
+ # Virtual node representing the end of the sentence, c.f. `stat`.
204
215
  EOS_NODE = 3
205
- # Virtual node representing the end of an N-Best `mecab` node list.
216
+ # Virtual node representing the end of an N-Best `mecab` node list, c.f. `stat`.
206
217
  EON_NODE = 4
207
218
 
208
219
  layout :prev, :pointer,
@@ -227,25 +238,11 @@ module Natto
227
238
  :prob, :float,
228
239
  :wcost, :short,
229
240
  :cost, :long
230
-
231
- #if RUBY_VERSION.to_f < 1.9
232
- # alias_method :deprecated_id, :id
233
- # # `Object#id` override defined when `RUBY_VERSION` is
234
- # # older than 1.9. This is a hack to avoid the `Object#id`
235
- # # deprecation warning thrown up in Ruby 1.8.7.
236
- # #
237
- # # <i>This method override is not defined when the Ruby interpreter
238
- # # is 1.9 or greater.</i>
239
- # # @return [Fixnum] `mecab` node id
240
- # def id
241
- # self[:id]
242
- # end
243
- #end
244
241
 
245
242
  # Initializes this node instance.
246
243
  # Sets the `MeCab` feature value for this node.
247
244
  #
248
- # @param [FFI::Pointer]
245
+ # @param [FFI::Pointer] ptr pointer to MeCab node
249
246
  def initialize(ptr)
250
247
  super(ptr)
251
248
  @pointer = ptr
@@ -308,3 +305,29 @@ module Natto
308
305
  end
309
306
  end
310
307
  end
308
+
309
+ # Copyright (c) 2014-2015, Brooke M. Fujita.
310
+ # All rights reserved.
311
+ #
312
+ # Redistribution and use in source and binary forms, with or without
313
+ # modification, are permitted provided that the following conditions are met:
314
+ #
315
+ # * Redistributions of source code must retain the above
316
+ # copyright notice, this list of conditions and the
317
+ # following disclaimer.
318
+ #
319
+ # * Redistributions in binary form must reproduce the above
320
+ # copyright notice, this list of conditions and the
321
+ # following disclaimer in the documentation and/or other
322
+ # materials provided with the distribution.
323
+ #
324
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
325
+ # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
326
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
327
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
328
+ # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
329
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
330
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
331
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
332
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
333
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.