natto 0.9.6 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,18 +11,18 @@ module Natto
11
11
  #
12
12
  # ## Usage
13
13
  #
14
- # require 'rubygems' if RUBY_VERSION.to_f < 1.9
15
14
  # require 'natto'
16
15
  #
17
16
  # nm = Natto::MeCab.new('-Ochasen')
18
17
  # => #<Natto::MeCab:0x28d3bdc8 \
19
18
  # @tagger=#<FFI::Pointer address=0x28afb980>, \
19
+ # @libpath="/usr/local/lib/libmecab.so", \
20
20
  # @options={:output_format_type=>"chasen"}, \
21
21
  # @dicts=[#<Natto::DictionaryInfo:0x289a1f14 \
22
- # type="0", \
23
- # filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
24
- # charset="utf8">], \
25
- # @version="0.996">
22
+ # @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
23
+ # charset=utf8, \
24
+ # type=0>], \
25
+ # @version=0.996>
26
26
  #
27
27
  # nm.parse('凡人にしか見えねえ風景ってのがあるんだよ。') do |n|
28
28
  # puts "#{n.surface}\t#{n.feature}"
@@ -46,8 +46,17 @@ module Natto
46
46
  class MeCab
47
47
  include Natto::Binding
48
48
  include Natto::OptionParse
49
-
50
- attr_reader :tagger, :options, :dicts, :version
49
+
50
+ # @return [FFI::Pointer] pointer to MeCab tagger.
51
+ attr_reader :tagger
52
+ # @return [String] absolute filepath to MeCab library.
53
+ attr_reader :libpath
54
+ # @return [Hash] MeCab options as key-value pairs.
55
+ attr_reader :options
56
+ # @return [Array] listing of all dictionaries referenced.
57
+ attr_reader :dicts
58
+ # @return [String] `MeCab` version.
59
+ attr_reader :version
51
60
 
52
61
  # Initializes the wrapped `mecab` instance with the
53
62
  # given `options`.
@@ -83,12 +92,13 @@ module Natto
83
92
  # nm = Natto::MeCab.new(:node_format=>'%m¥t%f[7]¥n')
84
93
  # => #<Natto::MeCab:0x28d2ae10
85
94
  # @tagger=#<FFI::Pointer address=0x28a97980>, \
95
+ # @libpath="/usr/local/lib/libmecab.so", \
86
96
  # @options={:node_format=>"%m¥t%f[7]¥n"}, \
87
97
  # @dicts=[#<Natto::DictionaryInfo:0x28d2a85c \
88
- # type="0", \
89
- # filename="/usr/local/lib/mecab/dic/ipadic/sys.dic" \
90
- # charset="utf8">], \
91
- # @version="0.996">
98
+ # @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
99
+ # charset=utf8, \
100
+ # type=0>], \
101
+ # @version=0.996>
92
102
  #
93
103
  # puts nm.parse('才能とは求める人間に与えられるものではない。')
94
104
  # 才能 サイノウ
@@ -106,14 +116,15 @@ module Natto
106
116
  # 。 。
107
117
  # EOS
108
118
  #
109
- # @param [Hash or String]
119
+ # @param [Hash, String] options MeCab options for tagger
110
120
  # @raise [MeCabError] if `mecab` cannot be initialized with the given `options`
111
121
  def initialize(options={})
112
122
  @options = self.class.parse_mecab_options(options)
113
123
  @dicts = []
114
124
 
115
- opt_str = self.class.build_options_str(@options)
116
- @tagger = self.mecab_new2(opt_str)
125
+ opt_str = self.class.build_options_str(@options)
126
+ @tagger = self.class.mecab_new2(opt_str)
127
+ @libpath = self.class.find_library
117
128
  raise MeCabError.new("Could not initialize MeCab with options: '#{opt_str}'") if @tagger.address == 0x0
118
129
 
119
130
  self.mecab_set_theta(@tagger, @options[:theta]) if @options[:theta]
@@ -123,69 +134,79 @@ module Natto
123
134
 
124
135
  # Set mecab parsing implementations for N-best and regular parsing,
125
136
  # for both parsing as string and yielding a node object
126
- # N-Best parsing implementations
127
137
  if @options[:nbest] && @options[:nbest] > 1
138
+ # N-Best parsing implementations
128
139
  self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
129
- @parse_tostr = lambda do |str|
130
- return self.mecab_nbest_sparse_tostr(@tagger, @options[:nbest], str) ||
140
+
141
+ @parse_tostr = lambda do |text|
142
+ retval = self.mecab_nbest_sparse_tostr(@tagger, @options[:nbest], text) ||
131
143
  raise(MeCabError.new(self.mecab_strerror(@tagger)))
144
+ retval.force_encoding(Encoding.default_external)
132
145
  end
133
- @parse_tonodes = lambda do |str|
134
- nodes = []
135
- if @options[:nbest] && @options[:nbest] > 1
136
- self.mecab_nbest_init(@tagger, str)
146
+
147
+ @parse_tonodes = lambda do |text|
148
+ Enumerator.new do |y|
149
+ self.mecab_nbest_init(@tagger, text)
137
150
  n = self.mecab_nbest_next_tonode(@tagger)
138
151
  raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
152
+
139
153
  nlen = @options[:nbest]
140
154
  nlen.times do |i|
141
- s = str.bytes.to_a
155
+ s = text.bytes.to_a
142
156
  while n && n.address != 0x0
143
157
  mn = Natto::MeCabNode.new(n)
144
- s = s.drop_while {|e| (e==0xa || e==0x20)}
145
- if !s.empty?
146
- sarr = []
147
- mn.length.times { sarr << s.shift }
148
- surf = sarr.pack('C*')
149
- #mn.surface = self.class.force_enc(surf)
150
- mn.surface = surf.force_encoding(Encoding.default_external)
151
- end
152
- if @options[:output_format_type] || @options[:node_format]
153
- mn.feature = self.mecab_format_node(@tagger, n).force_encoding(Encoding.default_external)
158
+ # ignore BOS nodes, since mecab does so
159
+ if !mn.is_bos?
160
+ s = s.drop_while {|e| (e==0xa || e==0x20)}
161
+ if !s.empty?
162
+ sarr = []
163
+ mn.length.times { sarr << s.shift }
164
+ surf = sarr.pack('C*')
165
+ mn.surface = surf.force_encoding(Encoding.default_external)
166
+ end
167
+ if @options[:output_format_type] || @options[:node_format]
168
+ mn.feature = self.mecab_format_node(@tagger, n).force_encoding(Encoding.default_external)
169
+ end
170
+ y.yield mn
154
171
  end
155
- nodes << mn if !mn.is_bos?
156
172
  n = mn.next
157
173
  end
158
174
  n = self.mecab_nbest_next_tonode(@tagger)
159
175
  end
160
176
  end
161
- return nodes
162
177
  end
163
178
  else
164
179
  # default parsing implementations
165
- @parse_tostr = lambda do |str|
166
- return self.mecab_sparse_tostr(@tagger, str) ||
180
+ @parse_tostr = lambda do |text|
181
+ retval = self.mecab_sparse_tostr(@tagger, text) ||
167
182
  raise(MeCabError.new(self.mecab_strerror(@tagger)))
183
+ retval.force_encoding(Encoding.default_external)
168
184
  end
169
- @parse_tonodes = lambda do |str|
170
- nodes = []
171
- n = self.mecab_sparse_tonode(@tagger, str)
172
- raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
173
- mn = Natto::MeCabNode.new(n)
174
- n = mn.next if mn.next.address!=0x0
175
- s = str.bytes.to_a
176
- while n && n.address!=0x0
185
+
186
+ @parse_tonodes = lambda do |text|
187
+ Enumerator.new do |y|
188
+ n = self.mecab_sparse_tonode(@tagger, text)
189
+ raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
190
+
177
191
  mn = Natto::MeCabNode.new(n)
178
- s = s.drop_while {|e| (e==0xa || e==0x20)}
179
- if !s.empty?
180
- sarr = []
181
- mn.length.times { sarr << s.shift }
182
- surf = sarr.pack('C*')
183
- mn.surface = surf.force_encoding(Encoding.default_external)
192
+ n = mn.next if mn.next.address!=0x0
193
+ s = text.bytes.to_a
194
+ while n && n.address!=0x0
195
+ mn = Natto::MeCabNode.new(n)
196
+ s = s.drop_while {|e| (e==0xa || e==0x20)}
197
+ if !s.empty?
198
+ sarr = []
199
+ mn.length.times { sarr << s.shift }
200
+ surf = sarr.pack('C*')
201
+ mn.surface = surf.force_encoding(Encoding.default_external)
202
+ end
203
+ if @options[:output_format_type] || @options[:node_format]
204
+ mn.feature = self.mecab_format_node(@tagger, n).force_encoding(Encoding.default_external)
205
+ end
206
+ y.yield mn
207
+ n = mn.next
184
208
  end
185
- nodes << mn
186
- n = mn.next
187
209
  end
188
- return nodes
189
210
  end
190
211
  end
191
212
 
@@ -199,24 +220,48 @@ module Natto
199
220
  ObjectSpace.define_finalizer(self, self.class.create_free_proc(@tagger))
200
221
  end
201
222
 
202
- # Parses the given string `str`. If a block is passed to this method,
203
- # then node parsing will be used and each node yielded to the given block.
223
+ # Parses the given `text`, returning the MeCab output as a single string.
224
+ # If a block is passed to this method, then node parsing will be used
225
+ # and each node yielded to the given block.
204
226
  #
205
- # @param [String] str
206
- # @return parsing result from `mecab`
207
- # @raise [MeCabError] if the `mecab` tagger cannot parse the given string `str`
208
- # @raise [ArgumentError] if the given string `str` argument is `nil`
227
+ # @param [String] text
228
+ # @return [String] parsing result from `mecab`
229
+ # @raise [MeCabError] if the `mecab` tagger cannot parse the given `text`
230
+ # @raise [ArgumentError] if the given string `text` argument is `nil`
209
231
  # @see MeCabNode
210
- def parse(str)
211
- raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
232
+ def parse(text)
233
+ raise ArgumentError.new 'Text to parse cannot be nil' if text.nil?
212
234
  if block_given?
213
- nodes = @parse_tonodes.call(str)
214
- nodes.each {|n| yield n }
235
+ @parse_tonodes.call(text).each {|n| yield n }
215
236
  else
216
- @parse_tostr.call(str).force_encoding(Encoding.default_external)
237
+ @parse_tostr.call(text)
217
238
  end
218
239
  end
219
240
 
241
+ # Parses the given string `text`, returning an
242
+ # {http://www.ruby-doc.org/core-2.1.5/Enumerator.html Enumerator} that may be
243
+ # used to iterate over the resulting {MeCabNode} objects. This is more
244
+ # efficient than parsing to a simple string, since each node's
245
+ # information will not be materialized all at once as it is with
246
+ # string output.
247
+ #
248
+ # MeCab nodes contain much more detailed information about
249
+ # the morpheme. Node-formatting may also be used to customize
250
+ # the resulting node's `feature` attribute.
251
+ #
252
+ # @param [String] text
253
+ # @return [Enumerator] of MeCabNode instances
254
+ # @raise [MeCabError] if the `mecab` tagger cannot parse the given `text`
255
+ # @raise [ArgumentError] if the given string `text` argument is `nil`
256
+ # @see MeCabNode
257
+ # @see http://www.ruby-doc.org/core-2.1.5/Enumerator.html
258
+ def enum_parse(text)
259
+ raise ArgumentError.new 'Text to parse cannot be nil' if text.nil?
260
+ @parse_tonodes.call(text)
261
+ end
262
+
263
+ # @deprecated
264
+ # DEPRECATED: use enum_parse instead, this convenience method is useless.
220
265
  # Parses the given string `str`, and returns
221
266
  # a list of `mecab` nodes.
222
267
  # @param [String] str
@@ -225,10 +270,14 @@ module Natto
225
270
  # @raise [ArgumentError] if the given string `str` argument is `nil`
226
271
  # @see MeCabNode
227
272
  def parse_as_nodes(str)
273
+ $stderr.puts 'DEPRECATED: use enum_parse instead'
274
+ $stderr.puts ' This method will be removed in the next release!'
228
275
  raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
229
276
  @parse_tonodes.call(str)
230
277
  end
231
278
 
279
+ # @deprecated
280
+ # DEPRECATED: use enum_parse instead, this convenience method is useless.
232
281
  # Parses the given string `str`, and returns
233
282
  # a list of `mecab` result strings.
234
283
  # @param [String] str
@@ -236,19 +285,29 @@ module Natto
236
285
  # @raise [MeCabError] if the `mecab` tagger cannot parse the given string `str`
237
286
  # @raise [ArgumentError] if the given string `str` argument is `nil`
238
287
  def parse_as_strings(str)
288
+ $stderr.puts 'DEPRECATED: use enum_parse instead'
289
+ $stderr.puts ' This method will be removed in the next release!'
239
290
  raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
240
- @parse_tostr.call(str).force_encoding(Encoding.default_external).lines.to_a
291
+ @parse_tostr.call(str).lines.to_a
241
292
  end
242
293
 
243
- # DEPRECATED: use parse_as_nodes instead.
294
+ # @deprecated
295
+ # DEPRECATED: use enum_parse instead, this convenience method is useless.
296
+ # @param [String] str
297
+ # @return [Array] of parsed `mecab` nodes.
244
298
  def readnodes(str)
245
- $stdout.puts 'DEPRECATED: use parse_as_nodes instead'
299
+ $stderr.puts 'DEPRECATED: use enum_parse instead'
300
+ $stderr.puts ' This method will be removed in the next release!'
246
301
  parse_as_nodes(str)
247
302
  end
248
303
 
249
- # DEPRECATED: use parse_as_strings instead.
304
+ # @deprecated
305
+ # DEPRECATED: use enum_parse instead, this convenience method is useless.
306
+ # @param [String] str
307
+ # @return [Array] of parsed `mecab` result strings.
250
308
  def readlines(str)
251
- $stdout.puts 'DEPRECATED: use parse_as_strings instead'
309
+ $stderr.puts 'DEPRECATED: use enum_parse instead'
310
+ $stderr.puts ' This method will be removed in the next release!'
252
311
  parse_as_strings(str)
253
312
  end
254
313
 
@@ -257,18 +316,27 @@ module Natto
257
316
  #
258
317
  # - encoded object id
259
318
  # - underlying FFI pointer to the `mecab` tagger
319
+ # - real file path to `mecab` library
260
320
  # - options hash
261
321
  # - list of dictionaries
262
322
  # - MeCab version
263
323
  #
264
- # @return [String] encoded object id, underlying FFI pointer, options hash, list of dictionaries, and MeCab version
324
+ # @return [String] encoded object id, underlying FFI pointer,
325
+ # file path to `mecab` library, options hash,
326
+ # list of dictionaries and MeCab version
265
327
  def to_s
266
- %(#{super.chop} @tagger=#{@tagger}, @options=#{@options.inspect}, @dicts=#{@dicts.to_s}, @version="#{@version.to_s}">)
328
+ [ super.chop,
329
+ "@tagger=#{@tagger},",
330
+ "@libpath=\"#{@libpath}\",",
331
+ "@options=#{@options.inspect},",
332
+ "@dicts=#{@dicts.to_s},",
333
+ "@version=#{@version.to_s}>" ].join(' ')
267
334
  end
268
335
 
269
336
  # Overrides `Object#inspect`.
270
337
  #
271
- # @return [String] encoded object id, FFI pointer, options hash, list of dictionaries, and MeCab version
338
+ # @return [String] encoded object id, FFI pointer, library file path, options hash,
339
+ # list of dictionaries, and MeCab version
272
340
  # @see #to_s
273
341
  def inspect
274
342
  self.to_s
@@ -293,3 +361,29 @@ module Natto
293
361
  # for the `Natto` module.
294
362
  class MeCabError < RuntimeError; end
295
363
  end
364
+
365
+ # Copyright (c) 2014-2015, Brooke M. Fujita.
366
+ # All rights reserved.
367
+ #
368
+ # Redistribution and use in source and binary forms, with or without
369
+ # modification, are permitted provided that the following conditions are met:
370
+ #
371
+ # * Redistributions of source code must retain the above
372
+ # copyright notice, this list of conditions and the
373
+ # following disclaimer.
374
+ #
375
+ # * Redistributions in binary form must reproduce the above
376
+ # copyright notice, this list of conditions and the
377
+ # following disclaimer in the documentation and/or other
378
+ # materials provided with the distribution.
379
+ #
380
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
381
+ # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
382
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
383
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
384
+ # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
385
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
386
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
387
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
388
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
389
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -115,3 +115,29 @@ module Natto
115
115
  end
116
116
  end
117
117
  end
118
+
119
+ # Copyright (c) 2014-2015, Brooke M. Fujita.
120
+ # All rights reserved.
121
+ #
122
+ # Redistribution and use in source and binary forms, with or without
123
+ # modification, are permitted provided that the following conditions are met:
124
+ #
125
+ # * Redistributions of source code must retain the above
126
+ # copyright notice, this list of conditions and the
127
+ # following disclaimer.
128
+ #
129
+ # * Redistributions in binary form must reproduce the above
130
+ # copyright notice, this list of conditions and the
131
+ # following disclaimer in the documentation and/or other
132
+ # materials provided with the distribution.
133
+ #
134
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
135
+ # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
136
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
137
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
138
+ # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
139
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
140
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
141
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
142
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
143
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -21,21 +21,21 @@ module Natto
21
21
  end
22
22
  end
23
23
 
24
- # `DictionaryInfo` is a wrapper for `struct mecab_dictionary_info_t`
25
- # that holds the `MeCab` instance's related dictionary information.
24
+ # `DictionaryInfo` is a wrapper for the `struct mecab_dictionary_info_t`
25
+ # structure holding the `MeCab` instance's related dictionary information.
26
26
  #
27
27
  # Values for the `mecab` dictionary attributes may be
28
28
  # obtained by using the following `Symbol`s as keys
29
29
  # to the layout associative array of `FFI::Struct` members.
30
30
  #
31
- # - :filename
32
- # - :charset
33
- # - :size
34
- # - :type
35
- # - :lsize
36
- # - :rsize
37
- # - :version
38
- # - :next
31
+ # - :filename - filename of dictionary; on Windows, filename is stored in UTF-8 encoding
32
+ # - :charset - character set of the dictionary
33
+ # - :size - number of words contained in dictionary
34
+ # - :type - dictionary type: 0 (system), 1 (user-defined), 2 (unknown)
35
+ # - :lsize - left attributes size
36
+ # - :rsize - right attributes size
37
+ # - :version - version of this dictionary
38
+ # - :next - pointer to next dictionary in list
39
39
  #
40
40
  # ## Usage
41
41
  # `mecab` dictionary attributes can be obtained by
@@ -44,16 +44,20 @@ module Natto
44
44
  # nm = Natto::MeCab.new
45
45
  #
46
46
  # sysdic = nm.dicts.first
47
- #
48
- # puts sysdic.filename
49
- # => "/usr/local/lib/mecab/dic/ipadic/sys.dic"
47
+ #
48
+ # # display the real path to the mecab dictionary
49
+ # puts sysdic.filepath
50
+ # => /usr/local/lib/mecab/dic/ipadic/sys.dic
50
51
  #
51
52
  # puts sysdic.charset
52
- # => "utf8"
53
+ # => utf8
53
54
  #
54
55
  # puts sysdic.is_sysdic?
55
56
  # => true
56
57
  class DictionaryInfo < MeCabStruct
58
+ # @return [String] Absolute filepath to MeCab dictionary.
59
+ attr_reader :filepath
60
+
57
61
  # System dictionary.
58
62
  SYS_DIC = 0
59
63
  # User dictionary.
@@ -83,17 +87,31 @@ module Natto
83
87
  end
84
88
  end
85
89
 
90
+ # Initializes this dictionary info instance.
91
+ # Sets the `DictionaryInfo` filepath value.
92
+ #
93
+ # @param [FFI::Pointer] ptr pointer to MeCab dictionary
94
+ def initialize(ptr)
95
+ super(ptr)
96
+
97
+ @filepath = File.absolute_path(self[:filename])
98
+ end
99
+
86
100
  # Returns human-readable details for this `mecab` dictionary.
87
101
  # Overrides `Object#to_s`.
88
102
  #
89
103
  # - encoded object id
90
- # - dictionary type
91
- # - full-path dictionary filename
104
+ # - real file path to this dictionary
92
105
  # - dictionary charset
106
+ # - dictionary type
93
107
  #
94
- # @return [String] encoded object id, type, dictionary filename, and charset
108
+ # @return [String] encoded object id, file path to dictionary, charset and
109
+ # type
95
110
  def to_s
96
- %(#{super.chop} type="#{self.type}", filename="#{self.filename}", charset="#{self.charset}">)
111
+ [ super.chop,
112
+ "@filepath=\"#{@filepath}\",",
113
+ "charset=#{self.charset},",
114
+ "type=#{self.type}>" ].join(' ')
97
115
  end
98
116
 
99
117
  # Overrides `Object#inspect`.
@@ -123,35 +141,35 @@ module Natto
123
141
  end
124
142
  end
125
143
 
126
- # `MeCabNode` is a wrapper for the structure holding
127
- # the parsed `node`.
144
+ # `MeCabNode` is a wrapper for the `struct mecab_node_t`
145
+ # structure holding the parsed `node`.
128
146
  #
129
147
  # Values for the `mecab` node attributes may be
130
148
  # obtained by using the following `Symbol`s as keys
131
149
  # to the layout associative array of `FFI::Struct` members.
132
150
  #
133
- # - :prev
134
- # - :next
135
- # - :enext
136
- # - :bnext
137
- # - :rpath
138
- # - :lpath
139
- # - :surface
140
- # - :feature
141
- # - :id
142
- # - :length
143
- # - :rlength
144
- # - :rcAttr
145
- # - :lcAttr
146
- # - :posid
147
- # - :char_type
148
- # - :stat
149
- # - :isbest
150
- # - :alpha
151
- # - :beta
152
- # - :prob
153
- # - :wcost
154
- # - :cost
151
+ # - :prev - pointer to previous node
152
+ # - :next - pointer to next node
153
+ # - :enext - pointer to the node which ends at the same position
154
+ # - :bnext - pointer to the node which starts at the same position
155
+ # - :rpath - pointer to the right path; nil if MECAB_ONE_BEST mode
156
+ # - :lpath - pointer to the left path; nil if MECAB_ONE_BEST mode
157
+ # - :surface - surface string; length may be obtained with length/rlength members
158
+ # - :feature - feature string
159
+ # - :id - unique node id
160
+ # - :length - length of surface form
161
+ # - :rlength - length of the surface form including white space before the morph
162
+ # - :rcAttr - right attribute id
163
+ # - :lcAttr - left attribute id
164
+ # - :posid - part-of-speech id
165
+ # - :char_type - character type
166
+ # - :stat - node status; 0 (NOR), 1 (UNK), 2 (BOS), 3 (EOS), 4 (EON)
167
+ # - :isbest - 1 if this node is best node
168
+ # - :alpha - forward accumulative log summation, only with marginal probability flag
169
+ # - :beta - backward accumulative log summation, only with marginal probability flag
170
+ # - :prob - marginal probability, only with marginal probability flag
171
+ # - :wcost - word cost
172
+ # - :cost - best accumulative cost from bos node to this node
155
173
  #
156
174
  # ## Usage
157
175
  # An instance of `MeCabNode` is yielded to the block
@@ -164,7 +182,7 @@ module Natto
164
182
  # puts "#{n.surface}\t#{n.cost}" if n.is_nor?
165
183
  # end
166
184
  # 卓球 2874
167
- # 4398
185
+ # なんて 4398
168
186
  # 死ぬ 9261
169
187
  # まで 9386
170
188
  # の 10007
@@ -173,36 +191,29 @@ module Natto
173
191
  # よ 14396
174
192
  # 。 10194
175
193
  #
176
- # It is also possible to use the `Symbol` for the
194
+ # While it is also possible to use the `Symbol` for the
177
195
  # `mecab` node member to index into the
178
- # `FFI::Struct` layout associative array like so:
179
- #
180
- # nm.parse('あいつ笑うと結構可愛い顔してんよ。') {|n| puts n[:feature] }
181
- # 名詞,代名詞,一般,*,*,*,あいつ,アイツ,アイツ
182
- # 動詞,自立,*,*,五段・ワ行促音便,基本形,笑う,ワラウ,ワラウ
183
- # 助詞,接続助詞,*,*,*,*,と,ト,ト
184
- # 副詞,一般,*,*,*,*,結構,ケッコウ,ケッコー
185
- # 形容詞,自立,*,*,形容詞・イ段,基本形,可愛い,カワイイ,カワイイ
186
- # 名詞,一般,*,*,*,*,顔,カオ,カオ
187
- # 動詞,自立,*,*,サ変・スル,連用形,する,シ,シ
188
- # 動詞,非自立,*,*,一段,体言接続特殊,てる,テン,テン
189
- # 助詞,終助詞,*,*,*,*,よ,ヨ,ヨ
190
- # 記号,句点,*,*,*,*,。,。,。
191
- # BOS/EOS,*,*,*,*,*,*,*,*
192
- #
196
+ # `FFI::Struct` layout associative array, please use the attribute
197
+ # accessors. In the case of `:surface` and `:feature`, `mecab`
198
+ # returns the raw bytes, so `natto` will convert that into
199
+ # a string using the default encoding.
193
200
  class MeCabNode < MeCabStruct
194
- attr_accessor :surface, :feature
201
+ # @return [String] morpheme surface value.
202
+ attr_accessor :surface
203
+ # @return [String] corresponding feature value.
204
+ attr_accessor :feature
205
+ # @return [FFI::Pointer] pointer to MeCab node struct.
195
206
  attr_reader :pointer
196
207
 
197
- # Normal `mecab` node defined in the dictionary.
208
+ # Normal `mecab` node defined in the dictionary, c.f. `stat`.
198
209
  NOR_NODE = 0
199
- # Unknown `mecab` node not defined in the dictionary.
210
+ # Unknown `mecab` node not defined in the dictionary, c.f. `stat`.
200
211
  UNK_NODE = 1
201
- # Virtual node representing the beginning of the sentence.
212
+ # Virtual node representing the beginning of the sentence, c.f. `stat`.
202
213
  BOS_NODE = 2
203
- # Virutual node representing the end of the sentence.
214
+ # Virtual node representing the end of the sentence, c.f. `stat`.
204
215
  EOS_NODE = 3
205
- # Virtual node representing the end of an N-Best `mecab` node list.
216
+ # Virtual node representing the end of an N-Best `mecab` node list, c.f. `stat`.
206
217
  EON_NODE = 4
207
218
 
208
219
  layout :prev, :pointer,
@@ -227,25 +238,11 @@ module Natto
227
238
  :prob, :float,
228
239
  :wcost, :short,
229
240
  :cost, :long
230
-
231
- #if RUBY_VERSION.to_f < 1.9
232
- # alias_method :deprecated_id, :id
233
- # # `Object#id` override defined when `RUBY_VERSION` is
234
- # # older than 1.9. This is a hack to avoid the `Object#id`
235
- # # deprecation warning thrown up in Ruby 1.8.7.
236
- # #
237
- # # <i>This method override is not defined when the Ruby interpreter
238
- # # is 1.9 or greater.</i>
239
- # # @return [Fixnum] `mecab` node id
240
- # def id
241
- # self[:id]
242
- # end
243
- #end
244
241
 
245
242
  # Initializes this node instance.
246
243
  # Sets the `MeCab` feature value for this node.
247
244
  #
248
- # @param [FFI::Pointer]
245
+ # @param [FFI::Pointer] ptr pointer to MeCab node
249
246
  def initialize(ptr)
250
247
  super(ptr)
251
248
  @pointer = ptr
@@ -308,3 +305,29 @@ module Natto
308
305
  end
309
306
  end
310
307
  end
308
+
309
+ # Copyright (c) 2014-2015, Brooke M. Fujita.
310
+ # All rights reserved.
311
+ #
312
+ # Redistribution and use in source and binary forms, with or without
313
+ # modification, are permitted provided that the following conditions are met:
314
+ #
315
+ # * Redistributions of source code must retain the above
316
+ # copyright notice, this list of conditions and the
317
+ # following disclaimer.
318
+ #
319
+ # * Redistributions in binary form must reproduce the above
320
+ # copyright notice, this list of conditions and the
321
+ # following disclaimer in the documentation and/or other
322
+ # materials provided with the distribution.
323
+ #
324
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
325
+ # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
326
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
327
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
328
+ # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
329
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
330
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
331
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
332
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
333
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.