natto 0.9.6 → 0.9.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +18 -0
- data/LICENSE +13 -11
- data/README.md +233 -108
- data/lib/natto.rb +26 -0
- data/lib/natto/binding.rb +69 -25
- data/lib/natto/natto.rb +166 -72
- data/lib/natto/option_parse.rb +26 -0
- data/lib/natto/struct.rb +103 -80
- data/lib/natto/version.rb +27 -1
- metadata +12 -10
data/lib/natto/natto.rb
CHANGED
@@ -11,18 +11,18 @@ module Natto
|
|
11
11
|
#
|
12
12
|
# ## Usage
|
13
13
|
#
|
14
|
-
# require 'rubygems' if RUBY_VERSION.to_f < 1.9
|
15
14
|
# require 'natto'
|
16
15
|
#
|
17
16
|
# nm = Natto::MeCab.new('-Ochasen')
|
18
17
|
# => #<Natto::MeCab:0x28d3bdc8 \
|
19
18
|
# @tagger=#<FFI::Pointer address=0x28afb980>, \
|
19
|
+
# @libpath="/usr/local/lib/libmecab.so" \
|
20
20
|
# @options={:output_format_type=>"chasen"}, \
|
21
21
|
# @dicts=[#<Natto::DictionaryInfo:0x289a1f14 \
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
# @version=
|
22
|
+
# @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
23
|
+
# charset=utf8 \
|
24
|
+
# type=0>], \
|
25
|
+
# @version=0.996>
|
26
26
|
#
|
27
27
|
# nm.parse('凡人にしか見えねえ風景ってのがあるんだよ。') do |n|
|
28
28
|
# puts "#{n.surface}\t#{n.feature}"
|
@@ -46,8 +46,17 @@ module Natto
|
|
46
46
|
class MeCab
|
47
47
|
include Natto::Binding
|
48
48
|
include Natto::OptionParse
|
49
|
-
|
50
|
-
|
49
|
+
|
50
|
+
# @return [FFI::Pointer] pointer to MeCab tagger.
|
51
|
+
attr_reader :tagger
|
52
|
+
# @return [String] absolute filepath to MeCab library.
|
53
|
+
attr_reader :libpath
|
54
|
+
# @return [Hash] MeCab options as key-value pairs.
|
55
|
+
attr_reader :options
|
56
|
+
# @return [Array] listing of all dictionaries referenced.
|
57
|
+
attr_reader :dicts
|
58
|
+
# @return [String] `MeCab` version.
|
59
|
+
attr_reader :version
|
51
60
|
|
52
61
|
# Initializes the wrapped `mecab` instance with the
|
53
62
|
# given `options`.
|
@@ -83,12 +92,13 @@ module Natto
|
|
83
92
|
# nm = Natto::MeCab.new(:node_format=>'%m¥t%f[7]¥n')
|
84
93
|
# => #<Natto::MeCab:0x28d2ae10
|
85
94
|
# @tagger=#<FFI::Pointer address=0x28a97980>, \
|
95
|
+
# @libpath="/usr/local/lib/libmecab.so", \
|
86
96
|
# @options={:node_format=>"%m¥t%f[7]¥n"}, \
|
87
97
|
# @dicts=[#<Natto::DictionaryInfo:0x28d2a85c \
|
88
|
-
#
|
89
|
-
#
|
90
|
-
#
|
91
|
-
# @version=
|
98
|
+
# @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic" \
|
99
|
+
# charset=utf8, \
|
100
|
+
# type=0>] \
|
101
|
+
# @version=0.996>
|
92
102
|
#
|
93
103
|
# puts nm.parse('才能とは求める人間に与えられるものではない。')
|
94
104
|
# 才能 サイノウ
|
@@ -106,14 +116,15 @@ module Natto
|
|
106
116
|
# 。 。
|
107
117
|
# EOS
|
108
118
|
#
|
109
|
-
# @param [Hash
|
119
|
+
# @param [Hash, String] options MeCab options for tagger
|
110
120
|
# @raise [MeCabError] if `mecab` cannot be initialized with the given `options`
|
111
121
|
def initialize(options={})
|
112
122
|
@options = self.class.parse_mecab_options(options)
|
113
123
|
@dicts = []
|
114
124
|
|
115
|
-
opt_str
|
116
|
-
@tagger
|
125
|
+
opt_str = self.class.build_options_str(@options)
|
126
|
+
@tagger = self.class.mecab_new2(opt_str)
|
127
|
+
@libpath = self.class.find_library
|
117
128
|
raise MeCabError.new("Could not initialize MeCab with options: '#{opt_str}'") if @tagger.address == 0x0
|
118
129
|
|
119
130
|
self.mecab_set_theta(@tagger, @options[:theta]) if @options[:theta]
|
@@ -123,69 +134,79 @@ module Natto
|
|
123
134
|
|
124
135
|
# Set mecab parsing implementations for N-best and regular parsing,
|
125
136
|
# for both parsing as string and yielding a node object
|
126
|
-
# N-Best parsing implementations
|
127
137
|
if @options[:nbest] && @options[:nbest] > 1
|
138
|
+
# N-Best parsing implementations
|
128
139
|
self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
|
129
|
-
|
130
|
-
|
140
|
+
|
141
|
+
@parse_tostr = lambda do |text|
|
142
|
+
retval = self.mecab_nbest_sparse_tostr(@tagger, @options[:nbest], text) ||
|
131
143
|
raise(MeCabError.new(self.mecab_strerror(@tagger)))
|
144
|
+
retval.force_encoding(Encoding.default_external)
|
132
145
|
end
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
self.mecab_nbest_init(@tagger,
|
146
|
+
|
147
|
+
@parse_tonodes = lambda do |text|
|
148
|
+
Enumerator.new do |y|
|
149
|
+
self.mecab_nbest_init(@tagger, text)
|
137
150
|
n = self.mecab_nbest_next_tonode(@tagger)
|
138
151
|
raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
|
152
|
+
|
139
153
|
nlen = @options[:nbest]
|
140
154
|
nlen.times do |i|
|
141
|
-
s =
|
155
|
+
s = text.bytes.to_a
|
142
156
|
while n && n.address != 0x0
|
143
157
|
mn = Natto::MeCabNode.new(n)
|
144
|
-
|
145
|
-
if !
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
158
|
+
# ignore BOS nodes, since mecab does so
|
159
|
+
if !mn.is_bos?
|
160
|
+
s = s.drop_while {|e| (e==0xa || e==0x20)}
|
161
|
+
if !s.empty?
|
162
|
+
sarr = []
|
163
|
+
mn.length.times { sarr << s.shift }
|
164
|
+
surf = sarr.pack('C*')
|
165
|
+
mn.surface = surf.force_encoding(Encoding.default_external)
|
166
|
+
end
|
167
|
+
if @options[:output_format_type] || @options[:node_format]
|
168
|
+
mn.feature = self.mecab_format_node(@tagger, n).force_encoding(Encoding.default_external)
|
169
|
+
end
|
170
|
+
y.yield mn
|
154
171
|
end
|
155
|
-
nodes << mn if !mn.is_bos?
|
156
172
|
n = mn.next
|
157
173
|
end
|
158
174
|
n = self.mecab_nbest_next_tonode(@tagger)
|
159
175
|
end
|
160
176
|
end
|
161
|
-
return nodes
|
162
177
|
end
|
163
178
|
else
|
164
179
|
# default parsing implementations
|
165
|
-
@parse_tostr = lambda do |
|
166
|
-
|
180
|
+
@parse_tostr = lambda do |text|
|
181
|
+
retval = self.mecab_sparse_tostr(@tagger, text) ||
|
167
182
|
raise(MeCabError.new(self.mecab_strerror(@tagger)))
|
183
|
+
retval.force_encoding(Encoding.default_external)
|
168
184
|
end
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
s = str.bytes.to_a
|
176
|
-
while n && n.address!=0x0
|
185
|
+
|
186
|
+
@parse_tonodes = lambda do |text|
|
187
|
+
Enumerator.new do |y|
|
188
|
+
n = self.mecab_sparse_tonode(@tagger, text)
|
189
|
+
raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
|
190
|
+
|
177
191
|
mn = Natto::MeCabNode.new(n)
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
mn
|
182
|
-
|
183
|
-
|
192
|
+
n = mn.next if mn.next.address!=0x0
|
193
|
+
s = text.bytes.to_a
|
194
|
+
while n && n.address!=0x0
|
195
|
+
mn = Natto::MeCabNode.new(n)
|
196
|
+
s = s.drop_while {|e| (e==0xa || e==0x20)}
|
197
|
+
if !s.empty?
|
198
|
+
sarr = []
|
199
|
+
mn.length.times { sarr << s.shift }
|
200
|
+
surf = sarr.pack('C*')
|
201
|
+
mn.surface = surf.force_encoding(Encoding.default_external)
|
202
|
+
end
|
203
|
+
if @options[:output_format_type] || @options[:node_format]
|
204
|
+
mn.feature = self.mecab_format_node(@tagger, n).force_encoding(Encoding.default_external)
|
205
|
+
end
|
206
|
+
y.yield mn
|
207
|
+
n = mn.next
|
184
208
|
end
|
185
|
-
nodes << mn
|
186
|
-
n = mn.next
|
187
209
|
end
|
188
|
-
return nodes
|
189
210
|
end
|
190
211
|
end
|
191
212
|
|
@@ -199,24 +220,48 @@ module Natto
|
|
199
220
|
ObjectSpace.define_finalizer(self, self.class.create_free_proc(@tagger))
|
200
221
|
end
|
201
222
|
|
202
|
-
# Parses the given
|
203
|
-
#
|
223
|
+
# Parses the given `text`, returning the MeCab output as a single string.
|
224
|
+
# If a block is passed to this method, then node parsing will be used
|
225
|
+
# and each node yielded to the given block.
|
204
226
|
#
|
205
|
-
# @param [String]
|
206
|
-
# @return parsing result from `mecab`
|
207
|
-
# @raise [MeCabError] if the `mecab` tagger cannot parse the given
|
208
|
-
# @raise [ArgumentError] if the given string `
|
227
|
+
# @param [String] text
|
228
|
+
# @return [String] parsing result from `mecab`
|
229
|
+
# @raise [MeCabError] if the `mecab` tagger cannot parse the given `text`
|
230
|
+
# @raise [ArgumentError] if the given string `text` argument is `nil`
|
209
231
|
# @see MeCabNode
|
210
|
-
def parse(
|
211
|
-
raise ArgumentError.new '
|
232
|
+
def parse(text)
|
233
|
+
raise ArgumentError.new 'Text to parse cannot be nil' if text.nil?
|
212
234
|
if block_given?
|
213
|
-
|
214
|
-
nodes.each {|n| yield n }
|
235
|
+
@parse_tonodes.call(text).each {|n| yield n }
|
215
236
|
else
|
216
|
-
@parse_tostr.call(
|
237
|
+
@parse_tostr.call(text)
|
217
238
|
end
|
218
239
|
end
|
219
240
|
|
241
|
+
# Parses the given string `text`, returning an
|
242
|
+
# {http://www.ruby-doc.org/core-2.1.5/Enumerator.html Enumerator} that may be
|
243
|
+
# used to iterate over the resulting {MeCabNode} objects. This is more
|
244
|
+
# efficient than parsing to a simple string, since each node's
|
245
|
+
# information will not be materialized all at once as it is with
|
246
|
+
# string output.
|
247
|
+
#
|
248
|
+
# MeCab nodes contain much more detailed information about
|
249
|
+
# the morpheme. Node-formatting may also be used to customize
|
250
|
+
# the resulting node's `feature` attribute.
|
251
|
+
#
|
252
|
+
# @param [String] text
|
253
|
+
# @return [Enumerator] of MeCabNode instances
|
254
|
+
# @raise [MeCabError] if the `mecab` tagger cannot parse the given `text`
|
255
|
+
# @raise [ArgumentError] if the given string `text` argument is `nil`
|
256
|
+
# @see MeCabNode
|
257
|
+
# @see http://www.ruby-doc.org/core-2.1.5/Enumerator.html
|
258
|
+
def enum_parse(text)
|
259
|
+
raise ArgumentError.new 'Text to parse cannot be nil' if text.nil?
|
260
|
+
@parse_tonodes.call(text)
|
261
|
+
end
|
262
|
+
|
263
|
+
# @deprecated
|
264
|
+
# DEPRECATED: use enum_parse instead, this convenience method is useless.
|
220
265
|
# Parses the given string `str`, and returns
|
221
266
|
# a list of `mecab` nodes.
|
222
267
|
# @param [String] str
|
@@ -225,10 +270,14 @@ module Natto
|
|
225
270
|
# @raise [ArgumentError] if the given string `str` argument is `nil`
|
226
271
|
# @see MeCabNode
|
227
272
|
def parse_as_nodes(str)
|
273
|
+
$stderr.puts 'DEPRECATED: use enum_parse instead'
|
274
|
+
$stderr.puts ' This method will be removed in the next release!'
|
228
275
|
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
229
276
|
@parse_tonodes.call(str)
|
230
277
|
end
|
231
278
|
|
279
|
+
# @deprecated
|
280
|
+
# DEPRECATED: use enum_parse instead, this convenience method is useless.
|
232
281
|
# Parses the given string `str`, and returns
|
233
282
|
# a list of `mecab` result strings.
|
234
283
|
# @param [String] str
|
@@ -236,19 +285,29 @@ module Natto
|
|
236
285
|
# @raise [MeCabError] if the `mecab` tagger cannot parse the given string `str`
|
237
286
|
# @raise [ArgumentError] if the given string `str` argument is `nil`
|
238
287
|
def parse_as_strings(str)
|
288
|
+
$stderr.puts 'DEPRECATED: use enum_parse instead'
|
289
|
+
$stderr.puts ' This method will be removed in the next release!'
|
239
290
|
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
240
|
-
@parse_tostr.call(str).
|
291
|
+
@parse_tostr.call(str).lines.to_a
|
241
292
|
end
|
242
293
|
|
243
|
-
#
|
294
|
+
# @deprecated
|
295
|
+
# DEPRECATED: use enum_parse instead, this convenience method is useless.
|
296
|
+
# @param [String] str
|
297
|
+
# @return [Array] of parsed `mecab` nodes.
|
244
298
|
def readnodes(str)
|
245
|
-
$
|
299
|
+
$stderr.puts 'DEPRECATED: use enum_parse instead'
|
300
|
+
$stderr.puts ' This method will be removed in the next release!'
|
246
301
|
parse_as_nodes(str)
|
247
302
|
end
|
248
303
|
|
249
|
-
#
|
304
|
+
# @deprecated
|
305
|
+
# DEPRECATED: use enum_parse instead, this convenience method is useless.
|
306
|
+
# @param [String] str
|
307
|
+
# @return [Array] of parsed `mecab` result strings.
|
250
308
|
def readlines(str)
|
251
|
-
$
|
309
|
+
$stderr.puts 'DEPRECATED: use enum_parse instead'
|
310
|
+
$stderr.puts ' This method will be removed in the next release!'
|
252
311
|
parse_as_strings(str)
|
253
312
|
end
|
254
313
|
|
@@ -257,18 +316,27 @@ module Natto
|
|
257
316
|
#
|
258
317
|
# - encoded object id
|
259
318
|
# - underlying FFI pointer to the `mecab` tagger
|
319
|
+
# - real file path to `mecab` library
|
260
320
|
# - options hash
|
261
321
|
# - list of dictionaries
|
262
322
|
# - MeCab version
|
263
323
|
#
|
264
|
-
# @return [String] encoded object id, underlying FFI pointer,
|
324
|
+
# @return [String] encoded object id, underlying FFI pointer,
|
325
|
+
# file path to `mecab` library, options hash,
|
326
|
+
# list of dictionaries and MeCab version
|
265
327
|
def to_s
|
266
|
-
|
328
|
+
[ super.chop,
|
329
|
+
"@tagger=#{@tagger},",
|
330
|
+
"@libpath=\"#{@libpath}\",",
|
331
|
+
"@options=#{@options.inspect},",
|
332
|
+
"@dicts=#{@dicts.to_s},",
|
333
|
+
"@version=#{@version.to_s}>" ].join(' ')
|
267
334
|
end
|
268
335
|
|
269
336
|
# Overrides `Object#inspect`.
|
270
337
|
#
|
271
|
-
# @return [String] encoded object id, FFI pointer, options hash,
|
338
|
+
# @return [String] encoded object id, FFI pointer, options hash,
|
339
|
+
# list of dictionaries, and MeCab version
|
272
340
|
# @see #to_s
|
273
341
|
def inspect
|
274
342
|
self.to_s
|
@@ -293,3 +361,29 @@ module Natto
|
|
293
361
|
# for the `Natto` module.
|
294
362
|
class MeCabError < RuntimeError; end
|
295
363
|
end
|
364
|
+
|
365
|
+
# Copyright (c) 2014-2015, Brooke M. Fujita.
|
366
|
+
# All rights reserved.
|
367
|
+
#
|
368
|
+
# Redistribution and use in source and binary forms, with or without
|
369
|
+
# modification, are permitted provided that the following conditions are met:
|
370
|
+
#
|
371
|
+
# * Redistributions of source code must retain the above
|
372
|
+
# copyright notice, this list of conditions and the
|
373
|
+
# following disclaimer.
|
374
|
+
#
|
375
|
+
# * Redistributions in binary form must reproduce the above
|
376
|
+
# copyright notice, this list of conditions and the
|
377
|
+
# following disclaimer in the documentation and/or other
|
378
|
+
# materials provided with the distribution.
|
379
|
+
#
|
380
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
381
|
+
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
382
|
+
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
383
|
+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
384
|
+
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
385
|
+
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
386
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
387
|
+
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
388
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
389
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/lib/natto/option_parse.rb
CHANGED
@@ -115,3 +115,29 @@ module Natto
|
|
115
115
|
end
|
116
116
|
end
|
117
117
|
end
|
118
|
+
|
119
|
+
# Copyright (c) 2014-2015, Brooke M. Fujita.
|
120
|
+
# All rights reserved.
|
121
|
+
#
|
122
|
+
# Redistribution and use in source and binary forms, with or without
|
123
|
+
# modification, are permitted provided that the following conditions are met:
|
124
|
+
#
|
125
|
+
# * Redistributions of source code must retain the above
|
126
|
+
# copyright notice, this list of conditions and the
|
127
|
+
# following disclaimer.
|
128
|
+
#
|
129
|
+
# * Redistributions in binary form must reproduce the above
|
130
|
+
# copyright notice, this list of conditions and the
|
131
|
+
# following disclaimer in the documentation and/or other
|
132
|
+
# materials provided with the distribution.
|
133
|
+
#
|
134
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
135
|
+
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
136
|
+
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
137
|
+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
138
|
+
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
139
|
+
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
140
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
141
|
+
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
142
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
143
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/lib/natto/struct.rb
CHANGED
@@ -21,21 +21,21 @@ module Natto
|
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
-
# `DictionaryInfo` is a wrapper for `struct mecab_dictionary_info_t`
|
25
|
-
#
|
24
|
+
# `DictionaryInfo` is a wrapper for the `struct mecab_dictionary_info_t`
|
25
|
+
# structure holding the `MeCab` instance's related dictionary information.
|
26
26
|
#
|
27
27
|
# Values for the `mecab` dictionary attributes may be
|
28
28
|
# obtained by using the following `Symbol`s as keys
|
29
29
|
# to the layout associative array of `FFI::Struct` members.
|
30
30
|
#
|
31
|
-
# - :filename
|
32
|
-
# - :charset
|
33
|
-
# - :size
|
34
|
-
# - :type
|
35
|
-
# - :lsize
|
36
|
-
# - :rsize
|
37
|
-
# - :version
|
38
|
-
# - :next
|
31
|
+
# - :filename - filename of dictionary; on Windows, filename is stored in UTF-8 encoding
|
32
|
+
# - :charset - character set of the dictionary
|
33
|
+
# - :size - number of words contained in dictionary
|
34
|
+
# - :type - dictionary type: 0 (system), 1 (user-defined), 2 (unknown)
|
35
|
+
# - :lsize - left attributes size
|
36
|
+
# - :rsize - right attributes size
|
37
|
+
# - :version - version of this dictionary
|
38
|
+
# - :next - pointer to next dictionary in list
|
39
39
|
#
|
40
40
|
# ## Usage
|
41
41
|
# `mecab` dictionary attributes can be obtained by
|
@@ -44,16 +44,20 @@ module Natto
|
|
44
44
|
# nm = Natto::MeCab.new
|
45
45
|
#
|
46
46
|
# sysdic = nm.dicts.first
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
47
|
+
#
|
48
|
+
# # display the real path to the mecab dictionary
|
49
|
+
# puts sysdic.filepath
|
50
|
+
# => /usr/local/lib/mecab/dic/ipadic/sys.dic
|
50
51
|
#
|
51
52
|
# puts sysdic.charset
|
52
|
-
# =>
|
53
|
+
# => utf8
|
53
54
|
#
|
54
55
|
# puts sysdic.is_sysdic?
|
55
56
|
# => true
|
56
57
|
class DictionaryInfo < MeCabStruct
|
58
|
+
# @return [String] Absolute filepath to MeCab dictionary.
|
59
|
+
attr_reader :filepath
|
60
|
+
|
57
61
|
# System dictionary.
|
58
62
|
SYS_DIC = 0
|
59
63
|
# User dictionary.
|
@@ -83,17 +87,31 @@ module Natto
|
|
83
87
|
end
|
84
88
|
end
|
85
89
|
|
90
|
+
# Initializes this dictionary info instance.
|
91
|
+
# Sets the `DictionaryInfo` filepath value.
|
92
|
+
#
|
93
|
+
# @param [FFI::Pointer] ptr pointer to MeCab dictionary
|
94
|
+
def initialize(ptr)
|
95
|
+
super(ptr)
|
96
|
+
|
97
|
+
@filepath = File.absolute_path(self[:filename])
|
98
|
+
end
|
99
|
+
|
86
100
|
# Returns human-readable details for this `mecab` dictionary.
|
87
101
|
# Overrides `Object#to_s`.
|
88
102
|
#
|
89
103
|
# - encoded object id
|
90
|
-
# - dictionary
|
91
|
-
# - full-path dictionary filename
|
104
|
+
# - real file path to this dictionary
|
92
105
|
# - dictionary charset
|
106
|
+
# - dictionary type
|
93
107
|
#
|
94
|
-
# @return [String] encoded object id,
|
108
|
+
# @return [String] encoded object id, file path to dictionary, charset and
|
109
|
+
# type
|
95
110
|
def to_s
|
96
|
-
|
111
|
+
[ super.chop,
|
112
|
+
"@filepath=\"#{@filepath}\",",
|
113
|
+
"charset=#{self.charset},",
|
114
|
+
"type=#{self.type}>" ].join(' ')
|
97
115
|
end
|
98
116
|
|
99
117
|
# Overrides `Object#inspect`.
|
@@ -123,35 +141,35 @@ module Natto
|
|
123
141
|
end
|
124
142
|
end
|
125
143
|
|
126
|
-
# `MeCabNode` is a wrapper for the
|
127
|
-
# the parsed `node`.
|
144
|
+
# `MeCabNode` is a wrapper for the `struct mecab_node_t`
|
145
|
+
# structure holding the parsed `node`.
|
128
146
|
#
|
129
147
|
# Values for the `mecab` node attributes may be
|
130
148
|
# obtained by using the following `Symbol`s as keys
|
131
149
|
# to the layout associative array of `FFI::Struct` members.
|
132
150
|
#
|
133
|
-
# - :prev
|
134
|
-
# - :next
|
135
|
-
# - :enext
|
136
|
-
# - :bnext
|
137
|
-
# - :rpath
|
138
|
-
# - :lpath
|
139
|
-
# - :surface
|
140
|
-
# - :feature
|
141
|
-
# - :id
|
142
|
-
# - :length
|
143
|
-
# - :rlength
|
144
|
-
# - :rcAttr
|
145
|
-
# - :lcAttr
|
146
|
-
# - :posid
|
147
|
-
# - :char_type
|
148
|
-
# - :stat
|
149
|
-
# - :isbest
|
150
|
-
# - :alpha
|
151
|
-
# - :beta
|
152
|
-
# - :prob
|
153
|
-
# - :wcost
|
154
|
-
# - :cost
|
151
|
+
# - :prev - pointer to previous node
|
152
|
+
# - :next - pointer to next node
|
153
|
+
# - :enext - pointer to the node which ends at the same position
|
154
|
+
# - :bnext - pointer to the node which starts at the same position
|
155
|
+
# - :rpath - pointer to the right path; nil if MECAB_ONE_BEST mode
|
156
|
+
# - :lpath - pointer to the left path; nil if MECAB_ONE_BEST mode
|
157
|
+
# - :surface - surface string; length may be obtained with length/rlength members
|
158
|
+
# - :feature - feature string
|
159
|
+
# - :id - unique node id
|
160
|
+
# - :length - length of surface form
|
161
|
+
# - :rlength - length of the surface form including white space before the morph
|
162
|
+
# - :rcAttr - right attribute id
|
163
|
+
# - :lcAttr - left attribute id
|
164
|
+
# - :posid - part-of-speech id
|
165
|
+
# - :char_type - character type
|
166
|
+
# - :stat - node status; 0 (NOR), 1 (UNK), 2 (BOS), 3 (EOS), 4 (EON)
|
167
|
+
# - :isbest - 1 if this node is best node
|
168
|
+
# - :alpha - forward accumulative log summation, only with marginal probability flag
|
169
|
+
# - :beta - backward accumulative log summation, only with marginal probability flag
|
170
|
+
# - :prob - marginal probability, only with marginal probability flag
|
171
|
+
# - :wcost - word cost
|
172
|
+
# - :cost - best accumulative cost from bos node to this node
|
155
173
|
#
|
156
174
|
# ## Usage
|
157
175
|
# An instance of `MeCabNode` is yielded to the block
|
@@ -164,7 +182,7 @@ module Natto
|
|
164
182
|
# puts "#{n.surface}\t#{n.cost}" if n.is_nor?
|
165
183
|
# end
|
166
184
|
# 卓球 2874
|
167
|
-
#
|
185
|
+
# なんて 4398
|
168
186
|
# 死ぬ 9261
|
169
187
|
# まで 9386
|
170
188
|
# の 10007
|
@@ -173,36 +191,29 @@ module Natto
|
|
173
191
|
# よ 14396
|
174
192
|
# 。 10194
|
175
193
|
#
|
176
|
-
#
|
194
|
+
# While it is also possible to use the `Symbol` for the
|
177
195
|
# `mecab` node member to index into the
|
178
|
-
# `FFI::Struct` layout associative array
|
179
|
-
#
|
180
|
-
#
|
181
|
-
#
|
182
|
-
# 動詞,自立,*,*,五段・ワ行促音便,基本形,笑う,ワラウ,ワラウ
|
183
|
-
# 助詞,接続助詞,*,*,*,*,と,ト,ト
|
184
|
-
# 副詞,一般,*,*,*,*,結構,ケッコウ,ケッコー
|
185
|
-
# 形容詞,自立,*,*,形容詞・イ段,基本形,可愛い,カワイイ,カワイイ
|
186
|
-
# 名詞,一般,*,*,*,*,顔,カオ,カオ
|
187
|
-
# 動詞,自立,*,*,サ変・スル,連用形,する,シ,シ
|
188
|
-
# 動詞,非自立,*,*,一段,体言接続特殊,てる,テン,テン
|
189
|
-
# 助詞,終助詞,*,*,*,*,よ,ヨ,ヨ
|
190
|
-
# 記号,句点,*,*,*,*,。,。,。
|
191
|
-
# BOS/EOS,*,*,*,*,*,*,*,*
|
192
|
-
#
|
196
|
+
# `FFI::Struct` layout associative array, please use the attribute
|
197
|
+
# accessors. In the case of `:surface` and `:feature`, `mecab`
|
198
|
+
# returns the raw bytes, so `natto` will convert that into
|
199
|
+
# a string using the default encoding.
|
193
200
|
class MeCabNode < MeCabStruct
|
194
|
-
|
201
|
+
# @return [String] morpheme surface value.
|
202
|
+
attr_accessor :surface
|
203
|
+
# @return [String] corresponding feature value.
|
204
|
+
attr_accessor :feature
|
205
|
+
# @return [FFI::Pointer] pointer to MeCab node struct.
|
195
206
|
attr_reader :pointer
|
196
207
|
|
197
|
-
# Normal `mecab` node defined in the dictionary.
|
208
|
+
# Normal `mecab` node defined in the dictionary, c.f. `stat`.
|
198
209
|
NOR_NODE = 0
|
199
|
-
# Unknown `mecab` node not defined in the dictionary.
|
210
|
+
# Unknown `mecab` node not defined in the dictionary, c.f. `stat`.
|
200
211
|
UNK_NODE = 1
|
201
|
-
# Virtual node representing the beginning of the sentence.
|
212
|
+
# Virtual node representing the beginning of the sentence, c.f. `stat`.
|
202
213
|
BOS_NODE = 2
|
203
|
-
# Virutual node representing the end of the sentence.
|
214
|
+
# Virtual node representing the end of the sentence, c.f. `stat`.
|
204
215
|
EOS_NODE = 3
|
205
|
-
# Virtual node representing the end of an N-Best `mecab` node list.
|
216
|
+
# Virtual node representing the end of an N-Best `mecab` node list, c.f. `stat`.
|
206
217
|
EON_NODE = 4
|
207
218
|
|
208
219
|
layout :prev, :pointer,
|
@@ -227,25 +238,11 @@ module Natto
|
|
227
238
|
:prob, :float,
|
228
239
|
:wcost, :short,
|
229
240
|
:cost, :long
|
230
|
-
|
231
|
-
#if RUBY_VERSION.to_f < 1.9
|
232
|
-
# alias_method :deprecated_id, :id
|
233
|
-
# # `Object#id` override defined when `RUBY_VERSION` is
|
234
|
-
# # older than 1.9. This is a hack to avoid the `Object#id`
|
235
|
-
# # deprecation warning thrown up in Ruby 1.8.7.
|
236
|
-
# #
|
237
|
-
# # <i>This method override is not defined when the Ruby interpreter
|
238
|
-
# # is 1.9 or greater.</i>
|
239
|
-
# # @return [Fixnum] `mecab` node id
|
240
|
-
# def id
|
241
|
-
# self[:id]
|
242
|
-
# end
|
243
|
-
#end
|
244
241
|
|
245
242
|
# Initializes this node instance.
|
246
243
|
# Sets the `MeCab` feature value for this node.
|
247
244
|
#
|
248
|
-
# @param [FFI::Pointer]
|
245
|
+
# @param [FFI::Pointer] ptr pointer to MeCab node
|
249
246
|
def initialize(ptr)
|
250
247
|
super(ptr)
|
251
248
|
@pointer = ptr
|
@@ -308,3 +305,29 @@ module Natto
|
|
308
305
|
end
|
309
306
|
end
|
310
307
|
end
|
308
|
+
|
309
|
+
# Copyright (c) 2014-2015, Brooke M. Fujita.
|
310
|
+
# All rights reserved.
|
311
|
+
#
|
312
|
+
# Redistribution and use in source and binary forms, with or without
|
313
|
+
# modification, are permitted provided that the following conditions are met:
|
314
|
+
#
|
315
|
+
# * Redistributions of source code must retain the above
|
316
|
+
# copyright notice, this list of conditions and the
|
317
|
+
# following disclaimer.
|
318
|
+
#
|
319
|
+
# * Redistributions in binary form must reproduce the above
|
320
|
+
# copyright notice, this list of conditions and the
|
321
|
+
# following disclaimer in the documentation and/or other
|
322
|
+
# materials provided with the distribution.
|
323
|
+
#
|
324
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
325
|
+
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
326
|
+
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
327
|
+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
328
|
+
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
329
|
+
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
330
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
331
|
+
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
332
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
333
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|