natto 0.9.6 → 0.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +18 -0
- data/LICENSE +13 -11
- data/README.md +233 -108
- data/lib/natto.rb +26 -0
- data/lib/natto/binding.rb +69 -25
- data/lib/natto/natto.rb +166 -72
- data/lib/natto/option_parse.rb +26 -0
- data/lib/natto/struct.rb +103 -80
- data/lib/natto/version.rb +27 -1
- metadata +12 -10
data/lib/natto/natto.rb
CHANGED
@@ -11,18 +11,18 @@ module Natto
|
|
11
11
|
#
|
12
12
|
# ## Usage
|
13
13
|
#
|
14
|
-
# require 'rubygems' if RUBY_VERSION.to_f < 1.9
|
15
14
|
# require 'natto'
|
16
15
|
#
|
17
16
|
# nm = Natto::MeCab.new('-Ochasen')
|
18
17
|
# => #<Natto::MeCab:0x28d3bdc8 \
|
19
18
|
# @tagger=#<FFI::Pointer address=0x28afb980>, \
|
19
|
+
# @libpath="/usr/local/lib/libmecab.so", \
|
20
20
|
# @options={:output_format_type=>"chasen"}, \
|
21
21
|
# @dicts=[#<Natto::DictionaryInfo:0x289a1f14 \
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
# @version=
|
22
|
+
# @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
23
|
+
# charset=utf8, \
|
24
|
+
# type=0>], \
|
25
|
+
# @version=0.996>
|
26
26
|
#
|
27
27
|
# nm.parse('凡人にしか見えねえ風景ってのがあるんだよ。') do |n|
|
28
28
|
# puts "#{n.surface}\t#{n.feature}"
|
@@ -46,8 +46,17 @@ module Natto
|
|
46
46
|
class MeCab
|
47
47
|
include Natto::Binding
|
48
48
|
include Natto::OptionParse
|
49
|
-
|
50
|
-
|
49
|
+
|
50
|
+
# @return [FFI::Pointer] pointer to MeCab tagger.
|
51
|
+
attr_reader :tagger
|
52
|
+
# @return [String] absolute filepath to MeCab library.
|
53
|
+
attr_reader :libpath
|
54
|
+
# @return [Hash] MeCab options as key-value pairs.
|
55
|
+
attr_reader :options
|
56
|
+
# @return [Array] listing of all dictionaries referenced.
|
57
|
+
attr_reader :dicts
|
58
|
+
# @return [String] `MeCab` version.
|
59
|
+
attr_reader :version
|
51
60
|
|
52
61
|
# Initializes the wrapped `mecab` instance with the
|
53
62
|
# given `options`.
|
@@ -83,12 +92,13 @@ module Natto
|
|
83
92
|
# nm = Natto::MeCab.new(:node_format=>'%m¥t%f[7]¥n')
|
84
93
|
# => #<Natto::MeCab:0x28d2ae10
|
85
94
|
# @tagger=#<FFI::Pointer address=0x28a97980>, \
|
95
|
+
# @libpath="/usr/local/lib/libmecab.so", \
|
86
96
|
# @options={:node_format=>"%m¥t%f[7]¥n"}, \
|
87
97
|
# @dicts=[#<Natto::DictionaryInfo:0x28d2a85c \
|
88
|
-
#
|
89
|
-
#
|
90
|
-
#
|
91
|
-
# @version=
|
98
|
+
# @filepath="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
99
|
+
# charset=utf8, \
|
100
|
+
# type=0>], \
|
101
|
+
# @version=0.996>
|
92
102
|
#
|
93
103
|
# puts nm.parse('才能とは求める人間に与えられるものではない。')
|
94
104
|
# 才能 サイノウ
|
@@ -106,14 +116,15 @@ module Natto
|
|
106
116
|
# 。 。
|
107
117
|
# EOS
|
108
118
|
#
|
109
|
-
# @param [Hash
|
119
|
+
# @param [Hash, String] options MeCab options for tagger
|
110
120
|
# @raise [MeCabError] if `mecab` cannot be initialized with the given `options`
|
111
121
|
def initialize(options={})
|
112
122
|
@options = self.class.parse_mecab_options(options)
|
113
123
|
@dicts = []
|
114
124
|
|
115
|
-
opt_str
|
116
|
-
@tagger
|
125
|
+
opt_str = self.class.build_options_str(@options)
|
126
|
+
@tagger = self.class.mecab_new2(opt_str)
|
127
|
+
@libpath = self.class.find_library
|
117
128
|
raise MeCabError.new("Could not initialize MeCab with options: '#{opt_str}'") if @tagger.address == 0x0
|
118
129
|
|
119
130
|
self.mecab_set_theta(@tagger, @options[:theta]) if @options[:theta]
|
@@ -123,69 +134,79 @@ module Natto
|
|
123
134
|
|
124
135
|
# Set mecab parsing implementations for N-best and regular parsing,
|
125
136
|
# for both parsing as string and yielding a node object
|
126
|
-
# N-Best parsing implementations
|
127
137
|
if @options[:nbest] && @options[:nbest] > 1
|
138
|
+
# N-Best parsing implementations
|
128
139
|
self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
|
129
|
-
|
130
|
-
|
140
|
+
|
141
|
+
@parse_tostr = lambda do |text|
|
142
|
+
retval = self.mecab_nbest_sparse_tostr(@tagger, @options[:nbest], text) ||
|
131
143
|
raise(MeCabError.new(self.mecab_strerror(@tagger)))
|
144
|
+
retval.force_encoding(Encoding.default_external)
|
132
145
|
end
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
self.mecab_nbest_init(@tagger,
|
146
|
+
|
147
|
+
@parse_tonodes = lambda do |text|
|
148
|
+
Enumerator.new do |y|
|
149
|
+
self.mecab_nbest_init(@tagger, text)
|
137
150
|
n = self.mecab_nbest_next_tonode(@tagger)
|
138
151
|
raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
|
152
|
+
|
139
153
|
nlen = @options[:nbest]
|
140
154
|
nlen.times do |i|
|
141
|
-
s =
|
155
|
+
s = text.bytes.to_a
|
142
156
|
while n && n.address != 0x0
|
143
157
|
mn = Natto::MeCabNode.new(n)
|
144
|
-
|
145
|
-
if !
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
158
|
+
# ignore BOS nodes, since mecab does so
|
159
|
+
if !mn.is_bos?
|
160
|
+
s = s.drop_while {|e| (e==0xa || e==0x20)}
|
161
|
+
if !s.empty?
|
162
|
+
sarr = []
|
163
|
+
mn.length.times { sarr << s.shift }
|
164
|
+
surf = sarr.pack('C*')
|
165
|
+
mn.surface = surf.force_encoding(Encoding.default_external)
|
166
|
+
end
|
167
|
+
if @options[:output_format_type] || @options[:node_format]
|
168
|
+
mn.feature = self.mecab_format_node(@tagger, n).force_encoding(Encoding.default_external)
|
169
|
+
end
|
170
|
+
y.yield mn
|
154
171
|
end
|
155
|
-
nodes << mn if !mn.is_bos?
|
156
172
|
n = mn.next
|
157
173
|
end
|
158
174
|
n = self.mecab_nbest_next_tonode(@tagger)
|
159
175
|
end
|
160
176
|
end
|
161
|
-
return nodes
|
162
177
|
end
|
163
178
|
else
|
164
179
|
# default parsing implementations
|
165
|
-
@parse_tostr = lambda do |
|
166
|
-
|
180
|
+
@parse_tostr = lambda do |text|
|
181
|
+
retval = self.mecab_sparse_tostr(@tagger, text) ||
|
167
182
|
raise(MeCabError.new(self.mecab_strerror(@tagger)))
|
183
|
+
retval.force_encoding(Encoding.default_external)
|
168
184
|
end
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
s = str.bytes.to_a
|
176
|
-
while n && n.address!=0x0
|
185
|
+
|
186
|
+
@parse_tonodes = lambda do |text|
|
187
|
+
Enumerator.new do |y|
|
188
|
+
n = self.mecab_sparse_tonode(@tagger, text)
|
189
|
+
raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
|
190
|
+
|
177
191
|
mn = Natto::MeCabNode.new(n)
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
mn
|
182
|
-
|
183
|
-
|
192
|
+
n = mn.next if mn.next.address!=0x0
|
193
|
+
s = text.bytes.to_a
|
194
|
+
while n && n.address!=0x0
|
195
|
+
mn = Natto::MeCabNode.new(n)
|
196
|
+
s = s.drop_while {|e| (e==0xa || e==0x20)}
|
197
|
+
if !s.empty?
|
198
|
+
sarr = []
|
199
|
+
mn.length.times { sarr << s.shift }
|
200
|
+
surf = sarr.pack('C*')
|
201
|
+
mn.surface = surf.force_encoding(Encoding.default_external)
|
202
|
+
end
|
203
|
+
if @options[:output_format_type] || @options[:node_format]
|
204
|
+
mn.feature = self.mecab_format_node(@tagger, n).force_encoding(Encoding.default_external)
|
205
|
+
end
|
206
|
+
y.yield mn
|
207
|
+
n = mn.next
|
184
208
|
end
|
185
|
-
nodes << mn
|
186
|
-
n = mn.next
|
187
209
|
end
|
188
|
-
return nodes
|
189
210
|
end
|
190
211
|
end
|
191
212
|
|
@@ -199,24 +220,48 @@ module Natto
|
|
199
220
|
ObjectSpace.define_finalizer(self, self.class.create_free_proc(@tagger))
|
200
221
|
end
|
201
222
|
|
202
|
-
# Parses the given
|
203
|
-
#
|
223
|
+
# Parses the given `text`, returning the MeCab output as a single string.
|
224
|
+
# If a block is passed to this method, then node parsing will be used
|
225
|
+
# and each node yielded to the given block.
|
204
226
|
#
|
205
|
-
# @param [String]
|
206
|
-
# @return parsing result from `mecab`
|
207
|
-
# @raise [MeCabError] if the `mecab` tagger cannot parse the given
|
208
|
-
# @raise [ArgumentError] if the given string `
|
227
|
+
# @param [String] text
|
228
|
+
# @return [String] parsing result from `mecab`
|
229
|
+
# @raise [MeCabError] if the `mecab` tagger cannot parse the given `text`
|
230
|
+
# @raise [ArgumentError] if the given string `text` argument is `nil`
|
209
231
|
# @see MeCabNode
|
210
|
-
def parse(
|
211
|
-
raise ArgumentError.new '
|
232
|
+
def parse(text)
|
233
|
+
raise ArgumentError.new 'Text to parse cannot be nil' if text.nil?
|
212
234
|
if block_given?
|
213
|
-
|
214
|
-
nodes.each {|n| yield n }
|
235
|
+
@parse_tonodes.call(text).each {|n| yield n }
|
215
236
|
else
|
216
|
-
@parse_tostr.call(
|
237
|
+
@parse_tostr.call(text)
|
217
238
|
end
|
218
239
|
end
|
219
240
|
|
241
|
+
# Parses the given string `text`, returning an
|
242
|
+
# {http://www.ruby-doc.org/core-2.1.5/Enumerator.html Enumerator} that may be
|
243
|
+
# used to iterate over the resulting {MeCabNode} objects. This is more
|
244
|
+
# efficient than parsing to a simple string, since each node's
|
245
|
+
# information will not be materialized all at once, as it is with
|
246
|
+
# string output.
|
247
|
+
#
|
248
|
+
# MeCab nodes contain much more detailed information about
|
249
|
+
# the morpheme. Node-formatting may also be used to customize
|
250
|
+
# the resulting node's `feature` attribute.
|
251
|
+
#
|
252
|
+
# @param [String] text
|
253
|
+
# @return [Enumerator] of MeCabNode instances
|
254
|
+
# @raise [MeCabError] if the `mecab` tagger cannot parse the given `text`
|
255
|
+
# @raise [ArgumentError] if the given string `text` argument is `nil`
|
256
|
+
# @see MeCabNode
|
257
|
+
# @see http://www.ruby-doc.org/core-2.1.5/Enumerator.html
|
258
|
+
def enum_parse(text)
|
259
|
+
raise ArgumentError.new 'Text to parse cannot be nil' if text.nil?
|
260
|
+
@parse_tonodes.call(text)
|
261
|
+
end
|
262
|
+
|
263
|
+
# @deprecated
|
264
|
+
# DEPRECATED: use enum_parse instead, this convenience method is useless.
|
220
265
|
# Parses the given string `str`, and returns
|
221
266
|
# a list of `mecab` nodes.
|
222
267
|
# @param [String] str
|
@@ -225,10 +270,14 @@ module Natto
|
|
225
270
|
# @raise [ArgumentError] if the given string `str` argument is `nil`
|
226
271
|
# @see MeCabNode
|
227
272
|
def parse_as_nodes(str)
|
273
|
+
$stderr.puts 'DEPRECATED: use enum_parse instead'
|
274
|
+
$stderr.puts ' This method will be removed in the next release!'
|
228
275
|
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
229
276
|
@parse_tonodes.call(str)
|
230
277
|
end
|
231
278
|
|
279
|
+
# @deprecated
|
280
|
+
# DEPRECATED: use enum_parse instead, this convenience method is useless.
|
232
281
|
# Parses the given string `str`, and returns
|
233
282
|
# a list of `mecab` result strings.
|
234
283
|
# @param [String] str
|
@@ -236,19 +285,29 @@ module Natto
|
|
236
285
|
# @raise [MeCabError] if the `mecab` tagger cannot parse the given string `str`
|
237
286
|
# @raise [ArgumentError] if the given string `str` argument is `nil`
|
238
287
|
def parse_as_strings(str)
|
288
|
+
$stderr.puts 'DEPRECATED: use enum_parse instead'
|
289
|
+
$stderr.puts ' This method will be removed in the next release!'
|
239
290
|
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
240
|
-
@parse_tostr.call(str).
|
291
|
+
@parse_tostr.call(str).lines.to_a
|
241
292
|
end
|
242
293
|
|
243
|
-
#
|
294
|
+
# @deprecated
|
295
|
+
# DEPRECATED: use enum_parse instead, this convenience method is useless.
|
296
|
+
# @param [String] str
|
297
|
+
# @return [Array] of parsed `mecab` nodes.
|
244
298
|
def readnodes(str)
|
245
|
-
$
|
299
|
+
$stderr.puts 'DEPRECATED: use enum_parse instead'
|
300
|
+
$stderr.puts ' This method will be removed in the next release!'
|
246
301
|
parse_as_nodes(str)
|
247
302
|
end
|
248
303
|
|
249
|
-
#
|
304
|
+
# @deprecated
|
305
|
+
# DEPRECATED: use enum_parse instead, this convenience method is useless.
|
306
|
+
# @param [String] str
|
307
|
+
# @return [Array] of parsed `mecab` result strings.
|
250
308
|
def readlines(str)
|
251
|
-
$
|
309
|
+
$stderr.puts 'DEPRECATED: use enum_parse instead'
|
310
|
+
$stderr.puts ' This method will be removed in the next release!'
|
252
311
|
parse_as_strings(str)
|
253
312
|
end
|
254
313
|
|
@@ -257,18 +316,27 @@ module Natto
|
|
257
316
|
#
|
258
317
|
# - encoded object id
|
259
318
|
# - underlying FFI pointer to the `mecab` tagger
|
319
|
+
# - real file path to `mecab` library
|
260
320
|
# - options hash
|
261
321
|
# - list of dictionaries
|
262
322
|
# - MeCab version
|
263
323
|
#
|
264
|
-
# @return [String] encoded object id, underlying FFI pointer,
|
324
|
+
# @return [String] encoded object id, underlying FFI pointer,
|
325
|
+
# file path to `mecab` library, options hash,
|
326
|
+
# list of dictionaries and MeCab version
|
265
327
|
def to_s
|
266
|
-
|
328
|
+
[ super.chop,
|
329
|
+
"@tagger=#{@tagger},",
|
330
|
+
"@libpath=\"#{@libpath}\",",
|
331
|
+
"@options=#{@options.inspect},",
|
332
|
+
"@dicts=#{@dicts.to_s},",
|
333
|
+
"@version=#{@version.to_s}>" ].join(' ')
|
267
334
|
end
|
268
335
|
|
269
336
|
# Overrides `Object#inspect`.
|
270
337
|
#
|
271
|
-
# @return [String] encoded object id, FFI pointer, options hash,
|
338
|
+
# @return [String] encoded object id, FFI pointer, library file path, options hash,
|
339
|
+
# list of dictionaries, and MeCab version
|
272
340
|
# @see #to_s
|
273
341
|
def inspect
|
274
342
|
self.to_s
|
@@ -293,3 +361,29 @@ module Natto
|
|
293
361
|
# for the `Natto` module.
|
294
362
|
class MeCabError < RuntimeError; end
|
295
363
|
end
|
364
|
+
|
365
|
+
# Copyright (c) 2014-2015, Brooke M. Fujita.
|
366
|
+
# All rights reserved.
|
367
|
+
#
|
368
|
+
# Redistribution and use in source and binary forms, with or without
|
369
|
+
# modification, are permitted provided that the following conditions are met:
|
370
|
+
#
|
371
|
+
# * Redistributions of source code must retain the above
|
372
|
+
# copyright notice, this list of conditions and the
|
373
|
+
# following disclaimer.
|
374
|
+
#
|
375
|
+
# * Redistributions in binary form must reproduce the above
|
376
|
+
# copyright notice, this list of conditions and the
|
377
|
+
# following disclaimer in the documentation and/or other
|
378
|
+
# materials provided with the distribution.
|
379
|
+
#
|
380
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
381
|
+
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
382
|
+
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
383
|
+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
384
|
+
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
385
|
+
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
386
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
387
|
+
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
388
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
389
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/lib/natto/option_parse.rb
CHANGED
@@ -115,3 +115,29 @@ module Natto
|
|
115
115
|
end
|
116
116
|
end
|
117
117
|
end
|
118
|
+
|
119
|
+
# Copyright (c) 2014-2015, Brooke M. Fujita.
|
120
|
+
# All rights reserved.
|
121
|
+
#
|
122
|
+
# Redistribution and use in source and binary forms, with or without
|
123
|
+
# modification, are permitted provided that the following conditions are met:
|
124
|
+
#
|
125
|
+
# * Redistributions of source code must retain the above
|
126
|
+
# copyright notice, this list of conditions and the
|
127
|
+
# following disclaimer.
|
128
|
+
#
|
129
|
+
# * Redistributions in binary form must reproduce the above
|
130
|
+
# copyright notice, this list of conditions and the
|
131
|
+
# following disclaimer in the documentation and/or other
|
132
|
+
# materials provided with the distribution.
|
133
|
+
#
|
134
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
135
|
+
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
136
|
+
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
137
|
+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
138
|
+
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
139
|
+
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
140
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
141
|
+
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
142
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
143
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/lib/natto/struct.rb
CHANGED
@@ -21,21 +21,21 @@ module Natto
|
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
-
# `DictionaryInfo` is a wrapper for `struct mecab_dictionary_info_t`
|
25
|
-
#
|
24
|
+
# `DictionaryInfo` is a wrapper for the `struct mecab_dictionary_info_t`
|
25
|
+
# structure holding the `MeCab` instance's related dictionary information.
|
26
26
|
#
|
27
27
|
# Values for the `mecab` dictionary attributes may be
|
28
28
|
# obtained by using the following `Symbol`s as keys
|
29
29
|
# to the layout associative array of `FFI::Struct` members.
|
30
30
|
#
|
31
|
-
# - :filename
|
32
|
-
# - :charset
|
33
|
-
# - :size
|
34
|
-
# - :type
|
35
|
-
# - :lsize
|
36
|
-
# - :rsize
|
37
|
-
# - :version
|
38
|
-
# - :next
|
31
|
+
# - :filename - filename of dictionary; on Windows, filename is stored in UTF-8 encoding
|
32
|
+
# - :charset - character set of the dictionary
|
33
|
+
# - :size - number of words contained in dictionary
|
34
|
+
# - :type - dictionary type: 0 (system), 1 (user-defined), 2 (unknown)
|
35
|
+
# - :lsize - left attributes size
|
36
|
+
# - :rsize - right attributes size
|
37
|
+
# - :version - version of this dictionary
|
38
|
+
# - :next - pointer to next dictionary in list
|
39
39
|
#
|
40
40
|
# ## Usage
|
41
41
|
# `mecab` dictionary attributes can be obtained by
|
@@ -44,16 +44,20 @@ module Natto
|
|
44
44
|
# nm = Natto::MeCab.new
|
45
45
|
#
|
46
46
|
# sysdic = nm.dicts.first
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
47
|
+
#
|
48
|
+
# # display the real path to the mecab dictionary
|
49
|
+
# puts sysdic.filepath
|
50
|
+
# => /usr/local/lib/mecab/dic/ipadic/sys.dic
|
50
51
|
#
|
51
52
|
# puts sysdic.charset
|
52
|
-
# =>
|
53
|
+
# => utf8
|
53
54
|
#
|
54
55
|
# puts sysdic.is_sysdic?
|
55
56
|
# => true
|
56
57
|
class DictionaryInfo < MeCabStruct
|
58
|
+
# @return [String] Absolute filepath to MeCab dictionary.
|
59
|
+
attr_reader :filepath
|
60
|
+
|
57
61
|
# System dictionary.
|
58
62
|
SYS_DIC = 0
|
59
63
|
# User dictionary.
|
@@ -83,17 +87,31 @@ module Natto
|
|
83
87
|
end
|
84
88
|
end
|
85
89
|
|
90
|
+
# Initializes this dictionary info instance.
|
91
|
+
# Sets the `DictionaryInfo` filepath value.
|
92
|
+
#
|
93
|
+
# @param [FFI::Pointer] ptr pointer to MeCab dictionary
|
94
|
+
def initialize(ptr)
|
95
|
+
super(ptr)
|
96
|
+
|
97
|
+
@filepath = File.absolute_path(self[:filename])
|
98
|
+
end
|
99
|
+
|
86
100
|
# Returns human-readable details for this `mecab` dictionary.
|
87
101
|
# Overrides `Object#to_s`.
|
88
102
|
#
|
89
103
|
# - encoded object id
|
90
|
-
# - dictionary
|
91
|
-
# - full-path dictionary filename
|
104
|
+
# - real file path to this dictionary
|
92
105
|
# - dictionary charset
|
106
|
+
# - dictionary type
|
93
107
|
#
|
94
|
-
# @return [String] encoded object id,
|
108
|
+
# @return [String] encoded object id, file path to dictionary, charset and
|
109
|
+
# type
|
95
110
|
def to_s
|
96
|
-
|
111
|
+
[ super.chop,
|
112
|
+
"@filepath=\"#{@filepath}\",",
|
113
|
+
"charset=#{self.charset},",
|
114
|
+
"type=#{self.type}>" ].join(' ')
|
97
115
|
end
|
98
116
|
|
99
117
|
# Overrides `Object#inspect`.
|
@@ -123,35 +141,35 @@ module Natto
|
|
123
141
|
end
|
124
142
|
end
|
125
143
|
|
126
|
-
# `MeCabNode` is a wrapper for the
|
127
|
-
# the parsed `node`.
|
144
|
+
# `MeCabNode` is a wrapper for the `struct mecab_node_t`
|
145
|
+
# structure holding the parsed `node`.
|
128
146
|
#
|
129
147
|
# Values for the `mecab` node attributes may be
|
130
148
|
# obtained by using the following `Symbol`s as keys
|
131
149
|
# to the layout associative array of `FFI::Struct` members.
|
132
150
|
#
|
133
|
-
# - :prev
|
134
|
-
# - :next
|
135
|
-
# - :enext
|
136
|
-
# - :bnext
|
137
|
-
# - :rpath
|
138
|
-
# - :lpath
|
139
|
-
# - :surface
|
140
|
-
# - :feature
|
141
|
-
# - :id
|
142
|
-
# - :length
|
143
|
-
# - :rlength
|
144
|
-
# - :rcAttr
|
145
|
-
# - :lcAttr
|
146
|
-
# - :posid
|
147
|
-
# - :char_type
|
148
|
-
# - :stat
|
149
|
-
# - :isbest
|
150
|
-
# - :alpha
|
151
|
-
# - :beta
|
152
|
-
# - :prob
|
153
|
-
# - :wcost
|
154
|
-
# - :cost
|
151
|
+
# - :prev - pointer to previous node
|
152
|
+
# - :next - pointer to next node
|
153
|
+
# - :enext - pointer to the node which ends at the same position
|
154
|
+
# - :bnext - pointer to the node which starts at the same position
|
155
|
+
# - :rpath - pointer to the right path; nil if MECAB_ONE_BEST mode
|
156
|
+
# - :lpath - pointer to the left path; nil if MECAB_ONE_BEST mode
|
157
|
+
# - :surface - surface string; length may be obtained with length/rlength members
|
158
|
+
# - :feature - feature string
|
159
|
+
# - :id - unique node id
|
160
|
+
# - :length - length of surface form
|
161
|
+
# - :rlength - length of the surface form including white space before the morph
|
162
|
+
# - :rcAttr - right attribute id
|
163
|
+
# - :lcAttr - left attribute id
|
164
|
+
# - :posid - part-of-speech id
|
165
|
+
# - :char_type - character type
|
166
|
+
# - :stat - node status; 0 (NOR), 1 (UNK), 2 (BOS), 3 (EOS), 4 (EON)
|
167
|
+
# - :isbest - 1 if this node is best node
|
168
|
+
# - :alpha - forward accumulative log summation, only with marginal probability flag
|
169
|
+
# - :beta - backward accumulative log summation, only with marginal probability flag
|
170
|
+
# - :prob - marginal probability, only with marginal probability flag
|
171
|
+
# - :wcost - word cost
|
172
|
+
# - :cost - best accumulative cost from bos node to this node
|
155
173
|
#
|
156
174
|
# ## Usage
|
157
175
|
# An instance of `MeCabNode` is yielded to the block
|
@@ -164,7 +182,7 @@ module Natto
|
|
164
182
|
# puts "#{n.surface}\t#{n.cost}" if n.is_nor?
|
165
183
|
# end
|
166
184
|
# 卓球 2874
|
167
|
-
#
|
185
|
+
# なんて 4398
|
168
186
|
# 死ぬ 9261
|
169
187
|
# まで 9386
|
170
188
|
# の 10007
|
@@ -173,36 +191,29 @@ module Natto
|
|
173
191
|
# よ 14396
|
174
192
|
# 。 10194
|
175
193
|
#
|
176
|
-
#
|
194
|
+
# While it is also possible to use the `Symbol` for the
|
177
195
|
# `mecab` node member to index into the
|
178
|
-
# `FFI::Struct` layout associative array
|
179
|
-
#
|
180
|
-
#
|
181
|
-
#
|
182
|
-
# 動詞,自立,*,*,五段・ワ行促音便,基本形,笑う,ワラウ,ワラウ
|
183
|
-
# 助詞,接続助詞,*,*,*,*,と,ト,ト
|
184
|
-
# 副詞,一般,*,*,*,*,結構,ケッコウ,ケッコー
|
185
|
-
# 形容詞,自立,*,*,形容詞・イ段,基本形,可愛い,カワイイ,カワイイ
|
186
|
-
# 名詞,一般,*,*,*,*,顔,カオ,カオ
|
187
|
-
# 動詞,自立,*,*,サ変・スル,連用形,する,シ,シ
|
188
|
-
# 動詞,非自立,*,*,一段,体言接続特殊,てる,テン,テン
|
189
|
-
# 助詞,終助詞,*,*,*,*,よ,ヨ,ヨ
|
190
|
-
# 記号,句点,*,*,*,*,。,。,。
|
191
|
-
# BOS/EOS,*,*,*,*,*,*,*,*
|
192
|
-
#
|
196
|
+
# `FFI::Struct` layout associative array, please use the attribute
|
197
|
+
# accessors. In the case of `:surface` and `:feature`, `mecab`
|
198
|
+
# returns the raw bytes, so `natto` will convert that into
|
199
|
+
# a string using the default encoding.
|
193
200
|
class MeCabNode < MeCabStruct
|
194
|
-
|
201
|
+
# @return [String] morpheme surface value.
|
202
|
+
attr_accessor :surface
|
203
|
+
# @return [String] corresponding feature value.
|
204
|
+
attr_accessor :feature
|
205
|
+
# @return [FFI::Pointer] pointer to MeCab node struct.
|
195
206
|
attr_reader :pointer
|
196
207
|
|
197
|
-
# Normal `mecab` node defined in the dictionary.
|
208
|
+
# Normal `mecab` node defined in the dictionary, c.f. `stat`.
|
198
209
|
NOR_NODE = 0
|
199
|
-
# Unknown `mecab` node not defined in the dictionary.
|
210
|
+
# Unknown `mecab` node not defined in the dictionary, c.f. `stat`.
|
200
211
|
UNK_NODE = 1
|
201
|
-
# Virtual node representing the beginning of the sentence.
|
212
|
+
# Virtual node representing the beginning of the sentence, c.f. `stat`.
|
202
213
|
BOS_NODE = 2
|
203
|
-
# Virutual node representing the end of the sentence.
|
214
|
+
# Virtual node representing the end of the sentence, c.f. `stat`.
|
204
215
|
EOS_NODE = 3
|
205
|
-
# Virtual node representing the end of an N-Best `mecab` node list.
|
216
|
+
# Virtual node representing the end of an N-Best `mecab` node list, c.f. `stat`.
|
206
217
|
EON_NODE = 4
|
207
218
|
|
208
219
|
layout :prev, :pointer,
|
@@ -227,25 +238,11 @@ module Natto
|
|
227
238
|
:prob, :float,
|
228
239
|
:wcost, :short,
|
229
240
|
:cost, :long
|
230
|
-
|
231
|
-
#if RUBY_VERSION.to_f < 1.9
|
232
|
-
# alias_method :deprecated_id, :id
|
233
|
-
# # `Object#id` override defined when `RUBY_VERSION` is
|
234
|
-
# # older than 1.9. This is a hack to avoid the `Object#id`
|
235
|
-
# # deprecation warning thrown up in Ruby 1.8.7.
|
236
|
-
# #
|
237
|
-
# # <i>This method override is not defined when the Ruby interpreter
|
238
|
-
# # is 1.9 or greater.</i>
|
239
|
-
# # @return [Fixnum] `mecab` node id
|
240
|
-
# def id
|
241
|
-
# self[:id]
|
242
|
-
# end
|
243
|
-
#end
|
244
241
|
|
245
242
|
# Initializes this node instance.
|
246
243
|
# Sets the `MeCab` feature value for this node.
|
247
244
|
#
|
248
|
-
# @param [FFI::Pointer]
|
245
|
+
# @param [FFI::Pointer] ptr pointer to MeCab node
|
249
246
|
def initialize(ptr)
|
250
247
|
super(ptr)
|
251
248
|
@pointer = ptr
|
@@ -308,3 +305,29 @@ module Natto
|
|
308
305
|
end
|
309
306
|
end
|
310
307
|
end
|
308
|
+
|
309
|
+
# Copyright (c) 2014-2015, Brooke M. Fujita.
|
310
|
+
# All rights reserved.
|
311
|
+
#
|
312
|
+
# Redistribution and use in source and binary forms, with or without
|
313
|
+
# modification, are permitted provided that the following conditions are met:
|
314
|
+
#
|
315
|
+
# * Redistributions of source code must retain the above
|
316
|
+
# copyright notice, this list of conditions and the
|
317
|
+
# following disclaimer.
|
318
|
+
#
|
319
|
+
# * Redistributions in binary form must reproduce the above
|
320
|
+
# copyright notice, this list of conditions and the
|
321
|
+
# following disclaimer in the documentation and/or other
|
322
|
+
# materials provided with the distribution.
|
323
|
+
#
|
324
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
325
|
+
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
326
|
+
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
327
|
+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
328
|
+
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
329
|
+
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
330
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
331
|
+
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
332
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
333
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|