natto 0.9.5 → 0.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG +11 -0
- data/README.md +17 -22
- data/lib/natto.rb +1 -599
- data/lib/natto/binding.rb +36 -17
- data/lib/natto/natto.rb +295 -0
- data/lib/natto/option_parse.rb +36 -28
- data/lib/natto/struct.rb +310 -0
- data/lib/natto/version.rb +16 -16
- metadata +30 -33
- data/lib/natto/utils.rb +0 -16
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 14ae50169a93b3810e5ae2258187d71f80d8be1a
|
4
|
+
data.tar.gz: 8b89a9be35a76123c1955d85913c7665636235b1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 996501067f551d7e7497155f5a128b256adc858c4ebcb127218eca393398bd70cd26911de4516e18fcf7450c48c00da598d5a54e6fe5e91025907121c2f6fc8c
|
7
|
+
data.tar.gz: 308595234aa422803e3a0ac09573601a3e8360837918964e46c1151cb65f03121acc85c789d0c18be86428d7947c4025055925f6dc03b709b6fdc734b55e5c74
|
data/CHANGELOG
CHANGED
@@ -1,5 +1,16 @@
|
|
1
1
|
## CHANGELOG
|
2
2
|
|
3
|
+
- __2013/07/07__: 0.9.6 release.
|
4
|
+
- Upgrade to mecab 0.996
|
5
|
+
- Adding support for partial parsing mode (-p / --partial)
|
6
|
+
- Adding support for marginal probability output mode (-m / --marginal)
|
7
|
+
- Adding support for maximum grouping size for unknown words (-M / --max-grouping-size)
|
8
|
+
- Outputting warning message for deprecation of :lattice_level option
|
9
|
+
- Requiring ffi 1.9.0 or greater
|
10
|
+
- Dropping support for Ruby 1.8.7
|
11
|
+
- Migrating to minitest
|
12
|
+
- Removing automatic library load for Cygwin platform (does not compile)
|
13
|
+
|
3
14
|
- __2012/09/16__: 0.9.5 release.
|
4
15
|
- Fixed [Issue 9: trimされていない文字列のparse](https://bitbucket.org/buruzaemon/natto/issue/9/trim-parse)
|
5
16
|
- Fixed [Issue 10: BUG Segmentation Fault](https://bitbucket.org/buruzaemon/natto/issue/10/bug-segmentation-fault)
|
data/README.md
CHANGED
@@ -11,31 +11,31 @@ You can learn more about [natto at bitbucket](https://bitbucket.org/buruzaemon/n
|
|
11
11
|
## Requirements
|
12
12
|
natto requires the following:
|
13
13
|
|
14
|
-
- [MeCab _0.
|
15
|
-
- [ffi
|
16
|
-
- Ruby _1.
|
14
|
+
- [MeCab _0.996_](http://code.google.com/p/mecab/downloads/list)
|
15
|
+
- [ffi _1.9.0 or greater_](http://rubygems.org/gems/ffi)
|
16
|
+
- Ruby _1.9 or greater_
|
17
17
|
|
18
|
-
## Installation on *NIX/Mac
|
18
|
+
## Installation on *NIX/Mac
|
19
19
|
Install natto with the following gem command:
|
20
20
|
|
21
21
|
gem install natto
|
22
22
|
|
23
|
-
This will automatically install the [ffi](http://rubygems.org/gems/ffi) rubygem, which natto uses to bind to the
|
23
|
+
This will automatically install the [ffi](http://rubygems.org/gems/ffi) rubygem, which natto uses to bind to the `mecab` library.
|
24
24
|
|
25
25
|
## Installation on Windows
|
26
|
-
However, if you are using a CRuby on Windows, then you will first need to install the [RubyInstaller Development Kit (DevKit)](https://github.com/oneclick/rubyinstaller/wiki/Development-Kit), a MSYS/MinGW based toolkit than enables your Windows Ruby installation to build many of the native C/C++ extensions available, including
|
26
|
+
However, if you are using a CRuby on Windows, then you will first need to install the [RubyInstaller Development Kit (DevKit)](https://github.com/oneclick/rubyinstaller/wiki/Development-Kit), a MSYS/MinGW based toolkit that enables your Windows Ruby installation to build many of the native C/C++ extensions available, including `ffi`.
|
27
27
|
|
28
28
|
1. Download the latest release for RubyInstaller for Windows platforms and the corresponding DevKit from the [RubyInstaller for Windows downloads page](http://rubyinstaller.org/downloads/).
|
29
|
-
2. After installing RubyInstaller for Windows, double-click on the DevKit-tdm installer
|
30
|
-
3. Open a command window under
|
31
|
-
4. Next, execute:
|
32
|
-
5. Install
|
29
|
+
2. After installing RubyInstaller for Windows, double-click on the DevKit-tdm installer `.exe`, and expand the contents to an appropriate location, for example `C:\devkit`.
|
30
|
+
3. Open a command window under `C:\devkit`, and execute: `ruby dk.rb init`. This will locate all known ruby installations, and add them to `C:\devkit\config.yml`.
|
31
|
+
4. Next, execute: `ruby dk.rb install`, which will add the DevKit to all of the installed rubies listed in your `C:\devkit\config.yml`. Now you should be able to install and build the `ffi` rubygem correctly on your Windows-installed ruby.
|
32
|
+
5. Install `natto` with:
|
33
33
|
|
34
34
|
gem install natto
|
35
35
|
|
36
36
|
## Configuration
|
37
|
-
- natto will try to locate the
|
38
|
-
- In case of
|
37
|
+
- natto will try to locate the `mecab` library based upon its runtime environment.
|
38
|
+
- In case of `LoadError`, please set the `MECAB_PATH` environment variable to the exact name/path to your `mecab` library.
|
39
39
|
|
40
40
|
e.g., for bash on UNIX/Linux
|
41
41
|
|
@@ -45,16 +45,11 @@ e.g., on Windows
|
|
45
45
|
|
46
46
|
set MECAB_PATH=C:\Program Files\MeCab\bin\libmecab.dll
|
47
47
|
|
48
|
-
e.g., for Cygwin
|
49
|
-
|
50
|
-
export MECAB_PATH=cygmecab-1
|
51
|
-
|
52
48
|
e.g., from within a Ruby program
|
53
49
|
|
54
|
-
ENV['MECAB_PATH']
|
50
|
+
ENV['MECAB_PATH']='/usr/local/lib/libmecab.so'
|
55
51
|
|
56
52
|
## Usage
|
57
|
-
require 'rubygems' if RUBY_VERSION.to_f < 1.9
|
58
53
|
require 'natto'
|
59
54
|
|
60
55
|
nm = Natto::MeCab.new
|
@@ -65,10 +60,10 @@ e.g., from within a Ruby program
|
|
65
60
|
type="0", \
|
66
61
|
filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
67
62
|
charset="utf8">], \
|
68
|
-
@version="0.
|
63
|
+
@version="0.996">
|
69
64
|
|
70
65
|
puts nm.version
|
71
|
-
=> "0.
|
66
|
+
=> "0.996"
|
72
67
|
|
73
68
|
sysdic = nm.dicts.first
|
74
69
|
|
@@ -103,8 +98,8 @@ e.g., from within a Ruby program
|
|
103
98
|
- Fork the project.
|
104
99
|
- Start a feature/bugfix branch.
|
105
100
|
- Commit and push until you are happy with your contribution.
|
106
|
-
- Make sure to add tests for it. This is important so I don't break it in a future version unintentionally. I use [
|
107
|
-
- Please try not to mess with the Rakefile,
|
101
|
+
- Make sure to add tests for it. This is important so I don't break it in a future version unintentionally. I use [MiniTest::Unit](http://rubydoc.info/gems/minitest/MiniTest/Unit) as it is very natural and easy-to-use.
|
102
|
+
- Please try not to mess with the Rakefile, CHANGELOG, or version. If you must have your own version, that is fine, but please isolate to its own commit so I can cherry-pick around it.
|
108
103
|
|
109
104
|
## Changelog
|
110
105
|
Please see the {file:CHANGELOG} for this gem's release history.
|
data/lib/natto.rb
CHANGED
@@ -1,599 +1 @@
|
|
1
|
-
|
2
|
-
require 'rubygems' if RUBY_VERSION.to_f < 1.9
|
3
|
-
require 'natto/binding'
|
4
|
-
require 'natto/option_parse'
|
5
|
-
require 'natto/utils'
|
6
|
-
|
7
|
-
module Natto
|
8
|
-
require 'ffi'
|
9
|
-
|
10
|
-
# <tt>MeCab</tt> is a wrapper class for the <tt>mecab</tt> tagger.
|
11
|
-
# Options to the <tt>mecab</tt> tagger are passed in as a string
|
12
|
-
# (MeCab command-line style) or as a Ruby-style hash at
|
13
|
-
# initialization.
|
14
|
-
#
|
15
|
-
# <h2>Usage</h2>
|
16
|
-
#
|
17
|
-
# require 'rubygems' if RUBY_VERSION.to_f < 1.9
|
18
|
-
# require 'natto'
|
19
|
-
#
|
20
|
-
# nm = Natto::MeCab.new('-Ochasen')
|
21
|
-
# => #<Natto::MeCab:0x28d3bdc8 \
|
22
|
-
# @tagger=#<FFI::Pointer address=0x28afb980>, \
|
23
|
-
# @options={:output_format_type=>"chasen"}, \
|
24
|
-
# @dicts=[#<Natto::DictionaryInfo:0x289a1f14 \
|
25
|
-
# type="0", \
|
26
|
-
# filename="/usr/local/lib/mecab/dic/ipadic/sys.dic", \
|
27
|
-
# charset="utf8">], \
|
28
|
-
# @version="0.994">
|
29
|
-
#
|
30
|
-
# nm.parse('凡人にしか見えねえ風景ってのがあるんだよ。') do |n|
|
31
|
-
# puts "#{n.surface}\t#{n.feature}"
|
32
|
-
# end
|
33
|
-
# 凡人 名詞,一般,*,*,*,*,凡人,ボンジン,ボンジン
|
34
|
-
# に 助詞,格助詞,一般,*,*,*,に,ニ,ニ
|
35
|
-
# しか 助詞,係助詞,*,*,*,*,しか,シカ,シカ
|
36
|
-
# 見え 動詞,自立,*,*,一段,未然形,見える,ミエ,ミエ
|
37
|
-
# ねえ 助動詞,*,*,*,特殊・ナイ,音便基本形,ない,ネエ,ネー
|
38
|
-
# 風景 名詞,一般,*,*,*,*,風景,フウケイ,フーケイ
|
39
|
-
# って 助詞,格助詞,連語,*,*,*,って,ッテ,ッテ
|
40
|
-
# の 名詞,非自立,一般,*,*,*,の,ノ,ノ
|
41
|
-
# が 助詞,格助詞,一般,*,*,*,が,ガ,ガ
|
42
|
-
# ある 動詞,自立,*,*,五段・ラ行,基本形,ある,アル,アル
|
43
|
-
# ん 名詞,非自立,一般,*,*,*,ん,ン,ン
|
44
|
-
# だ 助動詞,*,*,*,特殊・ダ,基本形,だ,ダ,ダ
|
45
|
-
# よ 助詞,終助詞,*,*,*,*,よ,ヨ,ヨ
|
46
|
-
# 。 記号,句点,*,*,*,*,。,。,。
|
47
|
-
# BOS/EOS,*,*,*,*,*,*,*,*
|
48
|
-
#
|
49
|
-
class MeCab
|
50
|
-
include Natto::Binding
|
51
|
-
include Natto::OptionParse
|
52
|
-
include Natto::Utils
|
53
|
-
|
54
|
-
attr_reader :tagger, :options, :dicts, :version
|
55
|
-
|
56
|
-
# Initializes the wrapped <tt>mecab</tt> instance with the
|
57
|
-
# given <tt>options</tt>.
|
58
|
-
#
|
59
|
-
# Options supported are:
|
60
|
-
#
|
61
|
-
# - :rcfile -- resource file
|
62
|
-
# - :dicdir -- system dicdir
|
63
|
-
# - :userdic -- user dictionary
|
64
|
-
# - :lattice_level -- lattice information level (DEPRECATED)
|
65
|
-
# - :output_format_type -- output format type (wakati, chasen, yomi, etc.)
|
66
|
-
# - :all_morphs -- output all morphs (default false)
|
67
|
-
# - :nbest -- output N best results (integer, default 1), requires lattice level >= 1
|
68
|
-
# - :node_format -- user-defined node format
|
69
|
-
# - :unk_format -- user-defined unknown node format
|
70
|
-
# - :bos_format -- user-defined beginning-of-sentence format
|
71
|
-
# - :eos_format -- user-defined end-of-sentence format
|
72
|
-
# - :eon_format -- user-defined end-of-NBest format
|
73
|
-
# - :unk_feature -- feature for unknown word
|
74
|
-
# - :input_buffer_size -- set input buffer size (default 8192)
|
75
|
-
# - :allocate_sentence -- allocate new memory for input sentence
|
76
|
-
# - :theta -- temperature parameter theta (float, default 0.75)
|
77
|
-
# - :cost_factor -- cost factor (integer, default 700)
|
78
|
-
#
|
79
|
-
# <p>MeCab command-line arguments (-F) or long (--node-format) may be used in
|
80
|
-
# addition to Ruby-style <code>Hash</code>es</p>
|
81
|
-
# <i>Use single-quotes to preserve format options that contain escape chars.</i><br/>
|
82
|
-
# e.g.<br/>
|
83
|
-
#
|
84
|
-
# nm = Natto::MeCab.new(:node_format=>'%m¥t%f[7]¥n')
|
85
|
-
# => #<Natto::MeCab:0x28d2ae10
|
86
|
-
# @tagger=#<FFI::Pointer address=0x28a97980>, \
|
87
|
-
# @options={:node_format=>"%m¥t%f[7]¥n"}, \
|
88
|
-
# @dicts=[#<Natto::DictionaryInfo:0x28d2a85c \
|
89
|
-
# type="0", \
|
90
|
-
# filename="/usr/local/lib/mecab/dic/ipadic/sys.dic" \
|
91
|
-
# charset="utf8">], \
|
92
|
-
# @version="0.994">
|
93
|
-
#
|
94
|
-
# puts nm.parse('才能とは求める人間に与えられるものではない。')
|
95
|
-
# 才能 サイノウ
|
96
|
-
# と ト
|
97
|
-
# は ハ
|
98
|
-
# 求める モトメル
|
99
|
-
# 人間 ニンゲン
|
100
|
-
# に ニ
|
101
|
-
# 与え アタエ
|
102
|
-
# られる ラレル
|
103
|
-
# もの モノ
|
104
|
-
# で デ
|
105
|
-
# は ハ
|
106
|
-
# ない ナイ
|
107
|
-
# 。 。
|
108
|
-
# EOS
|
109
|
-
#
|
110
|
-
# @param [Hash or String]
|
111
|
-
# @raise [MeCabError] if <tt>mecab</tt> cannot be initialized with the given <tt>options</tt>
|
112
|
-
def initialize(options={})
|
113
|
-
@options = self.class.parse_mecab_options(options)
|
114
|
-
@dicts = []
|
115
|
-
|
116
|
-
opt_str = self.class.build_options_str(@options)
|
117
|
-
@tagger = self.mecab_new2(opt_str)
|
118
|
-
raise MeCabError.new("Could not initialize MeCab with options: '#{opt_str}'") if @tagger.address == 0x0
|
119
|
-
|
120
|
-
self.mecab_set_theta(@tagger, @options[:theta]) if @options[:theta]
|
121
|
-
self.mecab_set_lattice_level(@tagger, @options[:lattice_level]) if @options[:lattice_level]
|
122
|
-
self.mecab_set_all_morphs(@tagger, 1) if @options[:all_morphs]
|
123
|
-
|
124
|
-
# Set mecab parsing implementations for N-best and regular parsing,
|
125
|
-
# for both parsing as string and yielding a node object
|
126
|
-
# N-Best parsing implementations
|
127
|
-
if @options[:nbest] && @options[:nbest] > 1
|
128
|
-
self.mecab_set_lattice_level(@tagger, (@options[:lattice_level] || 1))
|
129
|
-
@parse_tostr = lambda do |str|
|
130
|
-
return self.mecab_nbest_sparse_tostr(@tagger, @options[:nbest], str) ||
|
131
|
-
raise(MeCabError.new(self.mecab_strerror(@tagger)))
|
132
|
-
end
|
133
|
-
@parse_tonodes = lambda do |str|
|
134
|
-
nodes = []
|
135
|
-
if @options[:nbest] && @options[:nbest] > 1
|
136
|
-
self.mecab_nbest_init(@tagger, str)
|
137
|
-
n = self.mecab_nbest_next_tonode(@tagger)
|
138
|
-
raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
|
139
|
-
nlen = @options[:nbest]
|
140
|
-
nlen.times do |i|
|
141
|
-
s = str.bytes.to_a
|
142
|
-
while n && n.address != 0x0
|
143
|
-
mn = Natto::MeCabNode.new(n)
|
144
|
-
s = s.drop_while {|e| (e==0xa || e==0x20)}
|
145
|
-
if !s.empty?
|
146
|
-
sarr = []
|
147
|
-
mn.length.times { sarr << s.shift }
|
148
|
-
surf = sarr.pack('C*')
|
149
|
-
mn.surface = self.class.force_enc(surf)
|
150
|
-
end
|
151
|
-
if @options[:output_format_type] || @options[:node_format]
|
152
|
-
mn.feature = self.class.force_enc(self.mecab_format_node(@tagger, n))
|
153
|
-
end
|
154
|
-
nodes << mn if !mn.is_bos?
|
155
|
-
n = mn.next
|
156
|
-
end
|
157
|
-
n = self.mecab_nbest_next_tonode(@tagger)
|
158
|
-
end
|
159
|
-
end
|
160
|
-
return nodes
|
161
|
-
end
|
162
|
-
else
|
163
|
-
# default parsing implementations
|
164
|
-
@parse_tostr = lambda do |str|
|
165
|
-
return self.mecab_sparse_tostr(@tagger, str) ||
|
166
|
-
raise(MeCabError.new(self.mecab_strerror(@tagger)))
|
167
|
-
end
|
168
|
-
@parse_tonodes = lambda do |str|
|
169
|
-
nodes = []
|
170
|
-
n = self.mecab_sparse_tonode(@tagger, str)
|
171
|
-
raise(MeCabError.new(self.mecab_strerror(@tagger))) if n.nil? || n.address==0x0
|
172
|
-
mn = Natto::MeCabNode.new(n)
|
173
|
-
n = mn.next if mn.next.address!=0x0
|
174
|
-
s = str.bytes.to_a
|
175
|
-
while n && n.address!=0x0
|
176
|
-
mn = Natto::MeCabNode.new(n)
|
177
|
-
s = s.drop_while {|e| (e==0xa || e==0x20)}
|
178
|
-
if !s.empty?
|
179
|
-
sarr = []
|
180
|
-
mn.length.times { sarr << s.shift }
|
181
|
-
surf = sarr.pack('C*')
|
182
|
-
mn.surface = self.class.force_enc(surf)
|
183
|
-
end
|
184
|
-
nodes << mn
|
185
|
-
n = mn.next
|
186
|
-
end
|
187
|
-
return nodes
|
188
|
-
end
|
189
|
-
end
|
190
|
-
|
191
|
-
@dicts << Natto::DictionaryInfo.new(Natto::Binding.mecab_dictionary_info(@tagger))
|
192
|
-
while @dicts.last.next.address != 0x0
|
193
|
-
@dicts << Natto::DictionaryInfo.new(@dicts.last.next)
|
194
|
-
end
|
195
|
-
|
196
|
-
@version = self.mecab_version
|
197
|
-
|
198
|
-
ObjectSpace.define_finalizer(self, self.class.create_free_proc(@tagger))
|
199
|
-
end
|
200
|
-
|
201
|
-
# Parses the given string <tt>str</tt>. If a block is passed to this method,
|
202
|
-
# then node parsing will be used and each node yielded to the given block.
|
203
|
-
#
|
204
|
-
# @param [String] str
|
205
|
-
# @return parsing result from <tt>mecab</tt>
|
206
|
-
# @raise [MeCabError] if the <tt>mecab</tt> tagger cannot parse the given string <tt>str</tt>
|
207
|
-
# @raise [ArgumentError] if the given string <tt>str</tt> argument is <tt>nil</tt>
|
208
|
-
# @see MeCabNode
|
209
|
-
def parse(str)
|
210
|
-
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
211
|
-
if block_given?
|
212
|
-
nodes = @parse_tonodes.call(str)
|
213
|
-
nodes.each {|n| yield n }
|
214
|
-
else
|
215
|
-
self.class.force_enc(@parse_tostr.call(str))
|
216
|
-
end
|
217
|
-
end
|
218
|
-
|
219
|
-
# Parses the given string <tt>str</tt>, and returns
|
220
|
-
# a list of <tt>mecab</tt> nodes.
|
221
|
-
# @param [String] str
|
222
|
-
# @return [Array] of parsed <tt>mecab</tt> nodes.
|
223
|
-
# @raise [MeCabError] if the <tt>mecab</tt> tagger cannot parse the given string <tt>str</tt>
|
224
|
-
# @raise [ArgumentError] if the given string <tt>str</tt> argument is <tt>nil</tt>
|
225
|
-
# @see MeCabNode
|
226
|
-
def parse_as_nodes(str)
|
227
|
-
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
228
|
-
@parse_tonodes.call(str)
|
229
|
-
end
|
230
|
-
|
231
|
-
# Parses the given string <tt>str</tt>, and returns
|
232
|
-
# a list of <tt>mecab</tt> result strings.
|
233
|
-
# @param [String] str
|
234
|
-
# @return [Array] of parsed <tt>mecab</tt> result strings.
|
235
|
-
# @raise [MeCabError] if the <tt>mecab</tt> tagger cannot parse the given string <tt>str</tt>
|
236
|
-
# @raise [ArgumentError] if the given string <tt>str</tt> argument is <tt>nil</tt>
|
237
|
-
def parse_as_strings(str)
|
238
|
-
raise ArgumentError.new 'String to parse cannot be nil' if str.nil?
|
239
|
-
self.class.force_enc(@parse_tostr.call(str)).lines.to_a
|
240
|
-
end
|
241
|
-
|
242
|
-
# DEPRECATED: use parse_as_nodes instead.
|
243
|
-
def readnodes(str)
|
244
|
-
$stdout.puts 'DEPRECATED: use parse_as_nodes instead'
|
245
|
-
parse_as_nodes(str)
|
246
|
-
end
|
247
|
-
|
248
|
-
# DEPRECATED: use parse_as_strings instead.
|
249
|
-
def readlines(str)
|
250
|
-
$stdout.puts 'DEPRECATED: use parse_as_strings instead'
|
251
|
-
parse_as_strings(str)
|
252
|
-
end
|
253
|
-
|
254
|
-
# Returns human-readable details for the wrapped <tt>mecab</tt> tagger.
|
255
|
-
# Overrides <tt>Object#to_s</tt>.
|
256
|
-
#
|
257
|
-
# - encoded object id
|
258
|
-
# - underlying FFI pointer to the <tt>mecab</tt> tagger
|
259
|
-
# - options hash
|
260
|
-
# - list of dictionaries
|
261
|
-
# - MeCab version
|
262
|
-
#
|
263
|
-
# @return [String] encoded object id, underlying FFI pointer, options hash, list of dictionaries, and MeCab version
|
264
|
-
def to_s
|
265
|
-
%(#{super.chop} @tagger=#{@tagger}, @options=#{@options.inspect}, @dicts=#{@dicts.to_s}, @version="#{@version.to_s}">)
|
266
|
-
end
|
267
|
-
|
268
|
-
# Overrides <tt>Object#inspect</tt>.
|
269
|
-
#
|
270
|
-
# @return [String] encoded object id, FFI pointer, options hash, list of dictionaries, and MeCab version
|
271
|
-
# @see #to_s
|
272
|
-
def inspect
|
273
|
-
self.to_s
|
274
|
-
end
|
275
|
-
|
276
|
-
# Returns a <tt>Proc</tt> that will properly free resources
|
277
|
-
# when this <tt>MeCab</tt> instance is garbage collected.
|
278
|
-
# The <tt>Proc</tt> returned is registered to be invoked
|
279
|
-
# after the <tt>MeCab</tt> instance owning <tt>ptr</tt>
|
280
|
-
# has been destroyed.
|
281
|
-
#
|
282
|
-
# @param [FFI::Pointer] ptr
|
283
|
-
# @return [Proc] to release <tt>mecab</tt> resources properly
|
284
|
-
def self.create_free_proc(ptr)
|
285
|
-
Proc.new do
|
286
|
-
self.mecab_destroy(ptr)
|
287
|
-
end
|
288
|
-
end
|
289
|
-
end
|
290
|
-
|
291
|
-
# <tt>MeCabError</tt> is a general error class
|
292
|
-
# for the <tt>Natto</tt> module.
|
293
|
-
class MeCabError < RuntimeError; end
|
294
|
-
|
295
|
-
# <tt>MeCabStruct</tt> is a general base class
|
296
|
-
# for <tt>FFI::Struct</tt> objects in the <tt>Natto</tt> module.
|
297
|
-
class MeCabStruct < FFI::Struct
|
298
|
-
# Provides accessor methods for the members of the <tt>mecab</tt> struct.
|
299
|
-
#
|
300
|
-
# @param [String] attr_name
|
301
|
-
# @return member values for the <tt>mecab</tt> struct
|
302
|
-
# @raise [NoMethodError] if <tt>attr_name</tt> is not a member of this <tt>mecab</tt> struct
|
303
|
-
def method_missing(attr_name)
|
304
|
-
member_sym = attr_name.id2name.to_sym
|
305
|
-
return self[member_sym] if self.members.include?(member_sym)
|
306
|
-
raise(NoMethodError.new("undefined method '#{attr_name}' for #{self}"))
|
307
|
-
end
|
308
|
-
end
|
309
|
-
|
310
|
-
# <tt>DictionaryInfo</tt> is a wrapper for the structure holding
|
311
|
-
# the <tt>MeCab</tt> instance's related dictionary information.
|
312
|
-
#
|
313
|
-
# Values for the <tt>mecab</tt> dictionary attributes may be
|
314
|
-
# obtained by using the following <tt>Symbol</tt>s as keys
|
315
|
-
# to the layout associative array of <tt>FFI::Struct</tt> members.
|
316
|
-
#
|
317
|
-
# - :filename
|
318
|
-
# - :charset
|
319
|
-
# - :size
|
320
|
-
# - :type
|
321
|
-
# - :lsize
|
322
|
-
# - :rsize
|
323
|
-
# - :version
|
324
|
-
# - :next
|
325
|
-
#
|
326
|
-
# <h2>Usage</h2>
|
327
|
-
# <tt>mecab</tt> dictionary attributes can be obtained by
|
328
|
-
# using their corresponding accessor.
|
329
|
-
#
|
330
|
-
# nm = Natto::MeCab.new
|
331
|
-
#
|
332
|
-
# sysdic = nm.dicts.first
|
333
|
-
#
|
334
|
-
# puts sysdic.filename
|
335
|
-
# => "/usr/local/lib/mecab/dic/ipadic/sys.dic"
|
336
|
-
#
|
337
|
-
# puts sysdic.charset
|
338
|
-
# => "utf8"
|
339
|
-
#
|
340
|
-
# puts sysdic.is_sysdic?
|
341
|
-
# => true
|
342
|
-
class DictionaryInfo < MeCabStruct
|
343
|
-
# System dictionary.
|
344
|
-
SYS_DIC = 0
|
345
|
-
# User dictionary.
|
346
|
-
USR_DIC = 1
|
347
|
-
# Unknown dictionary.
|
348
|
-
UNK_DIC = 2
|
349
|
-
|
350
|
-
layout :filename, :string,
|
351
|
-
:charset, :string,
|
352
|
-
:size, :uint,
|
353
|
-
:type, :int,
|
354
|
-
:lsize, :uint,
|
355
|
-
:rsize, :uint,
|
356
|
-
:version, :ushort,
|
357
|
-
:next, :pointer
|
358
|
-
|
359
|
-
if Object.respond_to?(:type) && Object.respond_to?(:class)
|
360
|
-
alias_method :deprecated_type, :type
|
361
|
-
# <tt>Object#type</tt> override defined when both <tt>type</tt> and
|
362
|
-
# <tt>class</tt> are Object methods. This is a hack to avoid the
|
363
|
-
# <tt>Object#type</tt> deprecation warning thrown up in Ruby 1.8.7
|
364
|
-
# and in JRuby.
|
365
|
-
#
|
366
|
-
# @return [Fixnum] <tt>mecab</tt> dictionary type
|
367
|
-
def type
|
368
|
-
self[:type]
|
369
|
-
end
|
370
|
-
end
|
371
|
-
|
372
|
-
# Returns human-readable details for this <tt>mecab</tt> dictionary.
|
373
|
-
# Overrides <tt>Object#to_s</tt>.
|
374
|
-
#
|
375
|
-
# - encoded object id
|
376
|
-
# - dictionary type
|
377
|
-
# - full-path dictionary filename
|
378
|
-
# - dictionary charset
|
379
|
-
#
|
380
|
-
# @return [String] encoded object id, type, dictionary filename, and charset
|
381
|
-
def to_s
|
382
|
-
%(#{super.chop} type="#{self.type}", filename="#{self.filename}", charset="#{self.charset}">)
|
383
|
-
end
|
384
|
-
|
385
|
-
# Overrides <tt>Object#inspect</tt>.
|
386
|
-
#
|
387
|
-
# @return [String] encoded object id, dictionary filename, and charset
|
388
|
-
# @see #to_s
|
389
|
-
def inspect
|
390
|
-
self.to_s
|
391
|
-
end
|
392
|
-
|
393
|
-
# Returns <tt>true</tt> if this is a system dictionary.
|
394
|
-
# @return [Boolean]
|
395
|
-
def is_sysdic?
|
396
|
-
self.type == SYS_DIC
|
397
|
-
end
|
398
|
-
|
399
|
-
# Returns <tt>true</tt> if this is a user dictionary.
|
400
|
-
# @return [Boolean]
|
401
|
-
def is_usrdic?
|
402
|
-
self.type == USR_DIC
|
403
|
-
end
|
404
|
-
|
405
|
-
# Returns <tt>true</tt> if this is an unknown dictionary type.
|
406
|
-
# @return [Boolean]
|
407
|
-
def is_unkdic?
|
408
|
-
self.type == UNK_DIC
|
409
|
-
end
|
410
|
-
end
|
411
|
-
|
412
|
-
# <tt>MeCabNode</tt> is a wrapper for the structure holding
|
413
|
-
# the parsed <tt>node</tt>.
|
414
|
-
#
|
415
|
-
# Values for the <tt>mecab</tt> node attributes may be
|
416
|
-
# obtained by using the following <tt>Symbol</tt>s as keys
|
417
|
-
# to the layout associative array of <tt>FFI::Struct</tt> members.
|
418
|
-
#
|
419
|
-
# - :prev
|
420
|
-
# - :next
|
421
|
-
# - :enext
|
422
|
-
# - :bnext
|
423
|
-
# - :rpath
|
424
|
-
# - :lpath
|
425
|
-
# - :surface
|
426
|
-
# - :feature
|
427
|
-
# - :id
|
428
|
-
# - :length
|
429
|
-
# - :rlength
|
430
|
-
# - :rcAttr
|
431
|
-
# - :lcAttr
|
432
|
-
# - :posid
|
433
|
-
# - :char_type
|
434
|
-
# - :stat
|
435
|
-
# - :isbest
|
436
|
-
# - :alpha
|
437
|
-
# - :beta
|
438
|
-
# - :beta
|
439
|
-
# - :prob
|
440
|
-
# - :wcost
|
441
|
-
# - :cost
|
442
|
-
#
|
443
|
-
# <h2>Usage</h2>
|
444
|
-
# An instance of <tt>MeCabNode</tt> is yielded to the block
|
445
|
-
# used with <tt>MeCab#parse</tt>, where the above-mentioned
|
446
|
-
# node attributes may be accessed by name.
|
447
|
-
#
|
448
|
-
# nm = Natto::MeCab.new
|
449
|
-
#
|
450
|
-
# nm.parse('卓球なんて死ぬまでの暇つぶしだよ。') do |n|
|
451
|
-
# puts "#{n.surface}\t#{n.cost}" if n.is_nor?
|
452
|
-
# end
|
453
|
-
# 卓球 2874
|
454
|
-
# な 4398
|
455
|
-
# 死ぬ 9261
|
456
|
-
# まで 9386
|
457
|
-
# の 10007
|
458
|
-
# 暇つぶし 13324
|
459
|
-
# だ 15346
|
460
|
-
# よ 14396
|
461
|
-
# 。 10194
|
462
|
-
#
|
463
|
-
# It is also possible to use the <tt>Symbol</tt> for the
|
464
|
-
# <tt>mecab</tt> node member to index into the
|
465
|
-
# <tt>FFI::Struct</tt> layout associative array like so:
|
466
|
-
#
|
467
|
-
# nm.parse('あいつ笑うと結構可愛い顔してんよ。') {|n| puts n[:feature] }
|
468
|
-
# 名詞,代名詞,一般,*,*,*,あいつ,アイツ,アイツ
|
469
|
-
# 動詞,自立,*,*,五段・ワ行促音便,基本形,笑う,ワラウ,ワラウ
|
470
|
-
# 助詞,接続助詞,*,*,*,*,と,ト,ト
|
471
|
-
# 副詞,一般,*,*,*,*,結構,ケッコウ,ケッコー
|
472
|
-
# 形容詞,自立,*,*,形容詞・イ段,基本形,可愛い,カワイイ,カワイイ
|
473
|
-
# 名詞,一般,*,*,*,*,顔,カオ,カオ
|
474
|
-
# 動詞,自立,*,*,サ変・スル,連用形,する,シ,シ
|
475
|
-
# 動詞,非自立,*,*,一段,体言接続特殊,てる,テン,テン
|
476
|
-
# 助詞,終助詞,*,*,*,*,よ,ヨ,ヨ
|
477
|
-
# 記号,句点,*,*,*,*,。,。,。
|
478
|
-
# BOS/EOS,*,*,*,*,*,*,*,*
|
479
|
-
#
|
480
|
-
class MeCabNode < MeCabStruct
|
481
|
-
include Natto::Utils
|
482
|
-
|
483
|
-
attr_accessor :surface, :feature
|
484
|
-
attr_reader :pointer
|
485
|
-
|
486
|
-
# Normal <tt>mecab</tt> node defined in the dictionary.
|
487
|
-
NOR_NODE = 0
|
488
|
-
# Unknown <tt>mecab</tt> node not defined in the dictionary.
|
489
|
-
UNK_NODE = 1
|
490
|
-
# Virtual node representing the beginning of the sentence.
|
491
|
-
BOS_NODE = 2
|
492
|
-
# Virtual node representing the end of the sentence.
|
493
|
-
EOS_NODE = 3
|
494
|
-
# Virtual node representing the end of an N-Best <tt>mecab</tt> node list.
|
495
|
-
EON_NODE = 4
|
496
|
-
|
497
|
-
layout :prev, :pointer,
|
498
|
-
:next, :pointer,
|
499
|
-
:enext, :pointer,
|
500
|
-
:bnext, :pointer,
|
501
|
-
:rpath, :pointer,
|
502
|
-
:lpath, :pointer,
|
503
|
-
:surface, :string,
|
504
|
-
:feature, :string,
|
505
|
-
:id, :uint,
|
506
|
-
:length, :ushort,
|
507
|
-
:rlength, :ushort,
|
508
|
-
:rcAttr, :ushort,
|
509
|
-
:lcAttr, :ushort,
|
510
|
-
:posid, :ushort,
|
511
|
-
:char_type, :uchar,
|
512
|
-
:stat, :uchar,
|
513
|
-
:isbest, :uchar,
|
514
|
-
:alpha, :float,
|
515
|
-
:beta, :float,
|
516
|
-
:prob, :float,
|
517
|
-
:wcost, :short,
|
518
|
-
:cost, :long
|
519
|
-
|
520
|
-
if RUBY_VERSION.to_f < 1.9
|
521
|
-
alias_method :deprecated_id, :id
|
522
|
-
# <tt>Object#id</tt> override defined when <tt>RUBY_VERSION</tt> is
|
523
|
-
# older than 1.9. This is a hack to avoid the <tt>Object#id</tt>
|
524
|
-
# deprecation warning thrown up in Ruby 1.8.7.
|
525
|
-
#
|
526
|
-
# <i>This method override is not defined when the Ruby interpreter
|
527
|
-
# is 1.9 or greater.</i>
|
528
|
-
# @return [Fixnum] <tt>mecab</tt> node id
|
529
|
-
def id
|
530
|
-
self[:id]
|
531
|
-
end
|
532
|
-
end
|
533
|
-
|
534
|
-
# Initializes this node instance.
|
535
|
-
# Sets the <tt>MeCab</tt> feature value for this node.
|
536
|
-
#
|
537
|
-
# @param [FFI::Pointer]
|
538
|
-
def initialize(ptr)
|
539
|
-
super(ptr)
|
540
|
-
@pointer = ptr
|
541
|
-
|
542
|
-
if self[:feature]
|
543
|
-
@feature = self.class.force_enc(self[:feature])
|
544
|
-
end
|
545
|
-
end
|
546
|
-
|
547
|
-
# Returns human-readable details for the <tt>mecab</tt> node.
|
548
|
-
# Overrides <tt>Object#to_s</tt>.
|
549
|
-
#
|
550
|
-
# - encoded object id
|
551
|
-
# - underlying FFI pointer to MeCab Node
|
552
|
-
# - stat (node type: NOR, UNK, BOS/EOS, EON)
|
553
|
-
# - surface
|
554
|
-
# - feature
|
555
|
-
#
|
556
|
-
# @return [String] encoded object id, underlying FFI pointer, stat, surface, and feature
|
557
|
-
def to_s
|
558
|
-
%(#{super.chop} @pointer=#{@pointer}, stat=#{self[:stat]}, @surface="#{self.surface}", @feature="#{self.feature}">)
|
559
|
-
end
|
560
|
-
|
561
|
-
# Overrides <tt>Object#inspect</tt>.
|
562
|
-
#
|
563
|
-
# @return [String] encoded object id, stat, surface, and feature
|
564
|
-
# @see #to_s
|
565
|
-
def inspect
|
566
|
-
self.to_s
|
567
|
-
end
|
568
|
-
|
569
|
-
# Returns <tt>true</tt> if this is a normal <tt>mecab</tt> node found in the dictionary.
|
570
|
-
# @return [Boolean]
|
571
|
-
def is_nor?
|
572
|
-
self.stat == NOR_NODE
|
573
|
-
end
|
574
|
-
|
575
|
-
# Returns <tt>true</tt> if this is an unknown <tt>mecab</tt> node not found in the dictionary.
|
576
|
-
# @return [Boolean]
|
577
|
-
def is_unk?
|
578
|
-
self.stat == UNK_NODE
|
579
|
-
end
|
580
|
-
|
581
|
-
# Returns <tt>true</tt> if this is a virtual <tt>mecab</tt> node representing the beginning of the sentence.
|
582
|
-
# @return [Boolean]
|
583
|
-
def is_bos?
|
584
|
-
self.stat == BOS_NODE
|
585
|
-
end
|
586
|
-
|
587
|
-
# Returns <tt>true</tt> if this is a virtual <tt>mecab</tt> node representing the end of the sentence.
|
588
|
-
# @return [Boolean]
|
589
|
-
def is_eos?
|
590
|
-
self.stat == EOS_NODE
|
591
|
-
end
|
592
|
-
|
593
|
-
# Returns <tt>true</tt> if this is a virtual <tt>mecab</tt> node representing the end of the node list.
|
594
|
-
# @return [Boolean]
|
595
|
-
def is_eon?
|
596
|
-
self.stat == EON_NODE
|
597
|
-
end
|
598
|
-
end
|
599
|
-
end
|
1
|
+
require 'natto/natto'
|