natto 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +1 -1
- data/README +67 -0
- data/lib/natto.rb +114 -10
- metadata +4 -3
data/LICENSE
CHANGED
data/README
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
= natto: A Tasty Ruby Binding with MeCab
|
2
|
+
|
3
|
+
== What is natto?
|
4
|
+
|
5
|
+
natto provides a Ruby binding with MeCab,
|
6
|
+
the part-of-speech and morphological analyzer
|
7
|
+
for the Japanese language.
|
8
|
+
|
9
|
+
== Try It! Try It!
|
10
|
+
|
11
|
+
=== Requirements
|
12
|
+
natto requires the following:
|
13
|
+
* {http://sourceforge.net/projects/mecab/files/mecab/ MeCab 0.98}
|
14
|
+
* {http://rubygems.org/gems/ffi ffi 0.63 or greater}
|
15
|
+
* Ruby 1.8.7 or greater
|
16
|
+
|
17
|
+
=== Installation
|
18
|
+
Install natto with the following gem command:
|
19
|
+
* <code>gem install natto</code>
|
20
|
+
|
21
|
+
=== Configuration
|
22
|
+
* natto will try to locate the <tt>mecab</tt> library based upon its runtime environment.
|
23
|
+
* In case of <tt>LoadError</tt>, please set the <tt>MECAB_PATH</tt> environment variable to the exact name/path to your <tt>mecab</tt> library.
|
24
|
+
|
25
|
+
== Usage
|
26
|
+
require 'natto'
|
27
|
+
|
28
|
+
m = Natto::MeCab.new
|
29
|
+
puts m.parse("すもももももももものうち")
|
30
|
+
すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
|
31
|
+
も 助詞,係助詞,*,*,*,*,も,モ,モ
|
32
|
+
もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
33
|
+
も 助詞,係助詞,*,*,*,*,も,モ,モ
|
34
|
+
もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
35
|
+
の 助詞,連体化,*,*,*,*,の,ノ,ノ
|
36
|
+
うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
|
37
|
+
EOS
|
38
|
+
=> nil
|
39
|
+
|
40
|
+
== Contributing to natto
|
41
|
+
|
42
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
43
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
|
44
|
+
* Fork the project
|
45
|
+
* Start a feature/bugfix branch
|
46
|
+
* Commit and push until you are happy with your contribution
|
47
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
48
|
+
* Please try not to mess with the Rakefile, version, or history. If you must have your own version, that is fine, but please isolate to its own commit so I can cherry-pick around it.
|
49
|
+
|
50
|
+
== Changelog
|
51
|
+
|
52
|
+
- **2010/12/22**: 0.0.3 release.
|
53
|
+
- On-going refactoring
|
54
|
+
- Adding documentation via yard
|
55
|
+
|
56
|
+
- **2010/12/20**: 0.0.2 release.
|
57
|
+
- Continuing development on proper resource deallocation
|
58
|
+
- Adding options hash in object initializer
|
59
|
+
|
60
|
+
- **2010/12/13**: Released version 0.0.1. The objective is to provide
|
61
|
+
an easy-to-use, production-level Ruby binding to MeCab.
|
62
|
+
- Initial release
|
63
|
+
|
64
|
+
|
65
|
+
== Copyright
|
66
|
+
|
67
|
+
natto (c) 2010-2013 by Brooke M. Fujita, licensed under the new BSD license. Please see the {file:LICENSE} document for further details.
|
data/lib/natto.rb
CHANGED
@@ -1,38 +1,106 @@
|
|
1
1
|
# -*- coding: UTF-8 -*-
|
2
2
|
require 'rubygems' if RUBY_VERSION.to_f < 1.9
|
3
3
|
|
4
|
-
|
4
|
+
# natto combines the Ruby programming language with MeCab,
|
5
|
+
# the part-of-speech and morphological analyzer for the
|
6
|
+
# Japanese language.
|
7
|
+
#
|
8
|
+
# === Requirements
|
9
|
+
# natto requires the following:
|
10
|
+
# * {http://sourceforge.net/projects/mecab/files/mecab/ MeCab 0.98}
|
11
|
+
# * {http://rubygems.org/gems/ffi ffi 0.63 or greater}
|
12
|
+
#
|
13
|
+
# === Installation
|
14
|
+
# Install natto with the following gem command:
|
15
|
+
# * <code>gem install natto</code>
|
16
|
+
#
|
17
|
+
# === Configuration
|
18
|
+
# * natto will try to locate the <tt>mecab</tt> library based upon its runtime environment.
|
19
|
+
# * In case of <tt>LoadError</tt>, please set the <tt>MECAB_PATH</tt> environment variable to the exact name/path to your <tt>mecab</tt> library.
|
20
|
+
#
|
21
|
+
# == Usage
|
22
|
+
# require 'natto'
|
23
|
+
#
|
24
|
+
# m = Natto::MeCab.new
|
25
|
+
# puts m.parse("すもももももももものうち")
|
26
|
+
# すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
|
27
|
+
# も 助詞,係助詞,*,*,*,*,も,モ,モ
|
28
|
+
# もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
29
|
+
# も 助詞,係助詞,*,*,*,*,も,モ,モ
|
30
|
+
# もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
31
|
+
# の 助詞,連体化,*,*,*,*,の,ノ,ノ
|
32
|
+
# うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
|
33
|
+
# EOS
|
34
|
+
# => nil
|
35
|
+
#
|
36
|
+
# @author Brooke M. Fujita (buruzaemon)
|
37
|
+
module Natto
|
5
38
|
require 'ffi'
|
6
39
|
|
40
|
+
# <tt>MeCab</tt> is a wrapper class to the <tt>mecab</tt> parser.
|
41
|
+
# Options to the <tt>mecab</tt> parser are passed in as a hash to
|
42
|
+
# #initialize.
|
43
|
+
#
|
44
|
+
# @see {SUPPORTED_OPTS}
|
7
45
|
class MeCab
|
46
|
+
# Supported options to the <tt>mecab</tt> parser.
|
47
|
+
# See the <tt>mecab</tt> help for more details.
|
8
48
|
SUPPORTED_OPTS = [ :rcfile, :dicdir, :userdic, :output_format_type, :lattice_level,
|
9
49
|
:node_format, :unk_format, :bos_format, :eos_format, :eon_format,
|
10
|
-
:unk_feature, :nbest, :theta, :cost_factor ]
|
50
|
+
:unk_feature, :nbest, :theta, :cost_factor ].freeze
|
11
51
|
# :all_morphs, :partial, :allocate_sentence ]
|
12
|
-
attr_reader :ptr
|
13
52
|
|
53
|
+
# Initialize the wrapped <tt>mecab</tt> instance, with the
|
54
|
+
# given <tt>options</tt> hash.
|
55
|
+
# <br/>
|
56
|
+
# Options supported are:
|
57
|
+
# * :rcfile -- resource file
|
58
|
+
# * :dicdir -- system dicdir
|
59
|
+
# * :userdic -- user dictionary
|
60
|
+
# * :lattice_level -- lattice information level (integer, default 0)
|
61
|
+
# * :output_format_type -- output format type (wakati, chasen, yomi, dump)
|
62
|
+
# * :node_format -- user-defined node format
|
63
|
+
# * :unk_format -- user-defined unknown node format
|
64
|
+
# * :bos_format -- user-defined beginning-of-sentence format
|
65
|
+
# * :eos_format -- user-defined end-of-sentence format
|
66
|
+
# * :eon_format -- user-defined end-of-NBest format
|
67
|
+
# * :unk_feature -- feature for unknown word
|
68
|
+
# * :nbest -- output N best results (integer, default 1)
|
69
|
+
# * :theta -- temperature parameter theta (float, default 0.75)
|
70
|
+
# * :cost_factor -- cost factor (integer, default 700)
|
71
|
+
#
|
72
|
+
# @param [Hash] options
|
73
|
+
# @see {SUPPORTED_OPTS}
|
14
74
|
def initialize(options={})
|
15
75
|
opt_str = self.class.build_options_str(options)
|
16
|
-
#@ptr = FFI::MemoryPointer.new :pointer
|
17
76
|
@ptr = Natto::Binding.mecab_new2(opt_str)
|
18
|
-
|
19
|
-
raise MeCabError.new("MeCab initialiation error with '#{opt_str}'") if @ptr.address == 0
|
77
|
+
raise MeCabError.new("MeCab initialization error with '#{opt_str}'") if @ptr.address == 0
|
20
78
|
#@dict = Natto::DictionaryInfo.new(Natto::Binding.mecab_dictionary_info(@ptr))
|
21
79
|
ObjectSpace.define_finalizer(self, self.class.create_free_proc(@ptr))
|
22
80
|
end
|
23
81
|
|
82
|
+
# Parses the given string <tt>s</tt>.
|
83
|
+
#
|
84
|
+
# @param [String] s
|
24
85
|
def parse(s)
|
25
86
|
Natto::Binding.mecab_sparse_tostr(@ptr, s) ||
|
26
87
|
raise(MeCabError.new(Natto::Binding.mecab_strerror(@ptr)))
|
27
88
|
end
|
28
|
-
|
89
|
+
|
90
|
+
# Returns a <tt>Proc</tt> that is registered to be invoked
|
91
|
+
# after the object owning <tt>ptr</tt> has been destroyed.
|
92
|
+
#
|
93
|
+
# @param [FFI::MemoryPointer] ptr
|
29
94
|
def self.create_free_proc(ptr)
|
30
95
|
Proc.new do
|
31
|
-
#puts "mecab_destroy #{ptr}"
|
32
96
|
Natto::Binding.mecab_destroy(ptr)
|
33
97
|
end
|
34
98
|
end
|
35
99
|
|
100
|
+
# Returns a string-representation of the options to
|
101
|
+
# be passed in the construction of <tt>mecab</tt>.
|
102
|
+
#
|
103
|
+
# @param [Hash] options
|
36
104
|
def self.build_options_str(options={})
|
37
105
|
opt = []
|
38
106
|
SUPPORTED_OPTS.each do |k|
|
@@ -44,9 +112,31 @@ module Natto
|
|
44
112
|
opt.join(" ")
|
45
113
|
end
|
46
114
|
end
|
47
|
-
|
115
|
+
|
116
|
+
# <tt>MeCabError</tt> is a general error class
|
117
|
+
# for the <tt>Natto</tt> module.
|
48
118
|
class MeCabError < RuntimeError; end
|
49
119
|
|
120
|
+
# <tt>DictionaryInfo</tt> is a wrapper for a <tt>MeCab</tt>
|
121
|
+
# instance's related dictionary information.
|
122
|
+
# <br>
|
123
|
+
# Values may be obtained by using the following symbols
|
124
|
+
# as keys to the hash of <tt>mecab</tt> dictionary information.
|
125
|
+
# * :filename
|
126
|
+
# * :charset
|
127
|
+
# * :size
|
128
|
+
# * :type
|
129
|
+
# * :lsize
|
130
|
+
# * :rsize
|
131
|
+
# * :version
|
132
|
+
# * :next
|
133
|
+
# <br>
|
134
|
+
#
|
135
|
+
# dict = Natto::DictionaryInfo.new(mecab_ptr)
|
136
|
+
# puts dict[:filename]
|
137
|
+
# => /usr/local/lib/mecab/dic/ipadic/sys.dic
|
138
|
+
# puts dict[:charset]
|
139
|
+
# => utf8
|
50
140
|
class DictionaryInfo < FFI::Struct
|
51
141
|
layout :filename, :string,
|
52
142
|
:charset, :string,
|
@@ -58,16 +148,29 @@ module Natto
|
|
58
148
|
:next, :pointer
|
59
149
|
end
|
60
150
|
|
151
|
+
# Module <tt>Binding</tt> encapsulates operations which are
|
152
|
+
# made available via <tt>FFI</tt> bindings to <tt>mecab</tt>.
|
61
153
|
module Binding
|
62
154
|
require 'rbconfig'
|
63
155
|
extend FFI::Library
|
64
156
|
|
157
|
+
# String name for the environment variable used by
|
158
|
+
# <tt>Natto</tt> to indicate the exact name / full path
|
159
|
+
# to the <tt>mecab</tt> library.
|
65
160
|
MECAB_PATH = 'MECAB_PATH'
|
66
|
-
|
161
|
+
|
162
|
+
# @private
|
67
163
|
def self.included(base)
|
68
164
|
base.extend(ClassMethods)
|
69
165
|
end
|
70
166
|
|
167
|
+
# Returns the name of the <tt>mecab</tt> library based on
|
168
|
+
# the runtime environment. The value of the environment
|
169
|
+
# parameter <tt>MECAB_PATH</tt> is checked before this
|
170
|
+
# function is invoked, and in the case of Windows, a
|
171
|
+
# <tt>LoadError</tt> will be raised if <tt>MECAB_PATH</tt>
|
172
|
+
# is <b>not</b> set to the full path of the <tt>mecab</tt>
|
173
|
+
# library.
|
71
174
|
def self.find_library
|
72
175
|
host_os = RbConfig::CONFIG['host_os']
|
73
176
|
|
@@ -89,6 +192,7 @@ module Natto
|
|
89
192
|
attach_function :mecab_strerror, [:pointer],:string
|
90
193
|
attach_function :mecab_dictionary_info, [:pointer], :pointer
|
91
194
|
|
195
|
+
# @private
|
92
196
|
module ClassMethods
|
93
197
|
def mecab_version
|
94
198
|
Natto::Binding.mecab_version
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 2
|
9
|
-
version: 0.0.2
|
8
|
+
- 3
|
9
|
+
version: 0.0.3
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Brooke M. Fujita
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-12-20 00:00:00 +09:00
|
17
|
+
date: 2010-12-22 00:00:00 +09:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -44,6 +44,7 @@ files:
|
|
44
44
|
- lib/natto.rb
|
45
45
|
- test/test_natto.rb
|
46
46
|
- LICENSE
|
47
|
+
- README
|
47
48
|
has_rdoc: true
|
48
49
|
homepage: http://code.google.com/p/natto/
|
49
50
|
licenses:
|