natto 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +1 -1
- data/README +67 -0
- data/lib/natto.rb +114 -10
- metadata +4 -3
data/LICENSE
CHANGED
data/README
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
= natto: A Tasty Ruby Binding with MeCab
|
2
|
+
|
3
|
+
== What is natto?
|
4
|
+
|
5
|
+
natto provides a Ruby binding with MeCab,
|
6
|
+
the part-of-speech and morphological analyzer
|
7
|
+
for the Japanese language.
|
8
|
+
|
9
|
+
== Try It! Try It!
|
10
|
+
|
11
|
+
=== Requirements
|
12
|
+
natto requires the following:
|
13
|
+
* {http://sourceforge.net/projects/mecab/files/mecab/ MeCab 0.98}
|
14
|
+
* {http://rubygems.org/gems/ffi ffi 0.63 or greater}
|
15
|
+
* Ruby 1.8.7 or greater
|
16
|
+
|
17
|
+
=== Installation
|
18
|
+
Install natto with the following gem command:
|
19
|
+
* <code>gem install natto</code>
|
20
|
+
|
21
|
+
=== Configuration
|
22
|
+
* natto will try to locate the <tt>mecab</tt> library based upon its runtime environment.
|
23
|
+
* In case of <tt>LoadError</tt>, please set the <tt>MECAB_PATH</tt> environment variable to the exact name/path to your <tt>mecab</tt> library.
|
24
|
+
|
25
|
+
== Usage
|
26
|
+
require 'natto'
|
27
|
+
|
28
|
+
m = Natto::MeCab.new
|
29
|
+
puts m.parse("すもももももももものうち")
|
30
|
+
すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
|
31
|
+
も 助詞,係助詞,*,*,*,*,も,モ,モ
|
32
|
+
もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
33
|
+
も 助詞,係助詞,*,*,*,*,も,モ,モ
|
34
|
+
もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
35
|
+
の 助詞,連体化,*,*,*,*,の,ノ,ノ
|
36
|
+
うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
|
37
|
+
EOS
|
38
|
+
=> nil
|
39
|
+
|
40
|
+
== Contributing to natto
|
41
|
+
|
42
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
43
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
|
44
|
+
* Fork the project
|
45
|
+
* Start a feature/bugfix branch
|
46
|
+
* Commit and push until you are happy with your contribution
|
47
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
48
|
+
* Please try not to mess with the Rakefile, version, or history. If you must have your own version, that is fine, but please isolate to its own commit so I can cherry-pick around it.
|
49
|
+
|
50
|
+
== Changelog
|
51
|
+
|
52
|
+
- **2010/12/22**: 0.0.3 release.
|
53
|
+
- On-going refactoring
|
54
|
+
- Adding documentation via yard
|
55
|
+
|
56
|
+
- **2010/12/20**: 0.0.2 release.
|
57
|
+
- Continuing development on proper resource deallocation
|
58
|
+
- Adding options hash in object initializer
|
59
|
+
|
60
|
+
- **2010/12/13**: Released version 0.0.1. The objective is to provide
|
61
|
+
an easy-to-use, production-level Ruby binding to MeCab.
|
62
|
+
- Initial release
|
63
|
+
|
64
|
+
|
65
|
+
== Copyright
|
66
|
+
|
67
|
+
natto (c) 2010-2013 by Brooke M. Fujita, licensed under the new BSD license. Please see the {file:LICENSE} document for further details.
|
data/lib/natto.rb
CHANGED
@@ -1,38 +1,106 @@
|
|
1
1
|
# -*- coding: UTF-8 -*-
|
2
2
|
require 'rubygems' if RUBY_VERSION.to_f < 1.9
|
3
3
|
|
4
|
-
|
4
|
+
# natto combines the Ruby programming language with MeCab,
|
5
|
+
# the part-of-speech and morphological analyzer for the
|
6
|
+
# Japanese language.
|
7
|
+
#
|
8
|
+
# === Requirements
|
9
|
+
# natto requires the following:
|
10
|
+
# * {http://sourceforge.net/projects/mecab/files/mecab/ MeCab 0.98}
|
11
|
+
# * {http://rubygems.org/gems/ffi ffi 0.63 or greater}
|
12
|
+
#
|
13
|
+
# === Installation
|
14
|
+
# Install natto with the following gem command:
|
15
|
+
# * <code>gem install natto</code>
|
16
|
+
#
|
17
|
+
# === Configuration
|
18
|
+
# * natto will try to locate the <tt>mecab</tt> library based upon its runtime environment.
|
19
|
+
# * In case of <tt>LoadError</tt>, please set the <tt>MECAB_PATH</tt> environment variable to the exact name/path to your <tt>mecab</tt> library.
|
20
|
+
#
|
21
|
+
# == Usage
|
22
|
+
# require 'natto'
|
23
|
+
#
|
24
|
+
# m = Natto::MeCab.new
|
25
|
+
# puts m.parse("すもももももももものうち")
|
26
|
+
# すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
|
27
|
+
# も 助詞,係助詞,*,*,*,*,も,モ,モ
|
28
|
+
# もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
29
|
+
# も 助詞,係助詞,*,*,*,*,も,モ,モ
|
30
|
+
# もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
31
|
+
# の 助詞,連体化,*,*,*,*,の,ノ,ノ
|
32
|
+
# うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
|
33
|
+
# EOS
|
34
|
+
# => nil
|
35
|
+
#
|
36
|
+
# @author Brooke M. Fujita (buruzaemon)
|
37
|
+
module Natto
|
5
38
|
require 'ffi'
|
6
39
|
|
40
|
+
# <tt>MeCab</tt> is a wrapper class to the <tt>mecab</tt> parser.
|
41
|
+
# Options to the <tt>mecab</tt> parser are passed in as a hash to
|
42
|
+
# #initialize.
|
43
|
+
#
|
44
|
+
# @see SUPPORTED_OPTS
|
7
45
|
class MeCab
|
46
|
+
# Supported options to the <tt>mecab</tt> parser.
|
47
|
+
# See the <tt>mecab</tt> help for more details.
|
8
48
|
SUPPORTED_OPTS = [ :rcfile, :dicdir, :userdic, :output_format_type, :lattice_level,
|
9
49
|
:node_format, :unk_format, :bos_format, :eos_format, :eon_format,
|
10
|
-
:unk_feature, :nbest, :theta, :cost_factor ]
|
50
|
+
:unk_feature, :nbest, :theta, :cost_factor ].freeze
|
11
51
|
# :all_morphs, :partial, :allocate_sentence ]
|
12
|
-
attr_reader :ptr
|
13
52
|
|
53
|
+
# Initialize the wrapped <tt>mecab</tt> instance, with the
|
54
|
+
# given <tt>options</tt> hash.
|
55
|
+
# <br/>
|
56
|
+
# Options supported are:
|
57
|
+
# * :rcfile -- resource file
|
58
|
+
# * :dicdir -- system dicdir
|
59
|
+
# * :userdic -- user dictionary
|
60
|
+
# * :lattice_level -- lattice information level (integer, default 0)
|
61
|
+
# * :output_format_type -- output format type (wakati, chasen, yomi, dump)
|
62
|
+
# * :node_format -- user-defined node format
|
63
|
+
# * :unk_format -- user-defined unknown node format
|
64
|
+
# * :bos_format -- user-defined beginning-of-sentence format
|
65
|
+
# * :eos_format -- user-defined end-of-sentence format
|
66
|
+
# * :eon_format -- user-defined end-of-NBest format
|
67
|
+
# * :unk_feature -- feature for unknown word
|
68
|
+
# * :nbest -- output N best results (integer, default 1)
|
69
|
+
# * :theta -- temperature parameter theta (float, default 0.75)
|
70
|
+
# * :cost_factor -- cost factor (integer, default 700)
|
71
|
+
#
|
72
|
+
# @param [Hash] options
|
73
|
+
# @see SUPPORTED_OPTS
|
14
74
|
def initialize(options={})
|
15
75
|
opt_str = self.class.build_options_str(options)
|
16
|
-
#@ptr = FFI::MemoryPointer.new :pointer
|
17
76
|
@ptr = Natto::Binding.mecab_new2(opt_str)
|
18
|
-
|
19
|
-
raise MeCabError.new("MeCab initialiation error with '#{opt_str}'") if @ptr.address == 0
|
77
|
+
raise MeCabError.new("MeCab initialization error with '#{opt_str}'") if @ptr.address == 0
|
20
78
|
#@dict = Natto::DictionaryInfo.new(Natto::Binding.mecab_dictionary_info(@ptr))
|
21
79
|
ObjectSpace.define_finalizer(self, self.class.create_free_proc(@ptr))
|
22
80
|
end
|
23
81
|
|
82
|
+
# Parses the given string <tt>s</tt>.
|
83
|
+
#
|
84
|
+
# @param [String] s
|
24
85
|
def parse(s)
|
25
86
|
Natto::Binding.mecab_sparse_tostr(@ptr, s) ||
|
26
87
|
raise(MeCabError.new(Natto::Binding.mecab_strerror(@ptr)))
|
27
88
|
end
|
28
|
-
|
89
|
+
|
90
|
+
# Returns a <tt>Proc</tt> that is registered to be invoked
|
91
|
+
# after the object owning <tt>ptr</tt> has been destroyed.
|
92
|
+
#
|
93
|
+
# @param [FFI::MemoryPointer] ptr
|
29
94
|
def self.create_free_proc(ptr)
|
30
95
|
Proc.new do
|
31
|
-
#puts "mecab_destroy #{ptr}"
|
32
96
|
Natto::Binding.mecab_destroy(ptr)
|
33
97
|
end
|
34
98
|
end
|
35
99
|
|
100
|
+
# Returns a string-representation of the options to
|
101
|
+
# be passed in the construction of <tt>mecab</tt>.
|
102
|
+
#
|
103
|
+
# @param [Hash] options
|
36
104
|
def self.build_options_str(options={})
|
37
105
|
opt = []
|
38
106
|
SUPPORTED_OPTS.each do |k|
|
@@ -44,9 +112,31 @@ module Natto
|
|
44
112
|
opt.join(" ")
|
45
113
|
end
|
46
114
|
end
|
47
|
-
|
115
|
+
|
116
|
+
# <tt>MeCabError</tt> is a general error class
|
117
|
+
# for the <tt>Natto</tt> module.
|
48
118
|
class MeCabError < RuntimeError; end
|
49
119
|
|
120
|
+
# <tt>DictionaryInfo</tt> is a wrapper for a <tt>MeCab</tt>
|
121
|
+
# instance's related dictionary information.
|
122
|
+
# <br>
|
123
|
+
# Values may be obtained by using the following symbols
|
124
|
+
# as keys to the hash of <tt>mecab</tt> dictionary information.
|
125
|
+
# * :filename
|
126
|
+
# * :charset
|
127
|
+
# * :size
|
128
|
+
# * :type
|
129
|
+
# * :lsize
|
130
|
+
# * :rsize
|
131
|
+
# * :version
|
132
|
+
# * :next
|
133
|
+
# <br>
|
134
|
+
#
|
135
|
+
# dict = Natto::DictionaryInfo.new(mecab_ptr)
|
136
|
+
# puts dict[:filename]
|
137
|
+
# => /usr/local/lib/mecab/dic/ipadic/sys.dic
|
138
|
+
# puts dict[:charset]
|
139
|
+
# => utf8
|
50
140
|
class DictionaryInfo < FFI::Struct
|
51
141
|
layout :filename, :string,
|
52
142
|
:charset, :string,
|
@@ -58,16 +148,29 @@ module Natto
|
|
58
148
|
:next, :pointer
|
59
149
|
end
|
60
150
|
|
151
|
+
# Module <tt>Binding</tt> encapsulates operations which are
|
152
|
+
# made available via <tt>FFI</tt> bindings to <tt>mecab</tt>.
|
61
153
|
module Binding
|
62
154
|
require 'rbconfig'
|
63
155
|
extend FFI::Library
|
64
156
|
|
157
|
+
# String name for the environment variable used by
|
158
|
+
# <tt>Natto</tt> to indicate the exact name / full path
|
159
|
+
# to the <tt>mecab</tt> library.
|
65
160
|
MECAB_PATH = 'MECAB_PATH'
|
66
|
-
|
161
|
+
|
162
|
+
# @private
|
67
163
|
def self.included(base)
|
68
164
|
base.extend(ClassMethods)
|
69
165
|
end
|
70
166
|
|
167
|
+
# Returns the name of the <tt>mecab</tt> library based on
|
168
|
+
# the runtime environment. The value of the environment
|
169
|
+
# variable <tt>MECAB_PATH</tt> is checked before this
|
170
|
+
# function is invoked, and in the case of Windows, a
|
171
|
+
# <tt>LoadError</tt> will be raised if <tt>MECAB_PATH</tt>
|
172
|
+
# is <b>not</b> set to the full path of the <tt>mecab</tt>
|
173
|
+
# library.
|
71
174
|
def self.find_library
|
72
175
|
host_os = RbConfig::CONFIG['host_os']
|
73
176
|
|
@@ -89,6 +192,7 @@ module Natto
|
|
89
192
|
attach_function :mecab_strerror, [:pointer],:string
|
90
193
|
attach_function :mecab_dictionary_info, [:pointer], :pointer
|
91
194
|
|
195
|
+
# @private
|
92
196
|
module ClassMethods
|
93
197
|
def mecab_version
|
94
198
|
Natto::Binding.mecab_version
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 2
|
9
|
-
version: 0.0.2
|
8
|
+
- 3
|
9
|
+
version: 0.0.3
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Brooke M. Fujita
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-12-20 00:00:00 +09:00
|
17
|
+
date: 2010-12-22 00:00:00 +09:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -44,6 +44,7 @@ files:
|
|
44
44
|
- lib/natto.rb
|
45
45
|
- test/test_natto.rb
|
46
46
|
- LICENSE
|
47
|
+
- README
|
47
48
|
has_rdoc: true
|
48
49
|
homepage: http://code.google.com/p/natto/
|
49
50
|
licenses:
|