chawan 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +20 -0
- data/README +67 -0
- data/Rakefile +51 -0
- data/lib/chawan/analyzers/abstract_analyzer.rb +15 -0
- data/lib/chawan/analyzers/chasen_analyzer.rb +27 -0
- data/lib/chawan/analyzers/manager.rb +43 -0
- data/lib/chawan/analyzers/mecab_analyzer.rb +35 -0
- data/lib/chawan/analyzers.rb +23 -0
- data/lib/chawan/commands.rb +11 -0
- data/lib/chawan/node.rb +40 -0
- data/lib/chawan.rb +12 -0
- data/spec/analyzer_spec.rb +28 -0
- data/spec/api_spec.rb +18 -0
- data/spec/fixtures/example1.txt +1 -0
- data/spec/fixtures/example2.txt +1 -0
- data/spec/fixtures/example3.txt +1 -0
- data/spec/mecab_spec.rb +27 -0
- data/spec/node_spec.rb +13 -0
- data/spec/spec_helper.rb +10 -0
- metadata +74 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010 [maiha@wota.jp]
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
chawan
|
2
|
+
======
|
3
|
+
|
4
|
+
A cup for chasen that provides an easy-to-use interface for extracting Japanese words
|
5
|
+
|
6
|
+
|
7
|
+
Methods
|
8
|
+
=======
|
9
|
+
|
10
|
+
* Chawan.parse(text)
|
11
|
+
parses the given text with the current analyzer (the default analyzer is :mecab)
|
12
|
+
|
13
|
+
* Chawan.analyzer(xxx) (same as Chawan[xxx], Chawan.xxx)
|
14
|
+
returns the analyzer registered under the given name
|
15
|
+
|
16
|
+
|
17
|
+
Class
|
18
|
+
=====
|
19
|
+
|
20
|
+
* Chawan::Node (Chawan.parse returns an array of Chawan::Node)
|
21
|
+
#category : part of speech
|
22
|
+
#word : text
|
23
|
+
#attributes : hash that maps each key to its val
|
24
|
+
|
25
|
+
|
26
|
+
Example
|
27
|
+
=======
|
28
|
+
|
29
|
+
Chawan[:mecab].parse('test')
|
30
|
+
=> [<名詞: 'test'>]
|
31
|
+
|
32
|
+
# same as
|
33
|
+
# Chawan.mecab.parse('test')
|
34
|
+
# Chawan.analyzer(:mecab).parse('test')
|
35
|
+
# Chawan.parse('test') # default analyzer is :mecab
|
36
|
+
|
37
|
+
Chawan[:chasen].parse('test')
|
38
|
+
# ChasenAnalyzer is not implemented yet
|
39
|
+
|
40
|
+
|
41
|
+
Chawan.parse('本日は晴天なり')
|
42
|
+
=> [<名詞: '本日'>, <助詞: 'は'>, <名詞: '晴天'>, <助動詞: 'なり'>]
|
43
|
+
|
44
|
+
Chawan.parse('本日は晴天なり').select{|node| node.category == '名詞'}.join
|
45
|
+
=> "本日晴天"
|
46
|
+
|
47
|
+
|
48
|
+
Required
|
49
|
+
========
|
50
|
+
|
51
|
+
* UTF-8
|
52
|
+
* 'mecab' unix command (and its path)
|
53
|
+
|
54
|
+
|
55
|
+
Todo
|
56
|
+
====
|
57
|
+
|
58
|
+
* implement ChasenAnalyzer
|
59
|
+
* gateway interface to Chawan#parse such as grep, noun, ...
|
60
|
+
* use open3 rather than backquote for executing unix commands
|
61
|
+
|
62
|
+
|
63
|
+
Author
|
64
|
+
======
|
65
|
+
|
66
|
+
maiha@wota.jp
|
67
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'rubygems'
require 'rake/gempackagetask'

# Gem identity, shared by the specification and the install/uninstall tasks.
GEM_NAME = "chawan"
AUTHOR = "maiha"
EMAIL = "maiha@wota.jp"
HOMEPAGE = "http://github.com/maiha/chawan"
SUMMARY = "A cup for chasen that provides an easy to use for extracting Japanese"
GEM_VERSION = "0.0.1"

spec = Gem::Specification.new do |s|
  s.rubyforge_project = 'asakusarb'
  s.executables = []
  s.name = GEM_NAME
  s.version = GEM_VERSION
  s.platform = Gem::Platform::RUBY
  s.has_rdoc = true
  s.extra_rdoc_files = ["README", "MIT-LICENSE"]
  s.summary = SUMMARY
  s.description = s.summary
  s.author = AUTHOR
  s.email = EMAIL
  s.homepage = HOMEPAGE
  s.require_path = 'lib'
  s.files = %w(MIT-LICENSE README Rakefile) + Dir.glob("{lib,spec,app,public,stubs}/**/*")
end

# Defines the :package / :gem tasks; the built gem is written under pkg/.
Rake::GemPackageTask.new(spec) do |pkg|
  pkg.gem_spec = spec
end

# The original tasks called Merb::RakeHelper, which this Rakefile never
# requires, so they failed with NameError. Plain `gem` commands have no
# such dependency.
desc "Install the gem"
task :install => :package do
  sh "gem install pkg/#{GEM_NAME}-#{GEM_VERSION}.gem"
end

desc "Uninstall the gem"
task :uninstall do
  sh "gem uninstall #{GEM_NAME} --version #{GEM_VERSION}"
end

desc "Create a gemspec file"
task :gemspec do
  File.open("#{GEM_NAME}.gemspec", "w") do |file|
    file.puts spec.to_ruby
  end
end

require 'spec/rake/spectask'

# The :default task depends on :spec, so the task has to be defined;
# requiring the spectask library alone does not create it.
desc 'Run spec examples'
Spec::Rake::SpecTask.new(:spec) do |t|
  t.spec_files = FileList['spec/**/*_spec.rb']
end

desc 'Default: run spec examples'
task :default => 'spec'
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Chawan
  module Analyzers
    # Base class for analyzers. It stores a frozen copy of the options it was
    # built with and leaves the actual parsing to subclasses.
    class AbstractAnalyzer
      attr_reader :options

      # options - Hash of analyzer settings. It is duplicated before being
      #           frozen, so the caller's hash stays mutable.
      def initialize(options = {})
        snapshot = options.dup
        @options = snapshot.freeze
      end

      # Subclasses must override this; the base implementation always raises
      # NotImplementedError naming the concrete class.
      def parse(text)
        message = "#{self.class}#parse should be implemented"
        raise NotImplementedError, message
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
|
2
|
+
module Chawan
  module Analyzers
    # Analyzer backed by the 'chasen' Ruby binding. The per-line node builder
    # (#instantiate) is still an empty stub here; subclasses override it.
    class ChasenAnalyzer < AbstractAnalyzer
      # Field names handed to #instantiate; subclasses define their own list.
      Fields = []

      # Runs the analyzer over +text+ and builds one object per output line.
      # The output must end with an "EOS" line, which is dropped; otherwise
      # CannotAnalyze is raised.
      def parse(text)
        rows = execute(text).split(/\n/)
        terminator = rows.pop
        raise CannotAnalyze unless terminator == "EOS"

        field_names = self.class::Fields
        rows.map { |row| instantiate(row, field_names) }
      end

      private

      # Returns the raw analyzer output for +text+. The binding is required
      # here, at call time, rather than when this file is loaded.
      def execute(text)
        require 'chasen'
        # format = %w[%m %y %M %Y %h %P- %t %T- %f %F- %?U/unknown/known/].join("\t") + "\t\n"
        Chasen.getopt('-i', 'w')
        Chasen.sparse(text)
      end

      # Stub: not implemented yet, so it returns nil for every line.
      def instantiate(line, fields)
      end
    end
  end
end
|
27
|
+
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Chawan
  module Analyzers
    # Analyzer registry, meant to be mixed in with +extend+. Analyzers are
    # stored under String names, one of them can be selected as the current
    # analyzer, and each registered name also gets a reader method on the
    # extending object.
    module Manager
      # Returns the Hash of registered analyzers, keyed by String name.
      def analyzers
        @analyzers ||= {}
      end

      # With a name, returns the analyzer registered under it; without one,
      # returns the current analyzer.
      def analyzer(name = nil)
        name ? analyzer_for(name) : current_analyzer
      end

      # Returns the analyzer registered under +name+ (Symbol or String).
      # Raises AnalyzerNotFound, with the name as message, when there is none.
      def analyzer_for(name)
        analyzers[name.to_s] || raise(AnalyzerNotFound, name.to_s)
      end

      # Returns the analyzer selected by #setup.
      # Raises AnalyzerNotSetup when #setup has not selected one yet.
      def current_analyzer
        @analyzer || raise(AnalyzerNotSetup)
      end

      # Registers +analyzer+ under +name+ and, unless the receiver already
      # responds to that name, defines a reader method of the same name.
      def define_analyzer(name, analyzer)
        # Lookups always go through name.to_s, so store under the String form
        # too; otherwise an analyzer registered with a Symbol is unreachable.
        key = name.to_s
        analyzers[key] = analyzer

        unless respond_to?(key)
          # Define the reader on the receiver's singleton class instead of
          # eval-ing source text that hard-codes the Chawan constant.
          (class << self; self; end).send(:define_method, key) { analyzer_for(key) }
        end
      end

      # Selects the analyzer named +name+ as the current one, registering
      # +setter+ under that name first when it is given. Returns the analyzer.
      def setup(name, setter = nil)
        define_analyzer(name.to_s, setter) if setter
        @analyzer = analyzer_for(name)
      end

      # Shorthand for #analyzer_for.
      def [](name)
        analyzer_for(name)
      end
    end
  end
end
|
43
|
+
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'tempfile'
|
2
|
+
require File.dirname(__FILE__) + '/chasen_analyzer'
|
3
|
+
|
4
|
+
module Chawan
  module Analyzers
    # Analyzer that shells out to the 'mecab' command. It reuses
    # ChasenAnalyzer#parse and supplies its own #execute and #instantiate.
    class MecabAnalyzer < ChasenAnalyzer
      # Names of the columns of one output line: the surface form, then a tab,
      # then the comma-separated features.
      Fields = "表層形\t品詞,品詞細分類1,品詞細分類2,品詞細分類3,活用形,活用型,原形,読み,発音".split(/,|\s/)

      # Node class built by #instantiate; it adds nothing to Chawan::Node yet.
      class Node < Chawan::Node
      end

      private

      # Builds a Node from one output line laid out as described by Fields.
      # Only the first tab separates the surface form from the features, so a
      # token that is itself a comma or contains whitespace keeps its surface
      # form (splitting the whole line on /,|\s/ turned it into empty strings).
      def instantiate(line, fields)
        surface, features = line.split("\t", 2)
        vals = surface ? [surface] : []
        vals.concat(features.split(",")) if features
        Node.new(vals, fields)
      end

      # Writes +text+ to a temporary file and returns mecab's output for it.
      def execute(text)
        transaction(text) do |tmp|
          # NOTE(review): the path is interpolated into a shell command
          # unquoted; it comes from Tempfile, not from the caller's text.
          `mecab #{tmp.path}`
        end
      end

      # Yields a closed Tempfile holding +buffer+ and returns the block's
      # value. The file is removed afterwards, even when the block raises.
      def transaction(buffer, &block)
        tmp = Tempfile.new("mecab-text")
        tmp.print(buffer)
        tmp.close
        block.call(tmp)
      ensure
        # tmp is still nil when Tempfile.new itself failed.
        tmp.close(true) if tmp
      end
    end
  end
end
|
35
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
|
2
|
+
require File.dirname(__FILE__) + '/analyzers/manager'
require File.dirname(__FILE__) + '/analyzers/abstract_analyzer'

# Give the Chawan module itself the registry methods from Manager.
module Chawan
  extend Analyzers::Manager
end

# Load every analyzer implementation shipped next to this file, in sorted
# path order.
analyzer_files = Dir.glob( File.dirname(__FILE__) + '/analyzers/*_analyzer.rb' ).sort
analyzer_files.each { |path| require path }

# Register one instance of every *Analyzer class except AbstractAnalyzer,
# under its downcased prefix. Each Chawan.setup call also makes that analyzer
# the current one, so the last name in sorted order ends up current.
Chawan::Analyzers.constants.map { |const| const.to_s }.sort.each do |const_name|
  matched = /(.*?)Analyzer$/.match(const_name)
  next unless matched

  label = matched[1].downcase
  klass = Chawan::Analyzers.const_get(const_name)
  next if label == 'abstract'

  Chawan.setup(label, klass.new)
end
|
23
|
+
|
data/lib/chawan/node.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
module Chawan
  # One analyzed token: an ordered list of values (+vals+) together with the
  # names of those values (+keys+), in the same order.
  class Node
    attr_reader :vals
    attr_reader :keys

    # vals - Array of values; the first is the word, the second its category.
    # keys - Array of names for the values, in the same order.
    def initialize(vals, keys)
      @vals = vals
      @keys = keys
    end

    # Returns a Hash mapping each key to the value at the same position
    # (memoized). It is built from the key/value pairs directly, so a value
    # that is itself an Array is kept intact instead of being flattened.
    def attributes
      @attributes ||= Hash[keys.zip(vals)]
    end

    # Looks up a value by position (Integer) or by key name (anything else,
    # converted with to_s). The positional form used to evaluate
    # vals[keys[index]], indexing the Array with a key name, which raised
    # TypeError for every Integer.
    def [](index)
      case index
      when Integer
        vals[index]
      else
        attributes[index.to_s]
      end
    end

    # The first value as a String ("" when there are no values).
    def word
      vals.first.to_s
    end

    # The second value, which analyzers fill with the part of speech.
    def category
      vals[1]
    end

    # Same as #word, so an Array of nodes can be joined into text.
    def to_s
      word
    end

    # Short form such as <category: 'word'>.
    def inspect
      "<%s: '%s'>" % [category, to_s]
    end
  end
end
|
data/lib/chawan.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
|
2
|
+
module Chawan
  # Raised when no analyzer is registered under the requested name.
  class AnalyzerNotFound < StandardError
  end

  # Raised when the current analyzer is requested before one was set up.
  class AnalyzerNotSetup < StandardError
  end

  # Raised when analyzer output does not end with the expected "EOS" line.
  class CannotAnalyze < StandardError
  end
end
|
7
|
+
|
8
|
+
|
9
|
+
require File.dirname(__FILE__) + '/chawan/node'
|
10
|
+
require File.dirname(__FILE__) + '/chawan/commands'
|
11
|
+
require File.dirname(__FILE__) + '/chawan/analyzers'
|
12
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper.rb')
|
3
|
+
|
4
|
+
describe Chawan do
|
5
|
+
it "should provide .[]" do
|
6
|
+
Chawan.should respond_to(:[])
|
7
|
+
end
|
8
|
+
|
9
|
+
describe "[:mecab]" do
|
10
|
+
it "should return MecabAnalyzer" do
|
11
|
+
Chawan[:mecab].should be_kind_of(Chawan::Analyzers::MecabAnalyzer)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
describe "[:chasen]" do
|
16
|
+
it "should return ChasenAnalyzer" do
|
17
|
+
Chawan[:chasen].should be_kind_of(Chawan::Analyzers::ChasenAnalyzer)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should provide .mecab" do
|
22
|
+
Chawan.should respond_to(:mecab)
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should provide .chasen" do
|
26
|
+
Chawan.should respond_to(:chasen)
|
27
|
+
end
|
28
|
+
end
|
data/spec/api_spec.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), 'spec_helper.rb')
|
4
|
+
|
5
|
+
describe Chawan do
|
6
|
+
it "should provide .parse" do
|
7
|
+
Chawan.should respond_to(:parse)
|
8
|
+
end
|
9
|
+
|
10
|
+
describe ".parse" do
|
11
|
+
it "should delegate to analyzer" do
|
12
|
+
pending "cannot mock within rr" do
|
13
|
+
mock(Chawan).analyzer
|
14
|
+
Chawan.parse('test')
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
本日は晴天なり。
|
@@ -0,0 +1 @@
|
|
1
|
+
貴社の記者が汽車で帰社した。
|
@@ -0,0 +1 @@
|
|
1
|
+
玉露の一番搾りを一日に百五十杯飲むと、人間は死んでしまうらしいです。
|
data/spec/mecab_spec.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper.rb')
|
3
|
+
|
4
|
+
describe Chawan::Analyzers::MecabAnalyzer do
|
5
|
+
before do
|
6
|
+
@a = Chawan::Analyzers::MecabAnalyzer.new
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should provide #parse" do
|
10
|
+
@a.should respond_to(:parse)
|
11
|
+
end
|
12
|
+
|
13
|
+
describe "#parse" do
|
14
|
+
it "should return an Array of Node" do
|
15
|
+
text = data("example1.txt")
|
16
|
+
@a.parse(text).each do |node|
|
17
|
+
node.should be_kind_of(Chawan::Node)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should work as expected" do
|
22
|
+
text = data("example1.txt")
|
23
|
+
@a.parse(text).map(&:inspect).join.should ==
|
24
|
+
"<名詞: '本日'><助詞: 'は'><名詞: '晴天'><助動詞: 'なり'><記号: '。'>"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
data/spec/node_spec.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper.rb')
|
3
|
+
|
4
|
+
describe Chawan::Node do
|
5
|
+
subject { Chawan::Node.new(["Ruby","名詞"], ["表記","品詞"]) }
|
6
|
+
|
7
|
+
its('keys') {should == ["表記","品詞"] }
|
8
|
+
its('vals') {should == ["Ruby","名詞"] }
|
9
|
+
its('attributes') {should == {"表記"=>"Ruby", "品詞"=>"名詞"}}
|
10
|
+
its('word') {should == "Ruby"}
|
11
|
+
its('category') {should == "名詞"}
|
12
|
+
its('inspect') {should == "<名詞: 'Ruby'>"}
|
13
|
+
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: chawan
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- maiha
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-01-03 00:00:00 +09:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: A cup for chasen that provides an easy to use for extracting Japanese
|
17
|
+
email: maiha@wota.jp
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README
|
24
|
+
- MIT-LICENSE
|
25
|
+
files:
|
26
|
+
- MIT-LICENSE
|
27
|
+
- README
|
28
|
+
- Rakefile
|
29
|
+
- lib/chawan/analyzers.rb
|
30
|
+
- lib/chawan/analyzers/mecab_analyzer.rb
|
31
|
+
- lib/chawan/analyzers/chasen_analyzer.rb
|
32
|
+
- lib/chawan/analyzers/abstract_analyzer.rb
|
33
|
+
- lib/chawan/analyzers/manager.rb
|
34
|
+
- lib/chawan/node.rb
|
35
|
+
- lib/chawan/commands.rb
|
36
|
+
- lib/chawan.rb
|
37
|
+
- spec/api_spec.rb
|
38
|
+
- spec/mecab_spec.rb
|
39
|
+
- spec/analyzer_spec.rb
|
40
|
+
- spec/fixtures/example1.txt
|
41
|
+
- spec/fixtures/example2.txt
|
42
|
+
- spec/fixtures/example3.txt
|
43
|
+
- spec/spec_helper.rb
|
44
|
+
- spec/node_spec.rb
|
45
|
+
has_rdoc: true
|
46
|
+
homepage: http://github.com/maiha/chawan
|
47
|
+
licenses: []
|
48
|
+
|
49
|
+
post_install_message:
|
50
|
+
rdoc_options: []
|
51
|
+
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: "0"
|
59
|
+
version:
|
60
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: "0"
|
65
|
+
version:
|
66
|
+
requirements: []
|
67
|
+
|
68
|
+
rubyforge_project: asakusarb
|
69
|
+
rubygems_version: 1.3.5
|
70
|
+
signing_key:
|
71
|
+
specification_version: 3
|
72
|
+
summary: A cup for chasen that provides an easy to use for extracting Japanese
|
73
|
+
test_files: []
|
74
|
+
|