chawan 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +67 -0
- data/Rakefile +51 -0
- data/lib/chawan/analyzers/abstract_analyzer.rb +15 -0
- data/lib/chawan/analyzers/chasen_analyzer.rb +27 -0
- data/lib/chawan/analyzers/manager.rb +43 -0
- data/lib/chawan/analyzers/mecab_analyzer.rb +35 -0
- data/lib/chawan/analyzers.rb +23 -0
- data/lib/chawan/commands.rb +11 -0
- data/lib/chawan/node.rb +40 -0
- data/lib/chawan.rb +12 -0
- data/spec/analyzer_spec.rb +28 -0
- data/spec/api_spec.rb +18 -0
- data/spec/fixtures/example1.txt +1 -0
- data/spec/fixtures/example2.txt +1 -0
- data/spec/fixtures/example3.txt +1 -0
- data/spec/mecab_spec.rb +27 -0
- data/spec/node_spec.rb +13 -0
- data/spec/spec_helper.rb +10 -0
- metadata +74 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010 [maiha@wota.jp]
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
chawan
|
2
|
+
======
|
3
|
+
|
4
|
+
A cup for chasen that provides an easy-to-use interface for extracting Japanese words
|
5
|
+
|
6
|
+
|
7
|
+
Methods
|
8
|
+
=======
|
9
|
+
|
10
|
+
* Chawan.parse(text)
|
11
|
+
parse the given text with the current analyzer (the default analyzer is :mecab)
|
12
|
+
|
13
|
+
* Chawan.analyzer(xxx) (same as Chawan[xxx], Chawan.xxx)
|
14
|
+
specify analyzer
|
15
|
+
|
16
|
+
|
17
|
+
Class
|
18
|
+
=====
|
19
|
+
|
20
|
+
* Chawan::Node (Chawan.parse returns an array of Chawan::Node)
|
21
|
+
#category : part of speech
|
22
|
+
#word : text
|
23
|
+
#attributes : keys and vals hash
|
24
|
+
|
25
|
+
|
26
|
+
Example
|
27
|
+
=======
|
28
|
+
|
29
|
+
Chawan[:mecab].parse('test')
|
30
|
+
=> [<名詞: 'test'>]
|
31
|
+
|
32
|
+
# same as
|
33
|
+
# Chawan.mecab.parse('test')
|
34
|
+
# Chawan.analyzer(:mecab).parse('test')
|
35
|
+
# Chawan.parse('test') # default analyzer is :mecab
|
36
|
+
|
37
|
+
Chawan[:chasen].parse('test')
|
38
|
+
# ChasenAnalyzer is not implemented yet
|
39
|
+
|
40
|
+
|
41
|
+
Chawan.parse('本日は晴天なり')
|
42
|
+
=> [<名詞: '本日'>, <助詞: 'は'>, <名詞: '晴天'>, <助動詞: 'なり'>]
|
43
|
+
|
44
|
+
Chawan.parse('本日は晴天なり').select{|node| node.category == '名詞'}.join
|
45
|
+
=> "本日晴天"
|
46
|
+
|
47
|
+
|
48
|
+
Required
|
49
|
+
========
|
50
|
+
|
51
|
+
* UTF-8
|
52
|
+
* 'mecab' unix command (and its path)
|
53
|
+
|
54
|
+
|
55
|
+
Todo
|
56
|
+
====
|
57
|
+
|
58
|
+
* implement ChasenAnalyzer
|
59
|
+
* gateway interface to Chawan#parse such as grep, noun, ...
|
60
|
+
* use open3 rather than backquote for executing unix commands
|
61
|
+
|
62
|
+
|
63
|
+
Author
|
64
|
+
======
|
65
|
+
|
66
|
+
maiha@wota.jp
|
67
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake/gempackagetask'
|
3
|
+
|
4
|
+
# Gem identity; reused by the gemspec and by the install/uninstall tasks below.
GEM_NAME = "chawan"
AUTHOR = "maiha"
EMAIL = "maiha@wota.jp"
HOMEPAGE = "http://github.com/maiha/chawan"
SUMMARY = "A cup for chasen that provides an easy to use for extracting Japanese"
GEM_VERSION = "0.0.1"

spec = Gem::Specification.new do |s|
  s.rubyforge_project = 'asakusarb'
  s.executables = []
  s.name = GEM_NAME
  s.version = GEM_VERSION
  s.platform = Gem::Platform::RUBY
  s.has_rdoc = true
  s.extra_rdoc_files = ["README", "MIT-LICENSE"]
  s.summary = SUMMARY
  s.description = s.summary
  s.author = AUTHOR
  s.email = EMAIL
  s.homepage = HOMEPAGE
  s.require_path = 'lib'
  s.files = %w(MIT-LICENSE README Rakefile) + Dir.glob("{lib,spec,app,public,stubs}/**/*")
end

# Defines the packaging tasks (including :package, which builds pkg/*.gem).
Rake::GemPackageTask.new(spec) do |pkg|
  pkg.gem_spec = spec
end

# The original tasks called Merb::RakeHelper, which this Rakefile never loads,
# so they failed with NameError. Use the `gem` command directly instead.
desc "Install the gem"
task :install => :package do
  sh "gem install pkg/#{GEM_NAME}-#{GEM_VERSION}.gem"
end

desc "Uninstall the gem"
task :uninstall do
  sh "gem uninstall #{GEM_NAME} --version #{GEM_VERSION}"
end

desc "Create a gemspec file"
task :gemspec do
  File.open("#{GEM_NAME}.gemspec", "w") do |file|
    file.puts spec.to_ruby
  end
end

require 'spec/rake/spectask'

# The default task depends on 'spec', so that task has to exist.
desc 'Run spec examples'
Spec::Rake::SpecTask.new(:spec) do |t|
  t.spec_files = FileList['spec/**/*_spec.rb']
end

desc 'Default: run spec examples'
task :default => 'spec'
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Chawan
  module Analyzers
    # Common base for analyzers: keeps an immutable snapshot of the options
    # and declares the #parse entry point that subclasses must provide.
    class AbstractAnalyzer
      # @return [Hash] frozen copy of the options passed to the constructor
      def options
        @options
      end

      # @param options [Hash] analyzer settings; copied, so the caller's hash
      #   is left unfrozen and unshared
      def initialize(options = {})
        snapshot = options.dup
        snapshot.freeze
        @options = snapshot
      end

      # Placeholder implementation.
      #
      # @param text [String] text to analyze (unused here)
      # @raise [NotImplementedError] always, naming the concrete class
      def parse(text)
        message = format('%s#parse should be implemented', self.class)
        raise NotImplementedError, message
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
|
2
|
+
module Chawan
  module Analyzers
    # Analyzer that delegates to the 'chasen' extension. Node construction
    # (#instantiate) is left empty here; subclasses override it.
    class ChasenAnalyzer < AbstractAnalyzer
      # Field names handed to #instantiate; subclasses define their own list.
      Fields = []

      # Runs the analyzer over +text+ and builds one object per output line.
      #
      # @param text [String] text to analyze
      # @return [Array] results of #instantiate for every line before "EOS"
      # @raise [CannotAnalyze] when the last output line is not "EOS"
      def parse(text)
        output_lines = execute(text).split(/\n/)
        terminator = output_lines.pop
        raise CannotAnalyze unless terminator == "EOS"

        field_names = self.class::Fields
        output_lines.map { |line| instantiate(line, field_names) }
      end

      private

      # Loads the chasen extension on first use and returns its raw output.
      def execute(text)
        require 'chasen'
        # format = %w[%m %y %M %Y %h %P- %t %T- %f %F- %?U/unknown/known/].join("\t") + "\t\n"
        Chasen.getopt('-i', 'w')
        Chasen.sparse(text)
      end

      # Intentionally empty (returns nil) in this class.
      def instantiate(line, fields)
      end
    end
  end
end
|
27
|
+
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Chawan
  module Analyzers
    # Registry of named analyzers, meant to be mixed into a module with
    # +extend+. Names are always stored and looked up as Strings, so Symbols
    # and Strings are interchangeable.
    module Manager
      # @return [Hash{String => Object}] registered analyzers keyed by name
      def analyzers
        @analyzers ||= {}
      end

      # @param name [#to_s, nil] analyzer name, or nil for the current one
      # @return [Object] the named analyzer, or the current analyzer
      # @raise [AnalyzerNotFound] when +name+ is given but not registered
      # @raise [AnalyzerNotSetup] when +name+ is nil and nothing was set up
      def analyzer(name = nil)
        name ? analyzer_for(name) : current_analyzer
      end

      # @param name [#to_s] analyzer name
      # @return [Object] the registered analyzer
      # @raise [AnalyzerNotFound] with the name as message when unknown
      def analyzer_for(name)
        analyzers[name.to_s] || raise(AnalyzerNotFound, name.to_s)
      end

      # @return [Object] the analyzer chosen by the last #setup call
      # @raise [AnalyzerNotSetup] when #setup has not been called yet
      def current_analyzer
        @analyzer || raise(AnalyzerNotSetup)
      end

      # Registers +analyzer+ under +name+ and, unless the receiver already
      # responds to that name, adds a shortcut method returning the analyzer.
      #
      # @param name [#to_s] analyzer name
      # @param analyzer [Object] analyzer instance
      def define_analyzer(name, analyzer)
        # Normalize so a Symbol-registered analyzer is found by #analyzer_for.
        key = name.to_s
        analyzers[key] = analyzer

        unless respond_to?(key)
          # Define the shortcut on the receiver itself, without string eval.
          singleton = class << self; self; end
          singleton.send(:define_method, key) { self[key] }
        end
      end

      # Optionally registers +setter+ under +name+, then makes that analyzer
      # the current one.
      #
      # @param name [#to_s] analyzer name
      # @param setter [Object, nil] analyzer to register first, if given
      # @return [Object] the analyzer that is now current
      # @raise [AnalyzerNotFound] when +name+ is not registered
      def setup(name, setter = nil)
        define_analyzer(name.to_s, setter) if setter
        @analyzer = analyzer_for(name)
      end

      # Shorthand for #analyzer_for.
      def [](name)
        analyzer_for(name)
      end
    end
  end
end
|
43
|
+
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'tempfile'
|
2
|
+
require File.dirname(__FILE__) + '/chasen_analyzer'
|
3
|
+
|
4
|
+
module Chawan
  module Analyzers
    # Analyzer backed by the external `mecab` command. The text is written to
    # a temporary file, `mecab` is run on it, and each output line becomes a
    # Node (the shared #parse logic comes from ChasenAnalyzer).
    class MecabAnalyzer < ChasenAnalyzer
      # Column names: the surface form followed by the comma-separated features.
      Fields = "表層形\t品詞,品詞細分類1,品詞細分類2,品詞細分類3,活用形,活用型,原形,読み,発音".split(/,|\s/)

      # Node type built by #instantiate; adds nothing to Chawan::Node yet.
      class Node < Chawan::Node
      end

      private

      # Builds a Node from one output line.
      #
      # The surface form is cut off at the first tab before the features are
      # split. Splitting the whole line on commas would break a token whose
      # surface is itself "," (its word and category would both become "").
      # Lines without a tab are split as before.
      def instantiate(line, fields)
        surface, features = line.split("\t", 2)
        vals = features ? [surface] + features.split(/,|\s/) : line.split(/,|\s/)
        Node.new(vals, fields)
      end

      # Runs `mecab` on a temp file holding +text+ and returns its stdout.
      # The argument-array form of IO.popen bypasses the shell, so a temp
      # directory containing spaces or shell metacharacters is harmless.
      def execute(text)
        transaction(text) do |tmp|
          IO.popen(['mecab', tmp.path]) { |io| io.read }
        end
      end

      # Writes +buffer+ to a temp file, yields the closed file to the block and
      # always removes it afterwards. Returns the block's value.
      def transaction(buffer, &block)
        tmp = Tempfile.new("mecab-text")
        begin
          tmp.print(buffer)
          tmp.close
          block.call(tmp)
        ensure
          # Only reached once tmp exists, so a failing Tempfile.new is not
          # masked by a NoMethodError on nil.
          tmp.close(true)
        end
      end
    end
  end
end
|
35
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
|
2
|
+
require File.dirname(__FILE__) + '/analyzers/manager'
|
3
|
+
require File.dirname(__FILE__) + '/analyzers/abstract_analyzer'
|
4
|
+
|
5
|
+
# Give the Chawan module the analyzer registry API.
module Chawan
  extend Analyzers::Manager
end

# Load every analyzer implementation shipped next to this file.
Dir.glob( File.dirname(__FILE__) + '/analyzers/*_analyzer.rb' ).sort.each do |path|
  require path
end

# Register one instance of each *Analyzer class (except the abstract base)
# under its lower-cased prefix. Constants are visited in sorted order and each
# setup call replaces the current analyzer, so the last one registered wins.
module Chawan
  module Analyzers
    constants.sort.each do |const_name|
      matched = /(.*?)Analyzer$/.match(const_name.to_s)
      next unless matched

      name = matched[1].downcase
      klass = Chawan::Analyzers.const_get(matched[0])
      next if name == 'abstract'

      Chawan.setup(name, klass.new)
    end
  end
end
|
23
|
+
|
data/lib/chawan/node.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
module Chawan
  # One analyzed token: a list of values plus the field names describing them.
  # By convention the first value is the word and the second its category.
  class Node
    # @return [Array] field values, in analyzer output order
    attr_reader :vals
    # @return [Array] field names, parallel to #vals
    attr_reader :keys

    # @param vals [Array] field values
    # @param keys [Array] field names, parallel to +vals+
    def initialize(vals, keys)
      @vals = vals
      @keys = keys
    end

    # @return [Hash] field name => value, built once and cached
    def attributes
      # Build from pairs; flattening first would break on array values.
      @attributes ||= Hash[keys.zip(vals)]
    end

    # Looks up a field by position or by name.
    #
    # @param index [Integer, #to_s] position in #vals, or a field name
    # @return [Object, nil] the value, or nil when there is none
    def [](index)
      case index
      when Integer
        # Was vals[keys[index]], which indexed the Array with a String.
        vals[index]
      else
        attributes[index.to_s]
      end
    end

    # @return [String] the first value as a String ("" when there is none)
    def word
      vals.first.to_s
    end

    # @return [Object, nil] the second value
    def category
      vals[1]
    end

    # @return [String] same as #word
    def to_s
      word
    end

    # @return [String] "<category: 'word'>"
    def inspect
      "<%s: '%s'>" % [category, to_s]
    end
  end
end
|
data/lib/chawan.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
|
2
|
+
module Chawan
  # Raised when an analyzer is requested under a name that is not registered.
  class AnalyzerNotFound < StandardError
  end

  # Raised when the current analyzer is requested before one has been set up.
  class AnalyzerNotSetup < StandardError
  end

  # Raised when analyzer output does not end with the expected "EOS" line.
  class CannotAnalyze < StandardError
  end
end
|
7
|
+
|
8
|
+
|
9
|
+
require File.dirname(__FILE__) + '/chawan/node'
|
10
|
+
require File.dirname(__FILE__) + '/chawan/commands'
|
11
|
+
require File.dirname(__FILE__) + '/chawan/analyzers'
|
12
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper.rb')
|
3
|
+
|
4
|
+
# Analyzer lookup API on the Chawan module: Chawan[...] and per-analyzer shortcuts.
describe Chawan do
  it("should provide .[]") { Chawan.should respond_to(:[]) }

  describe "[:mecab]" do
    it "should return MecabAnalyzer" do
      found = Chawan[:mecab]
      found.should be_kind_of(Chawan::Analyzers::MecabAnalyzer)
    end
  end

  describe "[:chasen]" do
    it "should return ChasenAnalyzer" do
      found = Chawan[:chasen]
      found.should be_kind_of(Chawan::Analyzers::ChasenAnalyzer)
    end
  end

  it("should provide .mecab") { Chawan.should respond_to(:mecab) }

  it("should provide .chasen") { Chawan.should respond_to(:chasen) }
end
|
data/spec/api_spec.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), 'spec_helper.rb')
|
4
|
+
|
5
|
+
# Top-level parsing API on the Chawan module.
describe Chawan do
  it("should provide .parse") { Chawan.should respond_to(:parse) }

  describe ".parse" do
    it "should delegate to analyzer" do
      # Kept pending: the delegation could not be mocked with rr.
      pending("cannot mock within rr") do
        mock(Chawan).analyzer
        Chawan.parse('test')
      end
    end
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
本日は晴天なり。
|
@@ -0,0 +1 @@
|
|
1
|
+
貴社の記者が汽車で帰社した。
|
@@ -0,0 +1 @@
|
|
1
|
+
玉露の一番搾りを一日に百五十杯飲むと、人間は死んでしまうらしいです。
|
data/spec/mecab_spec.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper.rb')
|
3
|
+
|
4
|
+
# MecabAnalyzer#parse against the example1.txt fixture.
describe Chawan::Analyzers::MecabAnalyzer do
  before { @a = Chawan::Analyzers::MecabAnalyzer.new }

  it("should provide #parse") { @a.should respond_to(:parse) }

  describe "#parse" do
    it "should return an Array of Node" do
      nodes = @a.parse(data("example1.txt"))
      nodes.each { |node| node.should be_kind_of(Chawan::Node) }
    end

    it "should work as expected" do
      nodes = @a.parse(data("example1.txt"))
      rendered = nodes.map { |node| node.inspect }.join
      rendered.should == "<名詞: '本日'><助詞: 'は'><名詞: '晴天'><助動詞: 'なり'><記号: '。'>"
    end
  end
end
|
data/spec/node_spec.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper.rb')
|
3
|
+
|
4
|
+
# Accessors of a Node built from two values and two field names.
describe Chawan::Node do
  subject do
    Chawan::Node.new(["Ruby","名詞"], ["表記","品詞"])
  end

  its('keys')       { should == ["表記","品詞"] }
  its('vals')       { should == ["Ruby","名詞"] }
  its('attributes') { should == {"表記"=>"Ruby", "品詞"=>"名詞"} }
  its('word')       { should == "Ruby" }
  its('category')   { should == "名詞" }
  its('inspect')    { should == "<名詞: 'Ruby'>" }
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: chawan
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- maiha
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-01-03 00:00:00 +09:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: A cup for chasen that provides an easy to use for extracting Japanese
|
17
|
+
email: maiha@wota.jp
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README
|
24
|
+
- MIT-LICENSE
|
25
|
+
files:
|
26
|
+
- MIT-LICENSE
|
27
|
+
- README
|
28
|
+
- Rakefile
|
29
|
+
- lib/chawan/analyzers.rb
|
30
|
+
- lib/chawan/analyzers/mecab_analyzer.rb
|
31
|
+
- lib/chawan/analyzers/chasen_analyzer.rb
|
32
|
+
- lib/chawan/analyzers/abstract_analyzer.rb
|
33
|
+
- lib/chawan/analyzers/manager.rb
|
34
|
+
- lib/chawan/node.rb
|
35
|
+
- lib/chawan/commands.rb
|
36
|
+
- lib/chawan.rb
|
37
|
+
- spec/api_spec.rb
|
38
|
+
- spec/mecab_spec.rb
|
39
|
+
- spec/analyzer_spec.rb
|
40
|
+
- spec/fixtures/example1.txt
|
41
|
+
- spec/fixtures/example2.txt
|
42
|
+
- spec/fixtures/example3.txt
|
43
|
+
- spec/spec_helper.rb
|
44
|
+
- spec/node_spec.rb
|
45
|
+
has_rdoc: true
|
46
|
+
homepage: http://github.com/maiha/chawan
|
47
|
+
licenses: []
|
48
|
+
|
49
|
+
post_install_message:
|
50
|
+
rdoc_options: []
|
51
|
+
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: "0"
|
59
|
+
version:
|
60
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: "0"
|
65
|
+
version:
|
66
|
+
requirements: []
|
67
|
+
|
68
|
+
rubyforge_project: asakusarb
|
69
|
+
rubygems_version: 1.3.5
|
70
|
+
signing_key:
|
71
|
+
specification_version: 3
|
72
|
+
summary: A cup for chasen that provides an easy to use for extracting Japanese
|
73
|
+
test_files: []
|
74
|
+
|