chawan 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/MIT-LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 [maiha@wota.jp]
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,67 @@
1
+ chawan
2
+ ======
3
+
4
+ A cup for chasen that provides an easy-to-use interface for extracting Japanese words
5
+
6
+
7
+ Methods
8
+ =======
9
+
10
+ * Chawan.parse(text)
11
+ parse the given text with the current analyzer (the default analyzer is :mecab)
12
+
13
+ * Chawan.analyzer(xxx) (same as Chawan[xxx], Chawan.xxx)
14
+ specify analyzer
15
+
16
+
17
+ Class
18
+ =====
19
+
20
+ * Chawan::Node (Chawan.parse returns an array of Chawan::Node)
21
+ #category : part of speech
22
+ #word : text
23
+ #attributes : keys and vals hash
24
+
25
+
26
+ Example
27
+ =======
28
+
29
+ Chawan[:mecab].parse('test')
30
+ => [<名詞: 'test'>]
31
+
32
+ # same as
33
+ # Chawan.mecab.parse('test')
34
+ # Chawan.analyzer(:mecab).parse('test')
35
+ # Chawan.parse('test') # default analyzer is :mecab
36
+
37
+ Chawan[:chasen].parse('test')
38
+ # ChasenAnalyzer is not implemented yet
39
+
40
+
41
+ Chawan.parse('本日は晴天なり')
42
+ => [<名詞: '本日'>, <助詞: 'は'>, <名詞: '晴天'>, <助動詞: 'なり'>]
43
+
44
+ Chawan.parse('本日は晴天なり').select{|node| node.category == '名詞'}.join
45
+ => "本日晴天"
46
+
47
+
48
+ Required
49
+ ========
50
+
51
+ * UTF-8
52
+ * 'mecab' unix command (and its path)
53
+
54
+
55
+ Todo
56
+ ====
57
+
58
+ * implement ChasenAnalyzer
59
+ * gateway interface to Chawan#parse such as grep, noun, ...
60
+ * use open3 rather than backquote for executing unix commands
61
+
62
+
63
+ Author
64
+ ======
65
+
66
+ maiha@wota.jp
67
+
data/Rakefile ADDED
@@ -0,0 +1,51 @@
1
+ require 'rubygems'
2
+ require 'rake/gempackagetask'
3
+
4
# Gem metadata shared by the specification and the tasks below.
GEM_NAME = "chawan"
AUTHOR = "maiha"
EMAIL = "maiha@wota.jp"
HOMEPAGE = "http://github.com/maiha/chawan"
SUMMARY = "A cup for chasen that provides an easy to use for extracting Japanese"
GEM_VERSION = "0.0.1"

# Gem specification used by the packaging task and by `rake gemspec`.
spec = Gem::Specification.new do |s|
  s.rubyforge_project = 'asakusarb'
  s.executables = []
  s.name = GEM_NAME
  s.version = GEM_VERSION
  s.platform = Gem::Platform::RUBY
  s.has_rdoc = true
  s.extra_rdoc_files = ["README", "MIT-LICENSE"]
  s.summary = SUMMARY
  s.description = s.summary
  s.author = AUTHOR
  s.email = EMAIL
  s.homepage = HOMEPAGE
  s.require_path = 'lib'
  s.files = %w(MIT-LICENSE README Rakefile) + Dir.glob("{lib,spec,app,public,stubs}/**/*")
end

# Defines the :package / :gem tasks, which build the gem under pkg/.
Rake::GemPackageTask.new(spec) do |pkg|
  pkg.gem_spec = spec
end

# The install/uninstall tasks shell out to the `gem` command. They used to
# call Merb::RakeHelper, but nothing in this Rakefile loads Merb, so those
# tasks could only fail with a NameError.
desc "Install the gem"
task :install => :package do
  sh "gem install pkg/#{GEM_NAME}-#{GEM_VERSION}.gem"
end

desc "Uninstall the gem"
task :uninstall do
  sh "gem uninstall #{GEM_NAME} --version #{GEM_VERSION}"
end

desc "Create a gemspec file"
task :gemspec do
  File.open("#{GEM_NAME}.gemspec", "w") do |file|
    file.puts spec.to_ruby
  end
end

require 'spec/rake/spectask'

# The default task depends on 'spec', so that task has to exist; requiring
# spectask alone does not define it.
desc 'Run spec examples'
Spec::Rake::SpecTask.new(:spec) do |t|
  t.spec_files = FileList['spec/**/*_spec.rb']
end

desc 'Default: run spec examples'
task :default => 'spec'
@@ -0,0 +1,15 @@
1
module Chawan
  module Analyzers
    # Base class for analyzers. It keeps a frozen copy of the options it was
    # built with and leaves #parse for subclasses to provide.
    class AbstractAnalyzer
      attr_reader :options

      # options:: settings for the analyzer; the caller's object is copied,
      #           not frozen.
      def initialize(options = {})
        snapshot = options.dup
        @options = snapshot.freeze
      end

      # Subclasses must override this; the base implementation always raises
      # NotImplementedError.
      def parse(text)
        message = "#{self.class}#parse should be implemented"
        raise NotImplementedError, message
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+
2
module Chawan
  module Analyzers
    # Analyzer that runs text through the 'chasen' library. Subclasses reuse
    # #parse and supply their own Fields, #execute and #instantiate.
    class ChasenAnalyzer < AbstractAnalyzer
      # Field names handed to #instantiate; subclasses define their own.
      Fields = []

      # Runs the analyzer on +text+ and builds one object per output line.
      # Raises CannotAnalyze unless the last output line is "EOS".
      def parse(text)
        output_lines = execute(text).split(/\n/)
        terminator = output_lines.pop
        raise CannotAnalyze unless terminator == "EOS"

        output_lines.collect do |line|
          instantiate(line, self.class::Fields)
        end
      end

      private

      # Returns the raw analyzer output for +text+. The library is required
      # here rather than at load time.
      def execute(text)
        require 'chasen'
        # format = %w[%m %y %M %Y %h %P- %t %T- %f %F- %?U/unknown/known/].join("\t") + "\t\n"
        Chasen.getopt('-i', 'w')
        Chasen.sparse(text)
      end

      # Placeholder: builds nothing and returns nil.
      def instantiate(line, fields)
      end
    end
  end
end
27
+
@@ -0,0 +1,43 @@
1
module Chawan
  module Analyzers
    # Registry mixin: keeps named analyzers, tracks the current one, and
    # defines a shortcut method per registered name on the extending object.
    module Manager
      # Hash of registered analyzers, keyed by name as a String.
      def analyzers
        @analyzers ||= {}
      end

      # With a name, returns that analyzer; without, the current one.
      def analyzer(name = nil)
        name ? analyzer_for(name) : current_analyzer
      end

      # Looks up an analyzer by name (String or Symbol).
      # Raises AnalyzerNotFound when nothing is registered under it.
      def analyzer_for(name)
        key = name.to_s
        found = analyzers[key]
        raise AnalyzerNotFound, key unless found
        found
      end

      # Returns the analyzer chosen by the last #setup call.
      # Raises AnalyzerNotSetup when none has been chosen.
      def current_analyzer
        @analyzer or raise AnalyzerNotSetup
      end

      # Registers +analyzer+ under +name+ and defines a same-named shortcut
      # method unless the receiver already responds to that name.
      # Returns the analyzer.
      def define_analyzer(name, analyzer)
        # Lookups always use name.to_s, so store under the String form too;
        # otherwise a Symbol registration could never be found again.
        key = name.to_s
        analyzers[key] = analyzer

        unless respond_to?(key)
          # Defined on the receiver's singleton class instead of eval-ing
          # source text, so unusual names cannot inject or break code.
          (class << self; self; end).send(:define_method, key) { analyzer_for(key) }
        end
        analyzer
      end

      # Makes +name+ the current analyzer, registering +setter+ under it
      # first when one is given. Returns the current analyzer.
      def setup(name, setter = nil)
        define_analyzer(name, setter) if setter
        @analyzer = analyzer_for(name)
      end

      # Same as #analyzer_for.
      def [](name)
        analyzer_for(name)
      end
    end
  end
end
43
+
@@ -0,0 +1,35 @@
1
+ require 'tempfile'
2
+ require File.dirname(__FILE__) + '/chasen_analyzer'
3
+
4
module Chawan
  module Analyzers
    # Analyzer that runs the 'mecab' command on a temporary file and turns
    # each output line into a Node. #parse is inherited from ChasenAnalyzer.
    class MecabAnalyzer < ChasenAnalyzer
      # Column names: the surface form followed by the feature columns.
      Fields = "表層形\t品詞,品詞細分類1,品詞細分類2,品詞細分類3,活用形,活用型,原形,読み,発音".split(/,|\s/)

      # Node class built by #instantiate; adds nothing to Chawan::Node yet.
      class Node < Chawan::Node
      end

      private

      # Builds a Node from one output line. The line is split at the first
      # tab into surface form and feature list, and only the feature list is
      # split on commas, so a token that is itself a comma stays intact.
      # NOTE(review): assumes the default "surface<TAB>features" output
      # layout that Fields describes; confirm if a custom format is used.
      def instantiate(line, fields)
        surface, features = line.split("\t", 2)
        Node.new([surface.to_s] + features.to_s.split(","), fields)
      end

      # Returns the output of running mecab on +text+.
      def execute(text)
        transaction(text) do |tmp|
          `mecab #{tmp.path}`
        end
      end

      # Writes +buffer+ to a temp file, yields the closed file to the block,
      # and removes the file afterwards. Returns the block's result.
      def transaction(buffer, &block)
        tmp = Tempfile.new("mecab-text")
        tmp.print(buffer)
        tmp.close
        block.call(tmp)
      ensure
        # tmp is nil when Tempfile.new itself failed; guard so the cleanup
        # does not hide that error behind a NoMethodError.
        tmp.close(true) if tmp
      end
    end
  end
end
35
+
@@ -0,0 +1,23 @@
1
+
2
+ require File.dirname(__FILE__) + '/analyzers/manager'
3
+ require File.dirname(__FILE__) + '/analyzers/abstract_analyzer'
4
+
5
module Chawan
  extend Analyzers::Manager
end

# Load every analyzer implementation that sits next to this file.
Dir.glob(File.dirname(__FILE__) + '/analyzers/*_analyzer.rb').sort.each do |path|
  require path
end

module Chawan
  module Analyzers
    # Register one instance of each *Analyzer class under its lower-cased
    # prefix (MecabAnalyzer -> "mecab"); the abstract base is skipped.
    constants.map { |const| const.to_s }.sort.each do |const_name|
      next unless const_name =~ /(.*?)Analyzer$/
      name = $1.downcase
      next if name == 'abstract'
      Chawan.setup(name, const_get(const_name).new)
    end
  end
end

# Each setup call above also makes that analyzer current, so the default
# would otherwise be whichever name sorts last. Select mecab explicitly so
# that adding another analyzer cannot change the default.
Chawan.setup('mecab')
23
+
@@ -0,0 +1,11 @@
1
module Chawan
  # Convenience commands made available directly on Chawan.
  module Commands
    # Parses +text+ with whatever #analyzer returns on the receiver.
    def parse(text)
      current = analyzer
      current.parse(text)
    end
  end

  extend Commands
end
@@ -0,0 +1,40 @@
1
module Chawan
  # One analysed token: a list of values and the list of key names that
  # labels them position by position.
  class Node
    attr_reader :vals
    attr_reader :keys

    # vals:: values of the token; the first is the word, the second its
    #        category.
    # keys:: names for the values, in the same order.
    def initialize(vals, keys)
      @vals = vals
      @keys = keys
    end

    # Hash of key => value, built on first use and then cached.
    def attributes
      # Built from the pairs directly; flattening them first would break as
      # soon as a value is itself an array.
      @attributes ||= Hash[keys.zip(vals)]
    end

    # Integer index: the value at that position.
    # Anything else: the value stored under that key (compared as a String).
    def [](index)
      case index
      when Integer
        # Positions index the values themselves. Going through
        # keys[index] produced a String and raised TypeError.
        vals[index]
      else
        attributes[index.to_s]
      end
    end

    # The token text (first value), always as a String.
    def word
      vals.first.to_s
    end

    # The category / part of speech (second value).
    def category
      vals[1]
    end

    def to_s
      word
    end

    # Short form such as <名詞: 'Ruby'>.
    def inspect
      "<%s: '%s'>" % [category, to_s]
    end
  end
end
data/lib/chawan.rb ADDED
@@ -0,0 +1,12 @@
1
+
2
module Chawan
  # Raised when no analyzer is registered under the requested name.
  class AnalyzerNotFound < StandardError
  end

  # Raised when the current analyzer is requested before one was set up.
  class AnalyzerNotSetup < StandardError
  end

  # Raised when analyzer output does not end with the expected "EOS" line.
  class CannotAnalyze < StandardError
  end
end
7
+
8
+
9
+ require File.dirname(__FILE__) + '/chawan/node'
10
+ require File.dirname(__FILE__) + '/chawan/commands'
11
+ require File.dirname(__FILE__) + '/chawan/analyzers'
12
+
@@ -0,0 +1,28 @@
1
+
2
+ require File.join(File.dirname(__FILE__), 'spec_helper.rb')
3
+
4
describe Chawan do
  # Chawan must expose the lookup operator and one shortcut per analyzer.
  %w([] mecab chasen).each do |method_name|
    it "should provide .#{method_name}" do
      Chawan.should respond_to(method_name.to_sym)
    end
  end

  # Looking an analyzer up by name must return an instance of its class.
  %w(Mecab Chasen).each do |prefix|
    key = prefix.downcase.to_sym

    describe "[#{key.inspect}]" do
      it "should return #{prefix}Analyzer" do
        expected = Chawan::Analyzers.const_get("#{prefix}Analyzer")
        Chawan[key].should be_kind_of(expected)
      end
    end
  end
end
data/spec/api_spec.rb ADDED
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require File.join(File.dirname(__FILE__), 'spec_helper.rb')
4
+
5
describe Chawan do
  it "should provide .parse" do
    Chawan.should respond_to(:parse)
  end

  describe ".parse" do
    # Kept pending: the example is marked as not mockable with rr.
    it "should delegate to analyzer" do
      pending("cannot mock within rr") do
        mock(Chawan).analyzer
        Chawan.parse('test')
      end
    end
  end
end
@@ -0,0 +1 @@
1
+ 本日は晴天なり。
@@ -0,0 +1 @@
1
+ 貴社の記者が汽車で帰社した。
@@ -0,0 +1 @@
1
+ 玉露の一番搾りを一日に百五十杯飲むと、人間は死んでしまうらしいです。
@@ -0,0 +1,27 @@
1
+
2
+ require File.join(File.dirname(__FILE__), 'spec_helper.rb')
3
+
4
describe Chawan::Analyzers::MecabAnalyzer do
  before do
    @analyzer = Chawan::Analyzers::MecabAnalyzer.new
  end

  it "should provide #parse" do
    @analyzer.should respond_to(:parse)
  end

  describe "#parse" do
    it "should return an Array of Node" do
      nodes = @analyzer.parse(data("example1.txt"))
      nodes.each { |node| node.should be_kind_of(Chawan::Node) }
    end

    # Compares the inspect form of every node, joined together, against the
    # expected analysis of the fixture sentence.
    it "should work as expected" do
      rendered = @analyzer.parse(data("example1.txt")).map { |node| node.inspect }.join
      rendered.should == "<名詞: '本日'><助詞: 'は'><名詞: '晴天'><助動詞: 'なり'><記号: '。'>"
    end
  end
end
data/spec/node_spec.rb ADDED
@@ -0,0 +1,13 @@
1
+
2
+ require File.join(File.dirname(__FILE__), 'spec_helper.rb')
3
+
4
describe Chawan::Node do
  # A two-column node: values first, then the key names labelling them.
  subject do
    Chawan::Node.new(["Ruby", "名詞"], ["表記", "品詞"])
  end

  its('keys')       { should == ["表記", "品詞"] }
  its('vals')       { should == ["Ruby", "名詞"] }
  its('attributes') { should == {"表記" => "Ruby", "品詞" => "名詞"} }
  its('word')       { should == "Ruby" }
  its('category')   { should == "名詞" }
  its('inspect')    { should == "<名詞: 'Ruby'>" }
end
@@ -0,0 +1,10 @@
1
+
2
+ require 'spec'
3
+ require 'rr'
4
+
5
+ require File.join(File.dirname(__FILE__), '/../lib/chawan')
6
+
7
# Returns the contents of the fixture file named +key+ from the fixtures
# directory next to this helper.
def data(key)
  # Path segments are passed to File.join separately, and File.read is
  # called without the empty block it never used.
  File.read(File.join(File.dirname(__FILE__), "fixtures", key))
end
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: chawan
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - maiha
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-03 00:00:00 +09:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: A cup for chasen that provides an easy to use for extracting Japanese
17
+ email: maiha@wota.jp
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - README
24
+ - MIT-LICENSE
25
+ files:
26
+ - MIT-LICENSE
27
+ - README
28
+ - Rakefile
29
+ - lib/chawan/analyzers.rb
30
+ - lib/chawan/analyzers/mecab_analyzer.rb
31
+ - lib/chawan/analyzers/chasen_analyzer.rb
32
+ - lib/chawan/analyzers/abstract_analyzer.rb
33
+ - lib/chawan/analyzers/manager.rb
34
+ - lib/chawan/node.rb
35
+ - lib/chawan/commands.rb
36
+ - lib/chawan.rb
37
+ - spec/api_spec.rb
38
+ - spec/mecab_spec.rb
39
+ - spec/analyzer_spec.rb
40
+ - spec/fixtures/example1.txt
41
+ - spec/fixtures/example2.txt
42
+ - spec/fixtures/example3.txt
43
+ - spec/spec_helper.rb
44
+ - spec/node_spec.rb
45
+ has_rdoc: true
46
+ homepage: http://github.com/maiha/chawan
47
+ licenses: []
48
+
49
+ post_install_message:
50
+ rdoc_options: []
51
+
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: "0"
59
+ version:
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: "0"
65
+ version:
66
+ requirements: []
67
+
68
+ rubyforge_project: asakusarb
69
+ rubygems_version: 1.3.5
70
+ signing_key:
71
+ specification_version: 3
72
+ summary: A cup for chasen that provides an easy to use for extracting Japanese
73
+ test_files: []
74
+