chawan 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/MIT-LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 [maiha@wota.jp]
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,67 @@
1
+ chawan
2
+ ======
3
+
4
+ A cup for chasen that provides an easy-to-use interface for extracting Japanese words
5
+
6
+
7
+ Methods
8
+ =======
9
+
10
+ * Chawan.parse(text)
11
+ parses the given text with the current analyzer (the default analyzer is :mecab)
12
+
13
+ * Chawan.analyzer(xxx) (same as Chawan[xxx], Chawan.xxx)
14
+ specify analyzer
15
+
16
+
17
+ Class
18
+ =====
19
+
20
+ * Chawan::Node (Chawan.parse returns an array of Chawan::Node)
21
+ #category : part of speech
22
+ #word : text
23
+ #attributes : hash that maps each key to its val
24
+
25
+
26
+ Example
27
+ =======
28
+
29
+ Chawan[:mecab].parse('test')
30
+ => [<名詞: 'test'>]
31
+
32
+ # same as
33
+ # Chawan.mecab.parse('test')
34
+ # Chawan.analyzer(:mecab).parse('test')
35
+ # Chawan.parse('test') # default analyzer is :mecab
36
+
37
+ Chawan[:chasen].parse('test')
38
+ # ChasenAnalyzer is not implemented yet
39
+
40
+
41
+ Chawan.parse('本日は晴天なり')
42
+ => [<名詞: '本日'>, <助詞: 'は'>, <名詞: '晴天'>, <助動詞: 'なり'>]
43
+
44
+ Chawan.parse('本日は晴天なり').select{|node| node.category == '名詞'}.join
45
+ => "本日晴天"
46
+
47
+
48
+ Required
49
+ ========
50
+
51
+ * UTF-8
52
+ * 'mecab' unix command (and its path)
53
+
54
+
55
+ Todo
56
+ ====
57
+
58
+ * implement ChasenAnalyzer
59
+ * gateway interface to Chawan#parse such as grep, noun, ...
60
+ * use open3 rather than backquote for executing unix commands
61
+
62
+
63
+ Author
64
+ ======
65
+
66
+ maiha@wota.jp
67
+
data/Rakefile ADDED
@@ -0,0 +1,51 @@
1
+ require 'rubygems'
2
+ require 'rake/gempackagetask'
3
+
4
+ GEM_NAME = "chawan"
5
+ AUTHOR = "maiha"
6
+ EMAIL = "maiha@wota.jp"
7
+ HOMEPAGE = "http://github.com/maiha/chawan"
8
+ SUMMARY = "A cup for chasen that provides an easy to use for extracting Japanese"
9
+ GEM_VERSION = "0.0.1"
10
+
11
+ spec = Gem::Specification.new do |s|
12
+ s.rubyforge_project = 'asakusarb'
13
+ s.executables = []
14
+ s.name = GEM_NAME
15
+ s.version = GEM_VERSION
16
+ s.platform = Gem::Platform::RUBY
17
+ s.has_rdoc = true
18
+ s.extra_rdoc_files = ["README", "MIT-LICENSE"]
19
+ s.summary = SUMMARY
20
+ s.description = s.summary
21
+ s.author = AUTHOR
22
+ s.email = EMAIL
23
+ s.homepage = HOMEPAGE
24
+ s.require_path = 'lib'
25
+ s.files = %w(MIT-LICENSE README Rakefile) + Dir.glob("{lib,spec,app,public,stubs}/**/*")
26
+ end
27
+
28
+ Rake::GemPackageTask.new(spec) do |pkg|
29
+ pkg.gem_spec = spec
30
+ end
31
+
32
+ desc "Install the gem"
33
+ task :install do
34
+ Merb::RakeHelper.install(GEM_NAME, :version => GEM_VERSION)
35
+ end
36
+
37
+ desc "Uninstall the gem"
38
+ task :uninstall do
39
+ Merb::RakeHelper.uninstall(GEM_NAME, :version => GEM_VERSION)
40
+ end
41
+
42
+ desc "Create a gemspec file"
43
+ task :gemspec do
44
+ File.open("#{GEM_NAME}.gemspec", "w") do |file|
45
+ file.puts spec.to_ruby
46
+ end
47
+ end
48
+
49
+ require 'spec/rake/spectask'
50
+ desc 'Default: run spec examples'
51
+ task :default => 'spec'
@@ -0,0 +1,15 @@
1
+ module Chawan
2
+ module Analyzers
3
+ class AbstractAnalyzer
4
+ attr_reader :options
5
+
6
+ def initialize(options = {})
7
+ @options = options.dup.freeze
8
+ end
9
+
10
+ def parse(text)
11
+ raise NotImplementedError, "#{self.class}#parse should be implemented"
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,27 @@
1
+
2
+ module Chawan
3
+ module Analyzers
4
+ class ChasenAnalyzer < AbstractAnalyzer
5
+ Fields = []
6
+
7
+ def parse(text)
8
+ lines = execute(text).split(/\n/)
9
+ lines.pop == "EOS" or # "EOS"
10
+ raise CannotAnalyze
11
+ return lines.map{|line| instantiate(line, self.class::Fields)}
12
+ end
13
+
14
+ private
15
+ def execute(text)
16
+ require 'chasen'
17
+ # format = %w[%m %y %M %Y %h %P- %t %T- %f %F- %?U/unknown/known/].join("\t") + "\t\n"
18
+ Chasen.getopt('-i', 'w')
19
+ Chasen.sparse(text)
20
+ end
21
+
22
+ def instantiate(line, fields)
23
+ end
24
+ end
25
+ end
26
+ end
27
+
@@ -0,0 +1,43 @@
1
+ module Chawan
2
+ module Analyzers
3
+ module Manager
4
+ def analyzers
5
+ @analyzers ||= {}
6
+ end
7
+
8
+ def analyzer(name = nil)
9
+ name ? analyzer_for(name) : current_analyzer
10
+ end
11
+
12
+ def analyzer_for(name)
13
+ analyzers[name.to_s] or raise AnalyzerNotFound, name.to_s
14
+ end
15
+
16
+ def current_analyzer
17
+ @analyzer or raise AnalyzerNotSetup
18
+ end
19
+
20
+ def define_analyzer(name, analyzer)
21
+ analyzers[name] = analyzer
22
+
23
+ unless respond_to?(name)
24
+ eval(<<-RUBY)
25
+ def Chawan.#{name}
26
+ Chawan['#{name}']
27
+ end
28
+ RUBY
29
+ end
30
+ end
31
+
32
+ def setup(name, setter = nil)
33
+ define_analyzer(name.to_s, setter) if setter
34
+ @analyzer = analyzer_for(name)
35
+ end
36
+
37
+ def [](name)
38
+ analyzer_for(name)
39
+ end
40
+ end
41
+ end
42
+ end
43
+
@@ -0,0 +1,35 @@
1
+ require 'tempfile'
2
+ require File.dirname(__FILE__) + '/chasen_analyzer'
3
+
4
+ module Chawan
5
+ module Analyzers
6
+ class MecabAnalyzer < ChasenAnalyzer
7
+ Fields = "表層形\t品詞,品詞細分類1,品詞細分類2,品詞細分類3,活用形,活用型,原形,読み,発音".split(/,|\s/)
8
+
9
+ class Node < Chawan::Node
10
+ # not used yet
11
+ end
12
+
13
+ private
14
+ def instantiate(line, fields)
15
+ Node.new(line.split(/,|\s/), fields)
16
+ end
17
+
18
+ def execute(text)
19
+ transaction(text) do |tmp|
20
+ `mecab #{tmp.path}`
21
+ end
22
+ end
23
+
24
+ def transaction(buffer, &block)
25
+ tmp = Tempfile.new("mecab-text")
26
+ tmp.print(buffer)
27
+ tmp.close
28
+ block.call(tmp)
29
+ ensure
30
+ tmp.close(true)
31
+ end
32
+ end
33
+ end
34
+ end
35
+
@@ -0,0 +1,23 @@
1
+
2
+ require File.dirname(__FILE__) + '/analyzers/manager'
3
+ require File.dirname(__FILE__) + '/analyzers/abstract_analyzer'
4
+
5
+ module Chawan
6
+ extend Analyzers::Manager
7
+ end
8
+
9
+ Dir.glob( File.dirname(__FILE__) + '/analyzers/*_analyzer.rb' ).sort.each do |path|
10
+ require path
11
+ end
12
+
13
+ module Chawan
14
+ module Analyzers
15
+ constants.sort.grep(/(.*?)Analyzer$/) do
16
+ name = $1.downcase
17
+ klass = Chawan::Analyzers.const_get($&)
18
+ next if name == 'abstract'
19
+ Chawan.setup(name, klass.new)
20
+ end
21
+ end
22
+ end
23
+
@@ -0,0 +1,11 @@
1
+ module Chawan
2
+ module Commands
3
+ def parse(text)
4
+ analyzer.parse(text)
5
+ end
6
+ end
7
+ end
8
+
9
+ module Chawan
10
+ extend Commands
11
+ end
@@ -0,0 +1,40 @@
1
+ module Chawan
2
+ class Node
3
+ attr_reader :vals
4
+ attr_reader :keys
5
+
6
+ def initialize(vals, keys)
7
+ @vals = vals
8
+ @keys = keys
9
+ end
10
+
11
+ def attributes
12
+ @attributes ||= Hash[*keys.zip(vals).flatten]
13
+ end
14
+
15
+ def [](index)
16
+ case index
17
+ when Integer
18
+ vals[keys[index]]
19
+ else
20
+ attributes[index.to_s]
21
+ end
22
+ end
23
+
24
+ def word
25
+ vals.first.to_s
26
+ end
27
+
28
+ def category
29
+ vals[1]
30
+ end
31
+
32
+ def to_s
33
+ word
34
+ end
35
+
36
+ def inspect
37
+ "<%s: '%s'>" % [category, to_s]
38
+ end
39
+ end
40
+ end
data/lib/chawan.rb ADDED
@@ -0,0 +1,12 @@
1
+
2
+ module Chawan
3
+ class AnalyzerNotFound < StandardError; end
4
+ class AnalyzerNotSetup < StandardError; end
5
+ class CannotAnalyze < StandardError; end
6
+ end
7
+
8
+
9
+ require File.dirname(__FILE__) + '/chawan/node'
10
+ require File.dirname(__FILE__) + '/chawan/commands'
11
+ require File.dirname(__FILE__) + '/chawan/analyzers'
12
+
@@ -0,0 +1,28 @@
1
+
2
+ require File.join(File.dirname(__FILE__), 'spec_helper.rb')
3
+
4
+ describe Chawan do
5
+ it "should provide .[]" do
6
+ Chawan.should respond_to(:[])
7
+ end
8
+
9
+ describe "[:mecab]" do
10
+ it "should return MecabAnalyzer" do
11
+ Chawan[:mecab].should be_kind_of(Chawan::Analyzers::MecabAnalyzer)
12
+ end
13
+ end
14
+
15
+ describe "[:chasen]" do
16
+ it "should return ChasenAnalyzer" do
17
+ Chawan[:chasen].should be_kind_of(Chawan::Analyzers::ChasenAnalyzer)
18
+ end
19
+ end
20
+
21
+ it "should provide .mecab" do
22
+ Chawan.should respond_to(:mecab)
23
+ end
24
+
25
+ it "should provide .chasen" do
26
+ Chawan.should respond_to(:chasen)
27
+ end
28
+ end
data/spec/api_spec.rb ADDED
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require File.join(File.dirname(__FILE__), 'spec_helper.rb')
4
+
5
+ describe Chawan do
6
+ it "should provide .parse" do
7
+ Chawan.should respond_to(:parse)
8
+ end
9
+
10
+ describe ".parse" do
11
+ it "should delegate to analyzer" do
12
+ pending "cannot mock within rr" do
13
+ mock(Chawan).analyzer
14
+ Chawan.parse('test')
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1 @@
1
+ 本日は晴天なり。
@@ -0,0 +1 @@
1
+ 貴社の記者が汽車で帰社した。
@@ -0,0 +1 @@
1
+ 玉露の一番搾りを一日に百五十杯飲むと、人間は死んでしまうらしいです。
@@ -0,0 +1,27 @@
1
+
2
+ require File.join(File.dirname(__FILE__), 'spec_helper.rb')
3
+
4
+ describe Chawan::Analyzers::MecabAnalyzer do
5
+ before do
6
+ @a = Chawan::Analyzers::MecabAnalyzer.new
7
+ end
8
+
9
+ it "should provide #parse" do
10
+ @a.should respond_to(:parse)
11
+ end
12
+
13
+ describe "#parse" do
14
+ it "should return an Array of Node" do
15
+ text = data("example1.txt")
16
+ @a.parse(text).each do |node|
17
+ node.should be_kind_of(Chawan::Node)
18
+ end
19
+ end
20
+
21
+ it "should work as expected" do
22
+ text = data("example1.txt")
23
+ @a.parse(text).map(&:inspect).join.should ==
24
+ "<名詞: '本日'><助詞: 'は'><名詞: '晴天'><助動詞: 'なり'><記号: '。'>"
25
+ end
26
+ end
27
+ end
data/spec/node_spec.rb ADDED
@@ -0,0 +1,13 @@
1
+
2
+ require File.join(File.dirname(__FILE__), 'spec_helper.rb')
3
+
4
+ describe Chawan::Node do
5
+ subject { Chawan::Node.new(["Ruby","名詞"], ["表記","品詞"]) }
6
+
7
+ its('keys') {should == ["表記","品詞"] }
8
+ its('vals') {should == ["Ruby","名詞"] }
9
+ its('attributes') {should == {"表記"=>"Ruby", "品詞"=>"名詞"}}
10
+ its('word') {should == "Ruby"}
11
+ its('category') {should == "名詞"}
12
+ its('inspect') {should == "<名詞: 'Ruby'>"}
13
+ end
@@ -0,0 +1,10 @@
1
+
2
+ require 'spec'
3
+ require 'rr'
4
+
5
+ require File.join(File.dirname(__FILE__), '/../lib/chawan')
6
+
7
+ def data(key)
8
+ path = File.join(File.dirname(__FILE__) + "/fixtures/#{key}")
9
+ File.read(path){}
10
+ end
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: chawan
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - maiha
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-03 00:00:00 +09:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: A cup for chasen that provides an easy to use for extracting Japanese
17
+ email: maiha@wota.jp
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - README
24
+ - MIT-LICENSE
25
+ files:
26
+ - MIT-LICENSE
27
+ - README
28
+ - Rakefile
29
+ - lib/chawan/analyzers.rb
30
+ - lib/chawan/analyzers/mecab_analyzer.rb
31
+ - lib/chawan/analyzers/chasen_analyzer.rb
32
+ - lib/chawan/analyzers/abstract_analyzer.rb
33
+ - lib/chawan/analyzers/manager.rb
34
+ - lib/chawan/node.rb
35
+ - lib/chawan/commands.rb
36
+ - lib/chawan.rb
37
+ - spec/api_spec.rb
38
+ - spec/mecab_spec.rb
39
+ - spec/analyzer_spec.rb
40
+ - spec/fixtures/example1.txt
41
+ - spec/fixtures/example2.txt
42
+ - spec/fixtures/example3.txt
43
+ - spec/spec_helper.rb
44
+ - spec/node_spec.rb
45
+ has_rdoc: true
46
+ homepage: http://github.com/maiha/chawan
47
+ licenses: []
48
+
49
+ post_install_message:
50
+ rdoc_options: []
51
+
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: "0"
59
+ version:
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: "0"
65
+ version:
66
+ requirements: []
67
+
68
+ rubyforge_project: asakusarb
69
+ rubygems_version: 1.3.5
70
+ signing_key:
71
+ specification_version: 3
72
+ summary: A cup for chasen that provides an easy to use for extracting Japanese
73
+ test_files: []
74
+