tfidf_ja 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
# Fetch gems over TLS; the plain-http endpoint can be tampered with in transit.
source "https://rubygems.org"

# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "rspec", "~> 2.1.0"
  gem "bundler", "~> 1.0.0"
  gem "jeweler", "~> 1.5.1"
  gem "rcov", ">= 0"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,28 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ diff-lcs (1.1.2)
5
+ git (1.2.5)
6
+ jeweler (1.5.1)
7
+ bundler (~> 1.0.0)
8
+ git (>= 1.2.5)
9
+ rake
10
+ rake (0.8.7)
11
+ rcov (0.9.9)
12
+ rspec (2.1.0)
13
+ rspec-core (~> 2.1.0)
14
+ rspec-expectations (~> 2.1.0)
15
+ rspec-mocks (~> 2.1.0)
16
+ rspec-core (2.1.0)
17
+ rspec-expectations (2.1.0)
18
+ diff-lcs (~> 1.1.2)
19
+ rspec-mocks (2.1.0)
20
+
21
+ PLATFORMS
22
+ ruby
23
+
24
+ DEPENDENCIES
25
+ bundler (~> 1.0.0)
26
+ jeweler (~> 1.5.1)
27
+ rcov
28
+ rspec (~> 2.1.0)
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 K.Nishi
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,19 @@
1
+ = tfidf_ja
2
+
3
+ tfidf_ja computes TF-IDF scores using a bundled Japanese IDF dictionary.
4
+
5
+ == Contributing to tfidf_ja
6
+
7
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
8
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
9
+ * Fork the project
10
+ * Start a feature/bugfix branch
11
+ * Commit and push until you are happy with your contribution
12
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
13
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or if it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2011 K.Nishi. See LICENSE.txt for
18
+ further details.
19
+
data/Rakefile ADDED
@@ -0,0 +1,65 @@
1
require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "tfidf_ja"
  gem.homepage = "http://github.com/kyow/tfidf_ja"
  gem.license = "MIT"
  gem.summary = %Q{Computes TF-IDF with Japanese dictionary.}
  gem.description = %Q{
    tfidf_ja computes TF-IDF with a dictionary.
    This gem include a Japanese IDF dictionary that were prepared in Yahoo! API.
  }
  gem.email = "24signals@gmail.com"
  gem.authors = ["K.Nishi"]
  # Include your dependencies below. Runtime dependencies are required when using your gem,
  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
  # gem.add_runtime_dependency 'jabber4r', '> 0.1'
  # gem.add_development_dependency 'rspec', '> 1.2.3'

  # Runtime gem dependencies.
  gem.add_runtime_dependency 'igo-ruby', '> 0.1'

  # Files packaged into the gem: library code, top-level capitalised files
  # (README, LICENSE, ...) and the prebuilt IDF dictionary.
  gem.files = Rake::FileList.new('lib/**/*.rb', '[A-Z]*', 'dic/idf.dic')

  # Minimum RubyGems version.
  gem.required_rubygems_version = ">1.3.6"

  # RDoc encoding. The charset flag and its value are separate argv entries;
  # a single '-c UTF-8' token would hand RDoc the value " UTF-8" with a
  # leading space.
  gem.rdoc_options << '--charset' << 'UTF-8' << '-S' << '-U'
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = FileList['spec/**/*_spec.rb']
end

RSpec::Core::RakeTask.new(:rcov) do |spec|
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip so a trailing newline in VERSION does not leak into the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "tfidf_ja #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/dic/idf.dic ADDED
Binary file
data/lib/dictionary.rb ADDED
@@ -0,0 +1,39 @@
1
+ # coding: utf-8
2
+ #
3
+ #Copyright:: Copyright (c) kyow, 2010
4
+ #Authors:: K.Nishi
5
+
6
+ #
7
+ #
8
+ #
9
module TfIdf
  # Thin key/value store wrapping a Hash kept in @map.
  #
  # Instances are serialised with Marshal elsewhere in this gem, so the
  # instance variable names are part of the on-disk format.
  class Dictionary
    def initialize
      @map = {}
    end

    # Associates +v+ with +k+, replacing any earlier value. Returns +v+.
    def set(k, v)
      @map.store(k, v)
    end

    # Returns the value stored under +k+, or nil when there is none.
    def get(k)
      @map[k]
    end

    # Returns the underlying Hash itself (not a copy).
    def all
      @map
    end

    # True when +k+ has an entry.
    def exists?(k)
      @map.include?(k)
    end
  end

  # Dictionary of document frequencies.
  class DFs < Dictionary; end

  # Dictionary of IDF values plus summary figures about the table.
  class IDFs < Dictionary
    attr_accessor :size, :average
  end
end
data/lib/tfidf_ja.rb ADDED
@@ -0,0 +1,71 @@
1
+ # coding: utf-8
2
+ #
3
+ #Copyright:: Copyright (c) kyow, 2011
4
+ #Authors:: K.Nishi
5
+
6
+ $:.unshift(File.dirname(__FILE__))
7
+
8
+ require 'dictionary'
9
+
10
+ #
11
+ #
12
+ #
13
module TfIdf
  # TF-IDF calculator backed by the IDF dictionary stored in dic/idf.dic.
  #
  # Term counts accumulate across successive #tfidf calls until #reset is
  # invoked, so scores reflect every word list seen since the last reset.
  class Ja
    # Constructor: loads the IDF dictionary and starts with empty term counts.
    def initialize
      @idfs = load_dic
      reset
    end

    # Discards all term counts accumulated so far.
    def reset
      @tfs = {}
    end

    # Adds +words+ to the running term counts and scores every counted term.
    #words:: array of morphemes
    #return:: Hash whose keys are terms and whose values are count * IDF
    def tfidf(words)
      set_tf_map(words)
      tfidfs = {}
      @tfs.each_pair { |word, tf| tfidfs[word] = tf * idf(word) }
      tfidfs
    end

    # Looks up the IDF of +word+; words missing from the dictionary get the
    # dictionary's average value instead.
    def idf(word)
      value = @idfs.get(word)
      value.nil? ? @idfs.average : value
    end

    private

    # Reads the IDF dictionary file.
    #
    # The file is Marshal data written in binary mode, so it must be read in
    # binary mode as well; text mode can corrupt it on platforms that
    # translate line endings.
    # NOTE(review): Marshal.load is only safe because this file ships with the
    # gem; never point it at untrusted data.
    def load_dic
      idf_dic = File.join(File.dirname(__FILE__), '..', 'dic', 'idf.dic')
      File.open(idf_dic, 'rb') { |f| Marshal.load(f) }
    end

    # Updates the TF counts kept in @tfs, adding one per occurrence.
    #words:: array of morphemes
    #return:: the +words+ array that was passed in
    def set_tf_map(words)
      words.each { |word| @tfs[word] = @tfs.fetch(word, 0) + 1 }
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # coding: utf-8
2
+ #
3
+ #Copyright:: Copyright (c) kyow, 2011
4
+ #Authors:: K.Nishi
5
+
6
+ $:.unshift(File.dirname(__FILE__))
7
+
8
+ require 'rubygems'
9
+ require 'igo-ruby'
10
+
11
+ #
12
+ #
13
+ #
14
module TfIdf
  # Runs text through an igo-ruby tagger and scores the nouns it finds with a
  # TfIdf::Ja calculator.
  class IgoExtension
    # Constructor.
    #dir:: dictionary directory handed to Igo::Tagger.new as given
    #      (it is not expanded to an absolute path here)
    def initialize(dir)
      @tagger = Igo::Tagger.new(dir)
      @tfidf = TfIdf::Ja.new
    end

    # Delegates to the wrapped calculator's reset.
    def reset
      @tfidf.reset
    end

    # Performs morphological analysis with igo-ruby and returns the TF-IDF of
    # the result.
    #text:: text to analyse
    def tfidf(text)
      nouns = []
      @tagger.parse(text).each do |morpheme|
        # Keep only morphemes whose part of speech (the first comma-separated
        # feature field) is "noun".
        nouns << morpheme.surface if morpheme.feature.split(',').first == '名詞'
      end
      @tfidf.tfidf(nouns)
    end
  end
end
data/lib/utility.rb ADDED
@@ -0,0 +1,63 @@
1
+ # coding: utf-8
2
+ #
3
+ #Copyright:: Copyright (c) kyow, 2011
4
+ #Authors:: K.Nishi
5
+
6
+ $:.unshift(File.dirname(__FILE__))
7
+ require 'dictionary'
8
+
9
+ #
10
+ #
11
+ #
12
module TfIdf
  #
  # Utility that builds the IDF dictionary (dic/idf.dic) from the document
  # frequency dictionary (dic/df.dic).
  #
  class Utility
    # Numerator of the IDF ratio.
    # NOTE(review): presumably the total number of documents in the corpus the
    # DF values were taken from -- confirm.
    N = 20000000000

    # Generates idf.dic from df.dic and prints a short summary to stdout.
    def self.create_dic
      dfs = df_load
      idfs = create_idf(dfs)
      idf_save(idfs)
      puts "complete."
      puts "size=#{idfs.size} average=#{idfs.average}"
    end

    # Converts a document frequency into an IDF value.
    #df_value:: document frequency of a term
    #return:: log(N / df_value), or 0 when df_value is not positive
    def self.get_inverse(df_value)
      # Divide as floats: integer division would truncate the ratio before the
      # logarithm and collapse to log(0) once df_value exceeds N.
      df_value > 0 ? Math.log(N.to_f / df_value) : 0
    end

    # The helpers below are internal, but remain publicly callable: a bare
    # `private` never applied to `def self.` methods, and hiding them now
    # could break existing callers.

    # Builds an IDFs table from a DFs table.
    #dfs:: object whose #all returns a Hash of term => document frequency
    #return:: IDFs with one entry per term, plus size and average filled in
    def self.create_idf(dfs)
      idfs = IDFs.new
      total = 0.0
      dfs.all.each_pair do |term, df|
        value = get_inverse(df)
        idfs.set(term, value)
        total += value
      end
      idfs.size = dfs.all.size
      # An empty table has no meaningful average; avoid dividing by zero.
      idfs.average = idfs.size.zero? ? 0.0 : total / idfs.size
      idfs
    end

    # Loads the marshalled DF dictionary from dic/df.dic.
    # Raises RuntimeError when the file is missing.
    def self.df_load
      df_dic = File.dirname(__FILE__) + '/../dic/df.dic'
      raise "DF dictionary not found: #{df_dic}" unless File.exist?(df_dic)

      # Marshal data is binary, so read it in binary mode.
      File.open(df_dic, 'rb') { |f| Marshal.load(f) }
    end

    # Writes +idfs+ as Marshal data to dic/idf.dic, replacing any existing file.
    def self.idf_save(idfs)
      idf_dic = File.dirname(__FILE__) + '/../dic/idf.dic'
      File.open(idf_dic, 'wb') { |f| Marshal.dump(idfs, f) }
    end
  end
end
@@ -0,0 +1,12 @@
1
# Make both the gem's lib directory and this spec directory requirable.
spec_root = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(spec_root, '..', 'lib'))
$LOAD_PATH.unshift(spec_root)

require 'rspec'
require 'tfidf_ja'

# Load every supporting file (custom matchers, macros, ...) found under
# ./support/ and its subdirectories.
Dir.glob("#{spec_root}/support/**/*.rb").each { |support_file| require support_file }

# No project-specific RSpec configuration yet.
RSpec.configure do |config|
end
@@ -0,0 +1,7 @@
1
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

# Replaces the generated placeholder example, which failed unconditionally.
# Expectations are written against the calculator's own #idf so they do not
# depend on the contents of the bundled dictionary.
describe TfIdf::Ja do
  before(:each) do
    @ja = TfIdf::Ja.new
  end

  it "scores each term as its count times its IDF" do
    scores = @ja.tfidf(%w[foo bar foo])
    scores['foo'].should == 2 * @ja.idf('foo')
    scores['bar'].should == 1 * @ja.idf('bar')
  end

  it "accumulates counts across successive calls" do
    @ja.tfidf(%w[foo])
    @ja.tfidf(%w[foo])['foo'].should == 2 * @ja.idf('foo')
  end

  it "forgets accumulated counts after reset" do
    @ja.tfidf(%w[foo foo])
    @ja.reset
    @ja.tfidf(%w[foo])['foo'].should == 1 * @ja.idf('foo')
  end
end
@@ -0,0 +1,9 @@
1
+ # coding: utf-8
2
+ #
3
+ #Copyright:: Copyright (c) kyow, 2011
4
+ #Authors:: K.Nishi
5
+
6
# Regenerates dic/idf.dic from dic/df.dic.
#
# The library is located relative to this script rather than the current
# working directory, so the script can be launched from anywhere.
require File.expand_path('../lib/utility', File.dirname(__FILE__))

TfIdf::Utility.create_dic
9
+
data/test/test.rb ADDED
@@ -0,0 +1,34 @@
1
+ # coding: utf-8
2
+ #
3
+ #Copyright:: Copyright (c) kyow, 2011
4
+ #Authors:: K.Nishi
5
+
6
+ #require '../lib/tfidf_ja'
7
+ #require '../lib/tfidf_ja/igo_extension'
8
require 'rubygems'
require 'tfidf_ja'
require 'tfidf_ja/igo_extension'

# Prints one "key => value" line per entry of +h+.
def output(h)
  h.each { |key, value| puts "#{key} => #{value}" }
end

# Plain calculator: the second call adds to the counts from the first one.
tfidf = TfIdf::Ja.new
output tfidf.tfidf(['この', '文章', 'から', 'TFIDF', '値', 'を', '取得', 'する'])
output tfidf.tfidf(['文章', 'を', '連続', 'で', '渡す', 'と', 'TFIDF', '値', 'を', '追加', 'で', '算出', 'する'])


# Everything below requires igo-ruby.
dir = '../../ipadic'
tie = TfIdf::IgoExtension.new(dir)

s = "この文字列を形態素解析してTFIDF値を取得する"
puts s
output tie.tfidf(s)

# Reusing the instance takes the previous analysis into account.
s = "連続してインスタンスを使うと前回解析したTFIDF計算結果を加味する"
puts s
output tie.tfidf(s)

# After reset the counts start from scratch again.
tie.reset
s = "TfIdf::IgoExtension.resetでリセットする"
puts s
output tie.tfidf(s)
metadata ADDED
@@ -0,0 +1,165 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tfidf_ja
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - K.Nishi
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-01-08 00:00:00 +09:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ prerelease: false
23
+ version_requirements: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ~>
27
+ - !ruby/object:Gem::Version
28
+ hash: 11
29
+ segments:
30
+ - 2
31
+ - 1
32
+ - 0
33
+ version: 2.1.0
34
+ name: rspec
35
+ requirement: *id001
36
+ type: :development
37
+ - !ruby/object:Gem::Dependency
38
+ prerelease: false
39
+ version_requirements: &id002 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ~>
43
+ - !ruby/object:Gem::Version
44
+ hash: 23
45
+ segments:
46
+ - 1
47
+ - 0
48
+ - 0
49
+ version: 1.0.0
50
+ name: bundler
51
+ requirement: *id002
52
+ type: :development
53
+ - !ruby/object:Gem::Dependency
54
+ prerelease: false
55
+ version_requirements: &id003 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ~>
59
+ - !ruby/object:Gem::Version
60
+ hash: 1
61
+ segments:
62
+ - 1
63
+ - 5
64
+ - 1
65
+ version: 1.5.1
66
+ name: jeweler
67
+ requirement: *id003
68
+ type: :development
69
+ - !ruby/object:Gem::Dependency
70
+ prerelease: false
71
+ version_requirements: &id004 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ hash: 3
77
+ segments:
78
+ - 0
79
+ version: "0"
80
+ name: rcov
81
+ requirement: *id004
82
+ type: :development
83
+ - !ruby/object:Gem::Dependency
84
+ prerelease: false
85
+ version_requirements: &id005 !ruby/object:Gem::Requirement
86
+ none: false
87
+ requirements:
88
+ - - ">"
89
+ - !ruby/object:Gem::Version
90
+ hash: 9
91
+ segments:
92
+ - 0
93
+ - 1
94
+ version: "0.1"
95
+ name: igo-ruby
96
+ requirement: *id005
97
+ type: :runtime
98
+ description: "\n tfidf_ja computes TF-IDF with a dictionary.\n This gem include a Japanese IDF dictionary that were prepared in Yahoo! API.\n "
99
+ email: 24signals@gmail.com
100
+ executables: []
101
+
102
+ extensions: []
103
+
104
+ extra_rdoc_files:
105
+ - LICENSE.txt
106
+ - README.rdoc
107
+ files:
108
+ - Gemfile
109
+ - Gemfile.lock
110
+ - LICENSE.txt
111
+ - README.rdoc
112
+ - Rakefile
113
+ - VERSION
114
+ - dic/idf.dic
115
+ - lib/dictionary.rb
116
+ - lib/tfidf_ja.rb
117
+ - lib/tfidf_ja/igo_extension.rb
118
+ - lib/utility.rb
119
+ - spec/spec_helper.rb
120
+ - spec/tfidf_ja_spec.rb
121
+ - test/create_idf_dic.rb
122
+ - test/test.rb
123
+ has_rdoc: true
124
+ homepage: http://github.com/kyow/tfidf_ja
125
+ licenses:
126
+ - MIT
127
+ post_install_message:
128
+ rdoc_options:
129
+ - -c UTF-8
130
+ - -S
131
+ - -U
132
+ require_paths:
133
+ - lib
134
+ required_ruby_version: !ruby/object:Gem::Requirement
135
+ none: false
136
+ requirements:
137
+ - - ">="
138
+ - !ruby/object:Gem::Version
139
+ hash: 3
140
+ segments:
141
+ - 0
142
+ version: "0"
143
+ required_rubygems_version: !ruby/object:Gem::Requirement
144
+ none: false
145
+ requirements:
146
+ - - ">"
147
+ - !ruby/object:Gem::Version
148
+ hash: 23
149
+ segments:
150
+ - 1
151
+ - 3
152
+ - 6
153
+ version: 1.3.6
154
+ requirements: []
155
+
156
+ rubyforge_project:
157
+ rubygems_version: 1.3.7
158
+ signing_key:
159
+ specification_version: 3
160
+ summary: Computes TF-IDF with Japanese dictionary.
161
+ test_files:
162
+ - spec/spec_helper.rb
163
+ - spec/tfidf_ja_spec.rb
164
+ - test/create_idf_dic.rb
165
+ - test/test.rb