bm25 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 0ef7fac79a6db09014f07c985635114c9daf1d93
4
- data.tar.gz: 3e027d22087aeec8eca9aae97216d42fa0516ec1
2
+ SHA256:
3
+ metadata.gz: 1a99f7e2ba2f3c27e3683119915f8d0f8bfe2d876ac872d1b9798727f31deefa
4
+ data.tar.gz: c5c3f13b6d5cd86baa620d16f0594ff4257562d259432c1b3695b9c3ba72f4d7
5
5
  SHA512:
6
- metadata.gz: 112549c9a347cfdd8e6fedd50bbf30799d62a66ade8835b527797596eb80f0c2a1b4e4204e47d11374c1cde0b6d0fd5f915abf0600ee718be391af3634332471
7
- data.tar.gz: 06370fa88c366ade0265f361b71963fc67b16cf474820323effbb77cc6b17993dd1c514c5c06aab1cdba8616a472c48b5ba893362b35c43d17394ea293537d45
6
+ metadata.gz: 74aecf440fe5ad44c6e01d4f59b6dd2c082558a466431928c91d9051a2c63d7eb14eb9c5b7bc88be502b38c1dc344a970e1dd329354956e29d0e6f3598b8521f
7
+ data.tar.gz: e7e10625e4ff8d6f1d237712a8c5381b75f1c8a9fb70fc475b55d17783263455722d5b438080681c7dbf317b4960e6bd89e8e8fe88922b57455ae24ffd91abb6
data/.gitignore CHANGED
@@ -9,5 +9,5 @@
9
9
 
10
10
  # rspec failure tracking
11
11
  .rspec_status
12
-
12
+ *.gem
13
13
  test.rb
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- bm25 (0.1.2)
4
+ bm25 (0.1.3)
5
5
  natto
6
6
 
7
7
  GEM
data/README.md CHANGED
@@ -2,10 +2,8 @@
2
2
 
3
3
  Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/bm25`. To experiment with that code, run `bin/console` for an interactive prompt.
4
4
 
5
- TODO: Delete this and the text above, and describe your gem
6
-
7
5
  ## Installation
8
-
6
+ Step 1:
9
7
  Add this line to your application's Gemfile:
10
8
 
11
9
  ```ruby
@@ -20,9 +18,27 @@ Or install it yourself as:
20
18
 
21
19
  $ gem install bm25
22
20
 
21
+ Step 2: Install MeCab and set up mecab-ipadic-neologd
22
+
23
+ mecab: http://taku910.github.io/mecab/
24
+
25
+ mecab-ipadic-neologd: https://github.com/neologd/mecab-ipadic-neologd
26
+
23
27
  ## Usage
28
+ ```
29
+ require 'bm25'
30
+
31
+ parser = Bm25::Parser.new(['名詞'])
32
+ words = parser.execute("プログラマーだけど肩こりがひどいので懸垂バーを買って背中を鍛えることにした")
33
+ # words = [
34
+ # ['プログラマー', 1.7142857142857146],
35
+ # ['肩こり', 1.7142857142857146],
36
+ # ['懸垂', 1.7142857142857146],
37
+ # ['バー', 1.7142857142857146],
38
+ # ['背中', 1.7142857142857146]
39
+ # ]
40
+ ```
24
41
 
25
- TODO: Write usage instructions here
26
42
 
27
43
  ## Development
28
44
 
@@ -26,7 +26,6 @@ Gem::Specification.new do |spec|
26
26
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
27
27
  f.match(%r{^(test|spec|features)/})
28
28
  end
29
- spec.bindir = "exe"
30
29
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
31
30
  spec.require_paths = ["lib"]
32
31
 
@@ -1,6 +1,5 @@
1
- require "bm25/version"
2
1
  require 'bm25/parser'
3
-
2
+ require 'bm25/utils'
4
3
  module Bm25
5
4
 
6
5
  end
@@ -1,140 +1,145 @@
1
+ require_relative 'utils'
1
2
  require 'natto'
3
+ require 'pp'
2
4
 
3
5
  module Bm25
4
6
  class Parser
7
+ def initialize(scopes = [])
8
+ @base_document = ''
9
+ @docs = []
10
+ @idf_map = {}
11
+ @all_word_length = 0
12
+ @scopes = scopes.join('|')
13
+ end
5
14
 
6
- def initialize(scopes = [])
7
- @base_document = ''
8
- @docs = []
9
- @idf_map = {}
10
- @all_word_length = 0
11
- @scopes = scopes.join('|')
12
- end
13
-
14
- def create_data
15
- self.create_docs
16
- self.create_idf_map
17
- dataset = self.get_dataset
18
- return dataset
19
- end
15
+ def create_data
16
+ self.create_docs
17
+ self.create_idf_map
18
+ dataset = self.get_dataset
19
+ return dataset
20
+ end
20
21
 
21
- def execute(document)
22
- @allword_length = 0
23
- @idf_map = {}
24
- @docs = []
22
+ def execute(document)
23
+ @allword_length = 0
24
+ @idf_map = {}
25
+ @docs = []
25
26
 
26
- @base_document = document
27
- @all_word_length = self.separate_words(document).length
27
+ @base_document = document
28
+ @all_word_length = self.separate_words(document).length
28
29
 
29
- data = self.create_data
30
- data = self.get_important_keyword(data)
31
- return data
32
- end
30
+ data = self.create_data
31
+ data = self.get_important_keyword(data)
32
+ return data
33
+ end
33
34
 
34
- def create_docs
35
- nm = Natto::MeCab.new
36
- doc_list = self.separate_document(@base_document)
35
+ def create_docs
36
+ nm = Natto::MeCab.new
37
+ doc_list = self.separate_document(@base_document)
37
38
 
38
- doc_list.each do |d|
39
- total_words = separate_words(d)
40
- word_map = {}
41
- total_words.each do |w|
42
- count = 0
43
- #単語数
44
- count = d.scan(/#{Regexp.escape(w)}/).length
45
- if word_map[w].nil?
46
- word_map[w] = {
47
- count: count,
48
- tf: count.to_f / total_words.length
49
- }
50
- end
39
+ doc_list.each do |d|
40
+ total_words = separate_words(d)
41
+ word_map = {}
42
+ total_words.each do |w|
43
+ count = 0
44
+ #単語数
45
+ count = d.scan(/#{Regexp.escape(w)}/).length
46
+ if word_map[w].nil?
47
+ word_map[w] = {
48
+ count: count,
49
+ tf: count.to_f / total_words.length
50
+ }
51
51
  end
52
- avarage_word_length = @all_word_length / doc_list.length
53
- # NDL(j) = 文書Djの総単語数 / (すべての文書の平均DL)
54
- @docs.push({
55
- document: d,
56
- words: word_map,
57
- words_length: total_words.length,
58
- dl: total_words.length / avarage_word_length.to_f
59
- })
60
52
  end
53
+ avarage_word_length = @all_word_length / doc_list.length
54
+ # NDL(j) = 文書Djの総単語数 / (すべての文書の平均DL)
55
+ @docs.push({
56
+ document: d,
57
+ words: word_map,
58
+ words_length: total_words.length,
59
+ dl: total_words.length / avarage_word_length.to_f
60
+ })
61
61
  end
62
+ end
62
63
 
63
- def create_idf_map
64
- words = []
65
- @docs.each do |d|
66
- d[:words].each_pair do |k, v|
67
- words.push(k)
68
- end
69
- end
70
- words = words.uniq
71
- words.each do |word|
72
- f = 0
73
- @docs.each{|d| f = f + 1 if d[:words][word]}
74
- idf = f === 0 ? 0 : @docs.length / f
75
- @idf_map[word] = {
76
- df: f,
77
- idf: Math.log(idf) + 1
78
- }
64
+ def create_idf_map
65
+ words = []
66
+ @docs.each do |d|
67
+ d[:words].each_pair do |k, v|
68
+ words.push(k)
79
69
  end
80
70
  end
71
+ words = words.uniq
72
+ words.each do |word|
73
+ f = 0
74
+ @docs.each{|d| f = f + 1 if d[:words][word]}
75
+ idf = f === 0 ? 0 : @docs.length / f
76
+ @idf_map[word] = {
77
+ df: f,
78
+ idf: Math.log(idf) + 1
79
+ }
80
+ end
81
+ end
81
82
 
82
- def get_dataset
83
- data = []
84
- @docs.each do |d|
85
- new_words = []
86
- k1 = 1.2
87
- b = 0.75
88
- d[:words].each_pair do |k, v|
89
- # [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + (b * NDL(j)) + TF(i,j) ]
90
- new_words.push({
91
- word: k,
92
- tf: v[:tf],
93
- idf: @idf_map[k][:idf],
94
- val: @idf_map[k][:idf] * v[:tf],
95
- bm25: (v[:tf] * @idf_map[k][:idf] + (k1 + 1)) / (k1 * (1 - b + (b * d[:dl])) + v[:tf])
96
- })
97
- end
98
- data.push({
99
- document: d[:document],
100
- words: new_words.sort_by{|w| -w[:bm25]}
83
+ def get_dataset
84
+ data = []
85
+ @docs.each do |d|
86
+ new_words = []
87
+ k1 = 1.2
88
+ b = 0.75
89
+ d[:words].each_pair do |k, v|
90
+ # [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + (b * NDL(j)) + TF(i,j) ]
91
+ new_words.push({
92
+ word: k,
93
+ tf: v[:tf],
94
+ idf: @idf_map[k][:idf],
95
+ val: @idf_map[k][:idf] * v[:tf],
96
+ bm25: (v[:tf] * @idf_map[k][:idf] + (k1 + 1)) / (k1 * (1 - b + (b * d[:dl])) + v[:tf])
101
97
  })
102
98
  end
103
- return data
99
+ data.push({
100
+ document: d[:document],
101
+ words: new_words.sort_by{|w| -w[:bm25]}
102
+ })
104
103
  end
104
+ return data
105
+ end
105
106
 
106
- def get_important_keyword(dataset)
107
- word_map = {}
108
- dataset.each do |data|
109
- data[:words].each do |val|
110
- k = val[:word]
111
- bm25 = val[:bm25]
112
- if word_map[k]
113
- word_map[k] = word_map[k] + bm25
114
- else
115
- word_map[k] = bm25
116
- end
107
+ def get_important_keyword(dataset)
108
+ word_map = {}
109
+ dataset.each do |data|
110
+ data[:words].each do |val|
111
+ k = val[:word]
112
+ bm25 = val[:bm25]
113
+ if word_map[k]
114
+ word_map[k] = word_map[k] + bm25
115
+ else
116
+ word_map[k] = bm25
117
117
  end
118
118
  end
119
- return word_map.sort {|(k1, v1), (k2, v2)| v2 <=> v1 }
120
119
  end
120
+ return word_map.sort {|(k1, v1), (k2, v2)| v2 <=> v1 }
121
+ end
121
122
 
122
- def separate_words(document)
123
- nm = Natto::MeCab.new
124
- data = []
125
- nm.parse(document) do |n|
126
- if (n.is_bos? || n.is_eos?) || n.feature.scan(/#{@scopes}/).length === 0 || n.surface.match(/[\/\d]/)
127
- next
128
- end
129
- data.push(n.surface)
123
+ def separate_words(document)
124
+ nm = Natto::MeCab.new
125
+ data = []
126
+ nm.parse(document) do |n|
127
+ if (n.is_bos? || n.is_eos?) ||
128
+ n.feature.scan(/#{@scopes}/).length === 0 ||
129
+ n.surface.match(/[\/\d]/) ||
130
+ Bm25::Utils.is_stopword?(n.surface) ||
131
+ Bm25::Utils.is_onechar?(n.surface)
132
+ next
130
133
  end
131
- return data
134
+ data.push(n.surface)
132
135
  end
136
+ return data
137
+ end
133
138
 
134
- def separate_document(document)
135
- docs = document.scan(/[^。^\.^\,\r\n|\n|\r]+/)
136
- return docs
137
- end
139
+ def separate_document(document)
140
+ docs = document.scan(/[^。^\.^\,\r\n|\n|\r]+/)
141
+ return docs
142
+ end
138
143
 
139
144
  end
140
145
  end
@@ -0,0 +1,328 @@
1
+ あそこ
2
+ あたり
3
+ あちら
4
+ あっち
5
+ あと
6
+ あな
7
+ あなた
8
+ あれ
9
+ いくつ
10
+ いつ
11
+ いま
12
+ いや
13
+ いろいろ
14
+ うち
15
+ おおまか
16
+ おまえ
17
+ おれ
18
+ がい
19
+ かく
20
+ かたち
21
+ かやの
22
+ から
23
+ がら
24
+ きた
25
+ くせ
26
+ ここ
27
+ こっち
28
+ こと
29
+ ごと
30
+ こちら
31
+ ごっちゃ
32
+ これ
33
+ これら
34
+ ごろ
35
+ さまざま
36
+ さらい
37
+ さん
38
+ しかた
39
+ しよう
40
+ すか
41
+ ずつ
42
+ すね
43
+ すべて
44
+ ぜんぶ
45
+ そう
46
+ そこ
47
+ そちら
48
+ そっち
49
+ そで
50
+ それ
51
+ それぞれ
52
+ それなり
53
+ たくさん
54
+ たち
55
+ たび
56
+ ため
57
+ だめ
58
+ ちゃ
59
+ ちゃん
60
+ てん
61
+ とおり
62
+ とき
63
+ どこ
64
+ どこか
65
+ ところ
66
+ どちら
67
+ どっか
68
+ どっち
69
+ どれ
70
+ なか
71
+ なかば
72
+ なに
73
+ など
74
+ なん
75
+ はじめ
76
+ はず
77
+ はるか
78
+ ひと
79
+ ひとつ
80
+ ふく
81
+ ぶり
82
+ べつ
83
+ へん
84
+ ぺん
85
+ ほう
86
+ ほか
87
+ まさ
88
+ まし
89
+ まとも
90
+ まま
91
+ みたい
92
+ みつ
93
+ みなさん
94
+ みんな
95
+ もと
96
+ もの
97
+ もん
98
+ やつ
99
+ よう
100
+ よそ
101
+ わけ
102
+ わたし
103
+
104
+ ハイ
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+
167
+
168
+
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+
191
+
192
+
193
+
194
+
195
+
196
+
197
+ 彼女
198
+
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
+
207
+
208
+
209
+
210
+
211
+
212
+
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
+
222
+
223
+
224
+
225
+
226
+
227
+
228
+
229
+
230
+
231
+
232
+ 簿
233
+
234
+
235
+
236
+
237
+
238
+
239
+
240
+
241
+
242
+
243
+
244
+
245
+
246
+
247
+
248
+
249
+
250
+
251
+
252
+
253
+
254
+
255
+
256
+
257
+
258
+
259
+
260
+
261
+
262
+
263
+
264
+
265
+
266
+
267
+
268
+
269
+
270
+
271
+
272
+
273
+
274
+
275
+
276
+
277
+
278
+
279
+
280
+
281
+ 下記
282
+ 上記
283
+ 時間
284
+ 今回
285
+ 前回
286
+ 場合
287
+ 一つ
288
+ 年生
289
+ 自分
290
+ ヶ所
291
+ ヵ所
292
+ カ所
293
+ 箇所
294
+ ヶ月
295
+ ヵ月
296
+ カ月
297
+ 箇月
298
+ 名前
299
+ 本当
300
+ 確か
301
+ 時点
302
+ 全部
303
+ 関係
304
+ 近く
305
+ 方法
306
+ 我々
307
+ 違い
308
+ 多く
309
+ 扱い
310
+ 新た
311
+ その後
312
+ 半ば
313
+ 結局
314
+ 様々
315
+ 以前
316
+ 以後
317
+ 以降
318
+ 未満
319
+ 以上
320
+ 以下
321
+ 幾つ
322
+ 毎日
323
+ 自体
324
+ 向こう
325
+ 何人
326
+ 手段
327
+ 同じ
328
+ 感じ
@@ -0,0 +1,30 @@
1
+ module Bm25
2
+
3
+ module Utils
4
+
5
+ class << self
6
+
7
+ def is_stopword? (word)
8
+ match = false
9
+
10
+ File.open("lib/bm25/stopword.txt", "r") do |f|
11
+ f.each_line do |t|
12
+ if t.chomp === word
13
+ match = true
14
+ break
15
+ end
16
+ end
17
+ end
18
+ return match
19
+
20
+ end
21
+
22
+ def is_onechar?(word)
23
+ return word.size == 1
24
+ end
25
+
26
+ end
27
+
28
+ end
29
+
30
+ end
@@ -1,3 +1,3 @@
1
1
  module Bm25
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bm25
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Masayuki Komatsu
8
8
  autorequire:
9
- bindir: exe
9
+ bindir: bin
10
10
  cert_chain: []
11
- date: 2018-03-24 00:00:00.000000000 Z
11
+ date: 2018-03-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -87,6 +87,8 @@ files:
87
87
  - bm25.gemspec
88
88
  - lib/bm25.rb
89
89
  - lib/bm25/parser.rb
90
+ - lib/bm25/stopword.txt
91
+ - lib/bm25/utils.rb
90
92
  - lib/bm25/version.rb
91
93
  homepage: https://github.com/Bit-Pumpkin/bm25
92
94
  licenses:
@@ -108,7 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
108
110
  version: '0'
109
111
  requirements: []
110
112
  rubyforge_project:
111
- rubygems_version: 2.5.1
113
+ rubygems_version: 2.7.4
112
114
  signing_key:
113
115
  specification_version: 4
114
116
  summary: Okapi Bm25 for Japanese