bm25 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -1
- data/Gemfile.lock +1 -1
- data/README.md +20 -4
- data/bm25.gemspec +0 -1
- data/lib/bm25.rb +1 -2
- data/lib/bm25/parser.rb +114 -109
- data/lib/bm25/stopword.txt +328 -0
- data/lib/bm25/utils.rb +30 -0
- data/lib/bm25/version.rb +1 -1
- metadata +6 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 1a99f7e2ba2f3c27e3683119915f8d0f8bfe2d876ac872d1b9798727f31deefa
|
4
|
+
data.tar.gz: c5c3f13b6d5cd86baa620d16f0594ff4257562d259432c1b3695b9c3ba72f4d7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 74aecf440fe5ad44c6e01d4f59b6dd2c082558a466431928c91d9051a2c63d7eb14eb9c5b7bc88be502b38c1dc344a970e1dd329354956e29d0e6f3598b8521f
|
7
|
+
data.tar.gz: e7e10625e4ff8d6f1d237712a8c5381b75f1c8a9fb70fc475b55d17783263455722d5b438080681c7dbf317b4960e6bd89e8e8fe88922b57455ae24ffd91abb6
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -2,10 +2,8 @@
|
|
2
2
|
|
3
3
|
Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/bm25`. To experiment with that code, run `bin/console` for an interactive prompt.
|
4
4
|
|
5
|
-
TODO: Delete this and the text above, and describe your gem
|
6
|
-
|
7
5
|
## Installation
|
8
|
-
|
6
|
+
Step 1.
|
9
7
|
Add this line to your application's Gemfile:
|
10
8
|
|
11
9
|
```ruby
|
@@ -20,9 +18,27 @@ Or install it yourself as:
|
|
20
18
|
|
21
19
|
$ gem install bm25
|
22
20
|
|
21
|
+
Step 2. Install MeCab and set up mecab-ipadic-neologd
|
22
|
+
|
23
|
+
mecab: http://taku910.github.io/mecab/
|
24
|
+
|
25
|
+
mecab-ipadic-neologd: https://github.com/neologd/mecab-ipadic-neologd
|
26
|
+
|
23
27
|
## Usage
|
28
|
+
```
|
29
|
+
require 'bm25'
|
30
|
+
|
31
|
+
parser = Bm25::Parser.new(['名詞'])
|
32
|
+
words = parser.execute("プログラマーだけど肩こりがひどいので懸垂バーを買って背中を鍛えることにした")
|
33
|
+
# words = [
|
34
|
+
# ['プログラマー', 1.7142857142857146],
|
35
|
+
# ['肩こり', 1.7142857142857146],
|
36
|
+
# ['懸垂', 1.7142857142857146],
|
37
|
+
# ['バー', 1.7142857142857146],
|
38
|
+
# ['背中', 1.7142857142857146]
|
39
|
+
# ]
|
40
|
+
```
|
24
41
|
|
25
|
-
TODO: Write usage instructions here
|
26
42
|
|
27
43
|
## Development
|
28
44
|
|
data/bm25.gemspec
CHANGED
@@ -26,7 +26,6 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
27
27
|
f.match(%r{^(test|spec|features)/})
|
28
28
|
end
|
29
|
-
spec.bindir = "exe"
|
30
29
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
31
30
|
spec.require_paths = ["lib"]
|
32
31
|
|
data/lib/bm25.rb
CHANGED
data/lib/bm25/parser.rb
CHANGED
@@ -1,140 +1,145 @@
|
|
1
|
+
require_relative 'utils'
|
1
2
|
require 'natto'
|
3
|
+
require 'pp'
|
2
4
|
|
3
5
|
module Bm25
  # Ranks the words of a text with an Okapi-BM25-style score.
  #
  # The text is cut into sentence-like segments; every segment is tokenised
  # with MeCab (through natto) and treated as one "document" of a small corpus.
  # Each word gets a BM25 score per segment, and the per-segment scores are
  # summed to obtain one importance value per word.
  class Parser
    # BM25 term-frequency saturation parameter.
    K1 = 1.2
    # BM25 document-length normalisation strength.
    B = 0.75

    # @param scopes [Array<String>] fragments joined with "|" into a regular
    #   expression; a token is kept only when its MeCab feature string matches
    #   it (the README passes ['名詞']). An empty list keeps every token.
    def initialize(scopes = [])
      @base_document = ''
      @docs = []
      @idf_map = {}
      @all_word_length = 0
      @scopes = scopes.join('|')
    end

    # Builds the per-segment statistics, the IDF table and the scored data set
    # from the current @base_document.
    #
    # @return [Array<Hash>] see #get_dataset
    def create_data
      create_docs
      create_idf_map
      get_dataset
    end

    # Scores every retained word of +document+.
    #
    # @param document [String] text to analyse
    # @return [Array<Array(String, Float)>] [word, summed score] pairs,
    #   highest score first
    def execute(document)
      # Start from a clean slate so the parser can be reused for several texts.
      @idf_map = {}
      @docs = []

      @base_document = document
      @all_word_length = separate_words(document).length

      get_important_keyword(create_data)
    end

    # Tokenises every segment of @base_document and appends one entry per
    # segment to @docs: the segment text, a word => { count:, tf: } map, the
    # number of tokens and the normalised length (:dl).
    def create_docs
      doc_list = separate_document(@base_document)

      # Average tokens per segment. Float division: integer division truncated
      # the average (down to 0 for sparse texts), which made :dl infinite.
      average_length = doc_list.empty? ? 0.0 : @all_word_length.to_f / doc_list.length

      doc_list.each do |d|
        tokens = separate_words(d)

        # Count occurrences per token. Scanning the raw text for the word, as
        # was done before, also counted it inside longer, unrelated words.
        counts = tokens.each_with_object(Hash.new(0)) { |w, acc| acc[w] += 1 }
        word_map = counts.each_with_object({}) do |(w, count), acc|
          acc[w] = { count: count, tf: count.to_f / tokens.length }
        end

        # NDL(j) = total word count of document Dj / average DL over all
        # documents. Falls back to 1.0 ("average length") when no average
        # can be computed.
        @docs.push(
          document: d,
          words: word_map,
          words_length: tokens.length,
          dl: average_length.zero? ? 1.0 : tokens.length / average_length
        )
      end
    end

    # Fills @idf_map with { df:, idf: } for every word found in @docs, where
    # df is the number of segments containing the word and
    # idf = ln(segment count / df) + 1.
    def create_idf_map
      vocabulary = @docs.flat_map { |d| d[:words].keys }.uniq

      vocabulary.each do |word|
        # df >= 1 because the vocabulary is collected from @docs itself.
        df = @docs.count { |d| d[:words].key?(word) }
        @idf_map[word] = {
          df: df,
          # Float division so that e.g. 3 segments / df 2 is 1.5, not 1.
          idf: Math.log(@docs.length.to_f / df) + 1
        }
      end
    end

    # Computes the score of every word in every segment. Requires @docs and
    # @idf_map to be populated (see #create_docs and #create_idf_map).
    #
    # @return [Array<Hash>] one { document:, words: } hash per segment; :words
    #   is an array of { word:, tf:, idf:, val:, bm25: } sorted by :bm25,
    #   highest first
    def get_dataset
      @docs.map do |d|
        scored = d[:words].map do |word, stats|
          tf = stats[:tf]
          idf = @idf_map[word][:idf]
          {
            word: word,
            tf: tf,
            idf: idf,
            val: idf * tf,
            # [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + (b * NDL(j))) + TF(i,j) ]
            # The numerator is a product; it used to add (K1 + 1), which put
            # the same constant into every score and drowned out TF and IDF.
            bm25: (tf * idf * (K1 + 1)) / (K1 * (1 - B + (B * d[:dl])) + tf)
          }
        end

        { document: d[:document], words: scored.sort_by { |w| -w[:bm25] } }
      end
    end

    # Sums the :bm25 score of each word over all segments.
    #
    # @param dataset [Array<Hash>] result of #get_dataset
    # @return [Array<Array(String, Float)>] [word, total] pairs, highest
    #   total first
    def get_important_keyword(dataset)
      totals = {}
      dataset.each do |data|
        data[:words].each do |entry|
          word = entry[:word]
          totals[word] = totals.key?(word) ? totals[word] + entry[:bm25] : entry[:bm25]
        end
      end
      totals.sort { |(_, a), (_, b)| b <=> a }
    end

    # Tokenises +document+ with MeCab and returns the surfaces that pass all
    # filters: not a BOS/EOS node, feature matches the configured scopes, no
    # "/" or digit in the surface, not a stop word, longer than one character.
    #
    # @param document [String]
    # @return [Array<String>] retained surfaces, in order, duplicates included
    def separate_words(document)
      scope_pattern = /#{@scopes}/
      words = []
      mecab.parse(document) do |n|
        next if n.is_bos? || n.is_eos?
        next unless n.feature =~ scope_pattern
        next if n.surface =~ %r{[/\d]}
        next if Bm25::Utils.is_stopword?(n.surface) || Bm25::Utils.is_onechar?(n.surface)

        words.push(n.surface)
      end
      words
    end

    # Splits +document+ into segments at "。", ".", "," and line breaks.
    #
    # @param document [String]
    # @return [Array<String>] non-empty segments without their separators
    def separate_document(document)
      # Inside a character class "^" only negates in first position and "|" is
      # a literal, so the former pattern also split on "^" and "|".
      document.scan(/[^。.,\r\n]+/)
    end

    private

    # One tokenizer per parser instead of a new one for every call.
    def mecab
      @mecab ||= Natto::MeCab.new
    end
  end
end
|
@@ -0,0 +1,328 @@
|
|
1
|
+
あそこ
|
2
|
+
あたり
|
3
|
+
あちら
|
4
|
+
あっち
|
5
|
+
あと
|
6
|
+
あな
|
7
|
+
あなた
|
8
|
+
あれ
|
9
|
+
いくつ
|
10
|
+
いつ
|
11
|
+
いま
|
12
|
+
いや
|
13
|
+
いろいろ
|
14
|
+
うち
|
15
|
+
おおまか
|
16
|
+
おまえ
|
17
|
+
おれ
|
18
|
+
がい
|
19
|
+
かく
|
20
|
+
かたち
|
21
|
+
かやの
|
22
|
+
から
|
23
|
+
がら
|
24
|
+
きた
|
25
|
+
くせ
|
26
|
+
ここ
|
27
|
+
こっち
|
28
|
+
こと
|
29
|
+
ごと
|
30
|
+
こちら
|
31
|
+
ごっちゃ
|
32
|
+
これ
|
33
|
+
これら
|
34
|
+
ごろ
|
35
|
+
さまざま
|
36
|
+
さらい
|
37
|
+
さん
|
38
|
+
しかた
|
39
|
+
しよう
|
40
|
+
すか
|
41
|
+
ずつ
|
42
|
+
すね
|
43
|
+
すべて
|
44
|
+
ぜんぶ
|
45
|
+
そう
|
46
|
+
そこ
|
47
|
+
そちら
|
48
|
+
そっち
|
49
|
+
そで
|
50
|
+
それ
|
51
|
+
それぞれ
|
52
|
+
それなり
|
53
|
+
たくさん
|
54
|
+
たち
|
55
|
+
たび
|
56
|
+
ため
|
57
|
+
だめ
|
58
|
+
ちゃ
|
59
|
+
ちゃん
|
60
|
+
てん
|
61
|
+
とおり
|
62
|
+
とき
|
63
|
+
どこ
|
64
|
+
どこか
|
65
|
+
ところ
|
66
|
+
どちら
|
67
|
+
どっか
|
68
|
+
どっち
|
69
|
+
どれ
|
70
|
+
なか
|
71
|
+
なかば
|
72
|
+
なに
|
73
|
+
など
|
74
|
+
なん
|
75
|
+
はじめ
|
76
|
+
はず
|
77
|
+
はるか
|
78
|
+
ひと
|
79
|
+
ひとつ
|
80
|
+
ふく
|
81
|
+
ぶり
|
82
|
+
べつ
|
83
|
+
へん
|
84
|
+
ぺん
|
85
|
+
ほう
|
86
|
+
ほか
|
87
|
+
まさ
|
88
|
+
まし
|
89
|
+
まとも
|
90
|
+
まま
|
91
|
+
みたい
|
92
|
+
みつ
|
93
|
+
みなさん
|
94
|
+
みんな
|
95
|
+
もと
|
96
|
+
もの
|
97
|
+
もん
|
98
|
+
やつ
|
99
|
+
よう
|
100
|
+
よそ
|
101
|
+
わけ
|
102
|
+
わたし
|
103
|
+
|
104
|
+
ハイ
|
105
|
+
|
106
|
+
|
107
|
+
上
|
108
|
+
中
|
109
|
+
下
|
110
|
+
字
|
111
|
+
|
112
|
+
|
113
|
+
年
|
114
|
+
月
|
115
|
+
日
|
116
|
+
時
|
117
|
+
分
|
118
|
+
秒
|
119
|
+
週
|
120
|
+
火
|
121
|
+
水
|
122
|
+
木
|
123
|
+
金
|
124
|
+
土
|
125
|
+
国
|
126
|
+
都
|
127
|
+
道
|
128
|
+
府
|
129
|
+
県
|
130
|
+
市
|
131
|
+
区
|
132
|
+
町
|
133
|
+
村
|
134
|
+
|
135
|
+
|
136
|
+
各
|
137
|
+
第
|
138
|
+
方
|
139
|
+
何
|
140
|
+
的
|
141
|
+
度
|
142
|
+
文
|
143
|
+
者
|
144
|
+
性
|
145
|
+
体
|
146
|
+
人
|
147
|
+
他
|
148
|
+
今
|
149
|
+
部
|
150
|
+
課
|
151
|
+
係
|
152
|
+
外
|
153
|
+
類
|
154
|
+
達
|
155
|
+
気
|
156
|
+
室
|
157
|
+
口
|
158
|
+
誰
|
159
|
+
用
|
160
|
+
界
|
161
|
+
会
|
162
|
+
首
|
163
|
+
男
|
164
|
+
女
|
165
|
+
別
|
166
|
+
話
|
167
|
+
私
|
168
|
+
屋
|
169
|
+
店
|
170
|
+
家
|
171
|
+
場
|
172
|
+
等
|
173
|
+
見
|
174
|
+
際
|
175
|
+
観
|
176
|
+
段
|
177
|
+
略
|
178
|
+
例
|
179
|
+
系
|
180
|
+
論
|
181
|
+
形
|
182
|
+
間
|
183
|
+
地
|
184
|
+
員
|
185
|
+
線
|
186
|
+
点
|
187
|
+
書
|
188
|
+
品
|
189
|
+
力
|
190
|
+
法
|
191
|
+
感
|
192
|
+
作
|
193
|
+
元
|
194
|
+
手
|
195
|
+
数
|
196
|
+
彼
|
197
|
+
彼女
|
198
|
+
子
|
199
|
+
内
|
200
|
+
楽
|
201
|
+
喜
|
202
|
+
怒
|
203
|
+
哀
|
204
|
+
輪
|
205
|
+
頃
|
206
|
+
化
|
207
|
+
境
|
208
|
+
俺
|
209
|
+
奴
|
210
|
+
高
|
211
|
+
校
|
212
|
+
婦
|
213
|
+
伸
|
214
|
+
紀
|
215
|
+
誌
|
216
|
+
レ
|
217
|
+
行
|
218
|
+
列
|
219
|
+
事
|
220
|
+
士
|
221
|
+
台
|
222
|
+
集
|
223
|
+
様
|
224
|
+
所
|
225
|
+
歴
|
226
|
+
器
|
227
|
+
名
|
228
|
+
情
|
229
|
+
連
|
230
|
+
毎
|
231
|
+
式
|
232
|
+
簿
|
233
|
+
|
234
|
+
|
235
|
+
|
236
|
+
|
237
|
+
回
|
238
|
+
匹
|
239
|
+
個
|
240
|
+
席
|
241
|
+
束
|
242
|
+
歳
|
243
|
+
目
|
244
|
+
通
|
245
|
+
面
|
246
|
+
円
|
247
|
+
玉
|
248
|
+
枚
|
249
|
+
|
250
|
+
前
|
251
|
+
後
|
252
|
+
左
|
253
|
+
右
|
254
|
+
次
|
255
|
+
先
|
256
|
+
|
257
|
+
春
|
258
|
+
夏
|
259
|
+
秋
|
260
|
+
冬
|
261
|
+
|
262
|
+
|
263
|
+
|
264
|
+
一
|
265
|
+
二
|
266
|
+
三
|
267
|
+
四
|
268
|
+
五
|
269
|
+
六
|
270
|
+
七
|
271
|
+
八
|
272
|
+
九
|
273
|
+
十
|
274
|
+
百
|
275
|
+
千
|
276
|
+
万
|
277
|
+
億
|
278
|
+
兆
|
279
|
+
|
280
|
+
|
281
|
+
下記
|
282
|
+
上記
|
283
|
+
時間
|
284
|
+
今回
|
285
|
+
前回
|
286
|
+
場合
|
287
|
+
一つ
|
288
|
+
年生
|
289
|
+
自分
|
290
|
+
ヶ所
|
291
|
+
ヵ所
|
292
|
+
カ所
|
293
|
+
箇所
|
294
|
+
ヶ月
|
295
|
+
ヵ月
|
296
|
+
カ月
|
297
|
+
箇月
|
298
|
+
名前
|
299
|
+
本当
|
300
|
+
確か
|
301
|
+
時点
|
302
|
+
全部
|
303
|
+
関係
|
304
|
+
近く
|
305
|
+
方法
|
306
|
+
我々
|
307
|
+
違い
|
308
|
+
多く
|
309
|
+
扱い
|
310
|
+
新た
|
311
|
+
その後
|
312
|
+
半ば
|
313
|
+
結局
|
314
|
+
様々
|
315
|
+
以前
|
316
|
+
以後
|
317
|
+
以降
|
318
|
+
未満
|
319
|
+
以上
|
320
|
+
以下
|
321
|
+
幾つ
|
322
|
+
毎日
|
323
|
+
自体
|
324
|
+
向こう
|
325
|
+
何人
|
326
|
+
手段
|
327
|
+
同じ
|
328
|
+
感じ
|
data/lib/bm25/utils.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
module Bm25
  # Small stateless predicates used while filtering tokens.
  module Utils
    # Absolute path of the stop-word list, one word per line, stored next to
    # this file. Resolved from __dir__ so the lookup does not depend on the
    # process's working directory (a path relative to the working directory
    # only resolves when the program is started from the gem's source root).
    STOPWORD_PATH = File.expand_path('stopword.txt', __dir__)

    class << self
      # Tells whether +word+ is listed in the stop-word file.
      #
      # @param word [String]
      # @return [Boolean] true when a line of the file, without its line
      #   terminator, equals +word+
      # @raise [SystemCallError] when the stop-word file cannot be read
      def is_stopword?(word)
        stopwords.key?(word)
      end

      # Tells whether +word+ consists of exactly one character.
      #
      # @param word [String]
      # @return [Boolean]
      def is_onechar?(word)
        word.size == 1
      end

      private

      # Stop words as hash keys for constant-time lookup. The file is read
      # once, on first use, instead of once per checked word, and decoded as
      # UTF-8 regardless of the process's default external encoding.
      def stopwords
        @stopwords ||= File.readlines(STOPWORD_PATH, encoding: 'UTF-8')
                           .each_with_object({}) { |line, table| table[line.chomp] = true }
      end
    end
  end
end
|
data/lib/bm25/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bm25
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Masayuki Komatsu
|
8
8
|
autorequire:
|
9
|
-
bindir:
|
9
|
+
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-03-
|
11
|
+
date: 2018-03-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -87,6 +87,8 @@ files:
|
|
87
87
|
- bm25.gemspec
|
88
88
|
- lib/bm25.rb
|
89
89
|
- lib/bm25/parser.rb
|
90
|
+
- lib/bm25/stopword.txt
|
91
|
+
- lib/bm25/utils.rb
|
90
92
|
- lib/bm25/version.rb
|
91
93
|
homepage: https://github.com/Bit-Pumpkin/bm25
|
92
94
|
licenses:
|
@@ -108,7 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
108
110
|
version: '0'
|
109
111
|
requirements: []
|
110
112
|
rubyforge_project:
|
111
|
-
rubygems_version: 2.
|
113
|
+
rubygems_version: 2.7.4
|
112
114
|
signing_key:
|
113
115
|
specification_version: 4
|
114
116
|
summary: Okapi Bm25 for Japanese
|