bm25 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/bm25/parser.rb +7 -7
- data/lib/bm25/utils.rb +1 -25
- data/lib/bm25/validator.rb +47 -0
- data/lib/bm25/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 23849a1ca209b9d85bddd83c0e135332af3176f72e14fe1301819107fcf25f65
|
4
|
+
data.tar.gz: 3a736717ba9c03c17548d2c295846daa2e71a044b167c59a8ffdd8bdfe2a0a9f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3b806bd8c54e3bd7b434c9fce996b838c0e6345b25e152458ac96b8a08c873e5405ed2304b50fa09aa706689db1cc695b10d2aa5a63dbbe607a28b36102ef855
|
7
|
+
data.tar.gz: 67e43c256e7dfb42f077bed16cdefc13b3f46119fbffb11ea53f26822700cbbefd4d0a253adc7c134a2837a1b62ef652220e409850aaed9e03408ebb99b9537f
|
data/Gemfile.lock
CHANGED
data/lib/bm25/parser.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require_relative 'utils'
|
2
|
+
require_relative 'validator'
|
2
3
|
require 'natto'
|
3
4
|
require 'pp'
|
4
5
|
|
@@ -53,7 +54,6 @@ module Bm25
|
|
53
54
|
end
|
54
55
|
end
|
55
56
|
avarage_word_length = @all_word_length / doc_list.length
|
56
|
-
# NDL(j) = 文書Djの総単語数 / (すべての文書の平均DL)
|
57
57
|
@docs << {
|
58
58
|
document: d,
|
59
59
|
words: word_map,
|
@@ -66,10 +66,9 @@ module Bm25
|
|
66
66
|
def create_idf_map
|
67
67
|
words = []
|
68
68
|
@docs.each do |d|
|
69
|
-
d[:words].each_pair
|
70
|
-
words << k
|
71
|
-
end
|
69
|
+
d[:words].each_pair{|k, v| words << k }
|
72
70
|
end
|
71
|
+
|
73
72
|
words = words.uniq
|
74
73
|
words.each do |word|
|
75
74
|
f = 0
|
@@ -88,14 +87,15 @@ module Bm25
|
|
88
87
|
new_words = []
|
89
88
|
k1 = 1.2
|
90
89
|
b = 0.75
|
90
|
+
# [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + (b * NDL(j))) + TF(i,j) ]
|
91
91
|
d[:words].each_pair do |k, v|
|
92
|
-
|
92
|
+
tfidf = @idf_map[k][:idf] * v[:tf]
|
93
93
|
new_words << {
|
94
94
|
word: k,
|
95
95
|
tf: v[:tf],
|
96
96
|
idf: @idf_map[k][:idf],
|
97
|
-
|
98
|
-
bm25: (
|
97
|
+
tfidf: tfidf,
|
98
|
+
bm25: (tfidf + (k1 + 1)) / (k1 * (1 - b + (b * d[:dl])) + v[:tf])
|
99
99
|
}
|
100
100
|
end
|
101
101
|
data << {
|
data/lib/bm25/utils.rb
CHANGED
@@ -1,38 +1,14 @@
|
|
1
|
-
require 'natto'
|
2
1
|
module Bm25
|
3
2
|
|
4
3
|
module Utils
|
5
4
|
|
6
5
|
class << self
|
7
6
|
|
8
|
-
def is_stopword? (word)
|
9
|
-
match = false
|
10
|
-
stopword_path = File.join( File.dirname(__FILE__), 'stopword.txt' )
|
11
|
-
File.open(stopword_path, "r") do |f|
|
12
|
-
f.each_line do |t|
|
13
|
-
if t.chomp === word
|
14
|
-
match = true
|
15
|
-
break
|
16
|
-
end
|
17
|
-
end
|
18
|
-
end
|
19
|
-
return match
|
20
|
-
|
21
|
-
end
|
22
|
-
|
23
|
-
def is_onechar?(word)
|
24
|
-
return word.size == 1
|
25
|
-
end
|
26
|
-
|
27
7
|
def separate_words(document)
|
28
8
|
nm = Natto::MeCab.new
|
29
9
|
data = []
|
30
10
|
nm.parse(document) do |n|
|
31
|
-
if (n
|
32
|
-
n.feature.scan(/名詞|固有名詞/).length === 0 ||
|
33
|
-
n.surface.match(/[\/\d]/) ||
|
34
|
-
self.is_stopword?(n.surface) ||
|
35
|
-
self.is_onechar?(n.surface)
|
11
|
+
if Bm25::Validator.validate_word(n)
|
36
12
|
next
|
37
13
|
end
|
38
14
|
data << n.surface
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Bm25
|
2
|
+
class Validator
|
3
|
+
|
4
|
+
class << self
|
5
|
+
|
6
|
+
# @param [String] word
|
7
|
+
# @return [Boolean] return True if the word is a stopword
|
8
|
+
def is_stopword? (word)
|
9
|
+
match = false
|
10
|
+
stopword_path = File.join( File.dirname(__FILE__), 'stopword.txt' )
|
11
|
+
File.open(stopword_path, "r") do |f|
|
12
|
+
f.each_line do |t|
|
13
|
+
if t.chomp === word
|
14
|
+
match = true
|
15
|
+
break
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
return match
|
20
|
+
end
|
21
|
+
|
22
|
+
# @param [String] word
|
23
|
+
# @return [Boolean] return True if the word is one character
|
24
|
+
def is_onechar?(word)
|
25
|
+
return word.size == 1
|
26
|
+
end
|
27
|
+
|
28
|
+
# @param [Object] mecab object
|
29
|
+
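# @return [Boolean] true if the MeCab node should be skipped (it is BOS/EOS, is not a noun, contains a slash or digit, is a stopword, or is a single character); false otherwise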
|
30
|
+
def validate_word(word_obj)
|
31
|
+
n = word_obj
|
32
|
+
if (n.is_bos? || n.is_eos?) ||
|
33
|
+
n.feature.scan(/名詞/).length === 0 ||
|
34
|
+
n.surface.match(/[\/\d]/) ||
|
35
|
+
self.is_stopword?(n.surface) ||
|
36
|
+
self.is_onechar?(n.surface)
|
37
|
+
return true
|
38
|
+
end
|
39
|
+
|
40
|
+
return false
|
41
|
+
|
42
|
+
end
|
43
|
+
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
end
|
data/lib/bm25/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bm25
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Masayuki Komatsu
|
@@ -89,6 +89,7 @@ files:
|
|
89
89
|
- lib/bm25/parser.rb
|
90
90
|
- lib/bm25/stopword.txt
|
91
91
|
- lib/bm25/utils.rb
|
92
|
+
- lib/bm25/validator.rb
|
92
93
|
- lib/bm25/version.rb
|
93
94
|
homepage: https://github.com/Bit-Pumpkin/bm25
|
94
95
|
licenses:
|