bm25 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/bm25/parser.rb +11 -33
- data/lib/bm25/stopword.txt +1 -0
- data/lib/bm25/utils.rb +22 -0
- data/lib/bm25/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '039fdd5965e1f170b441115ddce07581551e4b210e5ff7814f34e409a836bbad'
|
4
|
+
data.tar.gz: cc171ad4db4e7a925c897d2ece2235089fd812a7e51cd663dc241aeb9d739716
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b1bd14b98cc0d801a4692471c9f5efae4b36cef4ed8d9928395ea4c2b93e4c17691564fd7a7a239b5024a2a06d0b46a243ac2436168ab93fc264700ec3b1f94f
|
7
|
+
data.tar.gz: 1bd71a4c0c9437122ff25c7184da839414b13dcccdef673b5fa103235e7c9cd5350584ae7025c67d5c92bb9051e5ca7532b6023f5b699e5055fc8bd747c71dc7
|
data/Gemfile.lock
CHANGED
data/lib/bm25/parser.rb
CHANGED
@@ -4,12 +4,11 @@ require 'pp'
|
|
4
4
|
|
5
5
|
module Bm25
|
6
6
|
class Parser
|
7
|
-
def initialize(
|
7
|
+
def initialize()
|
8
8
|
@base_document = ''
|
9
9
|
@docs = []
|
10
10
|
@idf_map = {}
|
11
11
|
@all_word_length = 0
|
12
|
-
@scopes = scopes.join('|')
|
13
12
|
end
|
14
13
|
|
15
14
|
def create_data
|
@@ -28,7 +27,7 @@ module Bm25
|
|
28
27
|
@docs = []
|
29
28
|
|
30
29
|
@base_document = document
|
31
|
-
@all_word_length =
|
30
|
+
@all_word_length = Bm25::Utils.separate_words(document).length
|
32
31
|
|
33
32
|
data = self.create_data
|
34
33
|
data = self.get_important_keyword(data)
|
@@ -37,10 +36,10 @@ module Bm25
|
|
37
36
|
|
38
37
|
def create_docs
|
39
38
|
nm = Natto::MeCab.new
|
40
|
-
doc_list =
|
39
|
+
doc_list = Bm25::Utils.separate_document(@base_document)
|
41
40
|
|
42
41
|
doc_list.each do |d|
|
43
|
-
total_words = separate_words(d)
|
42
|
+
total_words = Bm25::Utils.separate_words(d)
|
44
43
|
word_map = {}
|
45
44
|
total_words.each do |w|
|
46
45
|
count = 0
|
@@ -55,12 +54,12 @@ module Bm25
|
|
55
54
|
end
|
56
55
|
avarage_word_length = @all_word_length / doc_list.length
|
57
56
|
# NDL(j) = 文書Djの総単語数 / (すべての文書の平均DL)
|
58
|
-
@docs
|
57
|
+
@docs << {
|
59
58
|
document: d,
|
60
59
|
words: word_map,
|
61
60
|
words_length: total_words.length,
|
62
61
|
dl: total_words.length / avarage_word_length.to_f
|
63
|
-
}
|
62
|
+
}
|
64
63
|
end
|
65
64
|
end
|
66
65
|
|
@@ -68,7 +67,7 @@ module Bm25
|
|
68
67
|
words = []
|
69
68
|
@docs.each do |d|
|
70
69
|
d[:words].each_pair do |k, v|
|
71
|
-
words
|
70
|
+
words << k
|
72
71
|
end
|
73
72
|
end
|
74
73
|
words = words.uniq
|
@@ -91,18 +90,18 @@ module Bm25
|
|
91
90
|
b = 0.75
|
92
91
|
d[:words].each_pair do |k, v|
|
93
92
|
# [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + (b * NDL(j)) + TF(i,j) ]
|
94
|
-
new_words
|
93
|
+
new_words << {
|
95
94
|
word: k,
|
96
95
|
tf: v[:tf],
|
97
96
|
idf: @idf_map[k][:idf],
|
98
97
|
val: @idf_map[k][:idf] * v[:tf],
|
99
98
|
bm25: (v[:tf] * @idf_map[k][:idf] + (k1 + 1)) / (k1 * (1 - b + (b * d[:dl])) + v[:tf])
|
100
|
-
}
|
99
|
+
}
|
101
100
|
end
|
102
|
-
data
|
101
|
+
data << {
|
103
102
|
document: d[:document],
|
104
103
|
words: new_words.sort_by{|w| -w[:bm25]}
|
105
|
-
}
|
104
|
+
}
|
106
105
|
end
|
107
106
|
return data
|
108
107
|
end
|
@@ -123,26 +122,5 @@ module Bm25
|
|
123
122
|
return word_map.sort {|(k1, v1), (k2, v2)| v2 <=> v1 }
|
124
123
|
end
|
125
124
|
|
126
|
-
def separate_words(document)
|
127
|
-
nm = Natto::MeCab.new
|
128
|
-
data = []
|
129
|
-
nm.parse(document) do |n|
|
130
|
-
if (n.is_bos? || n.is_eos?) ||
|
131
|
-
n.feature.scan(/#{@scopes}/).length === 0 ||
|
132
|
-
n.surface.match(/[\/\d]/) ||
|
133
|
-
Bm25::Utils.is_stopword?(n.surface) ||
|
134
|
-
Bm25::Utils.is_onechar?(n.surface)
|
135
|
-
next
|
136
|
-
end
|
137
|
-
data.push(n.surface)
|
138
|
-
end
|
139
|
-
return data
|
140
|
-
end
|
141
|
-
|
142
|
-
def separate_document(document)
|
143
|
-
docs = document.scan(/[^。^\.^\,\r\n|\n|\r]+/)
|
144
|
-
return docs
|
145
|
-
end
|
146
|
-
|
147
125
|
end
|
148
126
|
end
|
data/lib/bm25/stopword.txt
CHANGED
data/lib/bm25/utils.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'natto'
|
1
2
|
module Bm25
|
2
3
|
|
3
4
|
module Utils
|
@@ -23,6 +24,27 @@ module Bm25
|
|
23
24
|
return word.size == 1
|
24
25
|
end
|
25
26
|
|
27
|
+
def separate_words(document)
|
28
|
+
nm = Natto::MeCab.new
|
29
|
+
data = []
|
30
|
+
nm.parse(document) do |n|
|
31
|
+
if (n.is_bos? || n.is_eos?) ||
|
32
|
+
n.feature.scan(/名詞|固有名詞/).length === 0 ||
|
33
|
+
n.surface.match(/[\/\d]/) ||
|
34
|
+
self.is_stopword?(n.surface) ||
|
35
|
+
self.is_onechar?(n.surface)
|
36
|
+
next
|
37
|
+
end
|
38
|
+
data << n.surface
|
39
|
+
end
|
40
|
+
return data
|
41
|
+
end
|
42
|
+
|
43
|
+
def separate_document(document)
|
44
|
+
docs = document.scan(/[^。^\.^\,\r\n|\n|\r]+/)
|
45
|
+
return docs
|
46
|
+
end
|
47
|
+
|
26
48
|
end
|
27
49
|
|
28
50
|
end
|
data/lib/bm25/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bm25
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Masayuki Komatsu
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-03-
|
11
|
+
date: 2018-03-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|