bm25 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '039fdd5965e1f170b441115ddce07581551e4b210e5ff7814f34e409a836bbad'
4
- data.tar.gz: cc171ad4db4e7a925c897d2ece2235089fd812a7e51cd663dc241aeb9d739716
3
+ metadata.gz: 23849a1ca209b9d85bddd83c0e135332af3176f72e14fe1301819107fcf25f65
4
+ data.tar.gz: 3a736717ba9c03c17548d2c295846daa2e71a044b167c59a8ffdd8bdfe2a0a9f
5
5
  SHA512:
6
- metadata.gz: b1bd14b98cc0d801a4692471c9f5efae4b36cef4ed8d9928395ea4c2b93e4c17691564fd7a7a239b5024a2a06d0b46a243ac2436168ab93fc264700ec3b1f94f
7
- data.tar.gz: 1bd71a4c0c9437122ff25c7184da839414b13dcccdef673b5fa103235e7c9cd5350584ae7025c67d5c92bb9051e5ca7532b6023f5b699e5055fc8bd747c71dc7
6
+ metadata.gz: 3b806bd8c54e3bd7b434c9fce996b838c0e6345b25e152458ac96b8a08c873e5405ed2304b50fa09aa706689db1cc695b10d2aa5a63dbbe607a28b36102ef855
7
+ data.tar.gz: 67e43c256e7dfb42f077bed16cdefc13b3f46119fbffb11ea53f26822700cbbefd4d0a253adc7c134a2837a1b62ef652220e409850aaed9e03408ebb99b9537f
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- bm25 (0.1.5)
4
+ bm25 (0.1.6)
5
5
  natto
6
6
 
7
7
  GEM
@@ -1,4 +1,5 @@
1
1
  require_relative 'utils'
2
+ require_relative 'validator'
2
3
  require 'natto'
3
4
  require 'pp'
4
5
 
@@ -53,7 +54,6 @@ module Bm25
53
54
  end
54
55
  end
55
56
  avarage_word_length = @all_word_length / doc_list.length
56
- # NDL(j) = 文書Djの総単語数 / (すべての文書の平均DL)
57
57
  @docs << {
58
58
  document: d,
59
59
  words: word_map,
@@ -66,10 +66,9 @@ module Bm25
66
66
  def create_idf_map
67
67
  words = []
68
68
  @docs.each do |d|
69
- d[:words].each_pair do |k, v|
70
- words << k
71
- end
69
+ d[:words].each_pair{|k, v| words << k }
72
70
  end
71
+
73
72
  words = words.uniq
74
73
  words.each do |word|
75
74
  f = 0
@@ -88,14 +87,15 @@ module Bm25
88
87
  new_words = []
89
88
  k1 = 1.2
90
89
  b = 0.75
90
+ # [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + (b * NDL(j))) + TF(i,j) ]
91
91
  d[:words].each_pair do |k, v|
92
- # [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + (b * NDL(j)) + TF(i,j) ]
92
+ tfidf = @idf_map[k][:idf] * v[:tf]
93
93
  new_words << {
94
94
  word: k,
95
95
  tf: v[:tf],
96
96
  idf: @idf_map[k][:idf],
97
- val: @idf_map[k][:idf] * v[:tf],
98
- bm25: (v[:tf] * @idf_map[k][:idf] + (k1 + 1)) / (k1 * (1 - b + (b * d[:dl])) + v[:tf])
97
+ tfidf: tfidf,
98
+ bm25: (tfidf + (k1 + 1)) / (k1 * (1 - b + (b * d[:dl])) + v[:tf])
99
99
  }
100
100
  end
101
101
  data << {
@@ -1,38 +1,14 @@
1
- require 'natto'
2
1
  module Bm25
3
2
 
4
3
  module Utils
5
4
 
6
5
  class << self
7
6
 
8
- def is_stopword? (word)
9
- match = false
10
- stopword_path = File.join( File.dirname(__FILE__), 'stopword.txt' )
11
- File.open(stopword_path, "r") do |f|
12
- f.each_line do |t|
13
- if t.chomp === word
14
- match = true
15
- break
16
- end
17
- end
18
- end
19
- return match
20
-
21
- end
22
-
23
- def is_onechar?(word)
24
- return word.size == 1
25
- end
26
-
27
7
  def separate_words(document)
28
8
  nm = Natto::MeCab.new
29
9
  data = []
30
10
  nm.parse(document) do |n|
31
- if (n.is_bos? || n.is_eos?) ||
32
- n.feature.scan(/名詞|固有名詞/).length === 0 ||
33
- n.surface.match(/[\/\d]/) ||
34
- self.is_stopword?(n.surface) ||
35
- self.is_onechar?(n.surface)
11
+ if Bm25::Validator.validate_word(n)
36
12
  next
37
13
  end
38
14
  data << n.surface
@@ -0,0 +1,47 @@
1
+ module Bm25
2
+ class Validator
3
+
4
+ class << self
5
+
6
+ # @param [String] word
7
+ # @return [Boolean] return True if the word is a stopword
8
+ def is_stopword? (word)
9
+ match = false
10
+ stopword_path = File.join( File.dirname(__FILE__), 'stopword.txt' )
11
+ File.open(stopword_path, "r") do |f|
12
+ f.each_line do |t|
13
+ if t.chomp === word
14
+ match = true
15
+ break
16
+ end
17
+ end
18
+ end
19
+ return match
20
+ end
21
+
22
+ # @param [String] word
23
+ # @return [Boolean] return True if the word is one character
24
+ def is_onechar?(word)
25
+ return word.size == 1
26
+ end
27
+
28
+ # @param [Object] word_obj MeCab node yielded by Natto::MeCab#parse
29
+ # @return [Boolean] true if the node should be skipped: it is BOS/EOS, is not a noun, contains "/" or a digit, is a stopword, or is a single character
30
+ def validate_word(word_obj)
31
+ n = word_obj
32
+ if (n.is_bos? || n.is_eos?) ||
33
+ n.feature.scan(/名詞/).length === 0 ||
34
+ n.surface.match(/[\/\d]/) ||
35
+ self.is_stopword?(n.surface) ||
36
+ self.is_onechar?(n.surface)
37
+ return true
38
+ end
39
+
40
+ return false
41
+
42
+ end
43
+
44
+ end
45
+
46
+ end
47
+ end
@@ -1,3 +1,3 @@
1
1
  module Bm25
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.6"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bm25
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Masayuki Komatsu
@@ -89,6 +89,7 @@ files:
89
89
  - lib/bm25/parser.rb
90
90
  - lib/bm25/stopword.txt
91
91
  - lib/bm25/utils.rb
92
+ - lib/bm25/validator.rb
92
93
  - lib/bm25/version.rb
93
94
  homepage: https://github.com/Bit-Pumpkin/bm25
94
95
  licenses: