bm25 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3cb66896c835cefd13368e83f874fb6a0d3667bade74c27e3f069685fade9abc
4
- data.tar.gz: 40b158e69b60560e880fc40ffab05b01066fe3e49ade003b1bda51bb920fd09e
3
+ metadata.gz: '039fdd5965e1f170b441115ddce07581551e4b210e5ff7814f34e409a836bbad'
4
+ data.tar.gz: cc171ad4db4e7a925c897d2ece2235089fd812a7e51cd663dc241aeb9d739716
5
5
  SHA512:
6
- metadata.gz: 301214a74bb46d76161264e1f767dd34d09216734b17cb55ad955eda0e741a648dc259778b24eb9bcfa51b8659a60a7bf310044992e626ceaa3e5187fa405bd9
7
- data.tar.gz: 3b6b7938854cd8eba7581599f107f17f7fc7234923c36c96fc07f1fb800f178bdbbce08710692481ef561859475bbd59b8eddc59a1a5135aa6667f47b4ce9302
6
+ metadata.gz: b1bd14b98cc0d801a4692471c9f5efae4b36cef4ed8d9928395ea4c2b93e4c17691564fd7a7a239b5024a2a06d0b46a243ac2436168ab93fc264700ec3b1f94f
7
+ data.tar.gz: 1bd71a4c0c9437122ff25c7184da839414b13dcccdef673b5fa103235e7c9cd5350584ae7025c67d5c92bb9051e5ca7532b6023f5b699e5055fc8bd747c71dc7
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- bm25 (0.1.4)
4
+ bm25 (0.1.5)
5
5
  natto
6
6
 
7
7
  GEM
@@ -4,12 +4,11 @@ require 'pp'
4
4
 
5
5
  module Bm25
6
6
  class Parser
7
- def initialize(scopes = [])
7
+ def initialize()
8
8
  @base_document = ''
9
9
  @docs = []
10
10
  @idf_map = {}
11
11
  @all_word_length = 0
12
- @scopes = scopes.join('|')
13
12
  end
14
13
 
15
14
  def create_data
@@ -28,7 +27,7 @@ module Bm25
28
27
  @docs = []
29
28
 
30
29
  @base_document = document
31
- @all_word_length = self.separate_words(document).length
30
+ @all_word_length = Bm25::Utils.separate_words(document).length
32
31
 
33
32
  data = self.create_data
34
33
  data = self.get_important_keyword(data)
@@ -37,10 +36,10 @@ module Bm25
37
36
 
38
37
  def create_docs
39
38
  nm = Natto::MeCab.new
40
- doc_list = self.separate_document(@base_document)
39
+ doc_list = Bm25::Utils.separate_document(@base_document)
41
40
 
42
41
  doc_list.each do |d|
43
- total_words = separate_words(d)
42
+ total_words = Bm25::Utils.separate_words(d)
44
43
  word_map = {}
45
44
  total_words.each do |w|
46
45
  count = 0
@@ -55,12 +54,12 @@ module Bm25
55
54
  end
56
55
  avarage_word_length = @all_word_length / doc_list.length
57
56
  # NDL(j) = 文書Djの総単語数 / (すべての文書の平均DL)
58
- @docs.push({
57
+ @docs << {
59
58
  document: d,
60
59
  words: word_map,
61
60
  words_length: total_words.length,
62
61
  dl: total_words.length / avarage_word_length.to_f
63
- })
62
+ }
64
63
  end
65
64
  end
66
65
 
@@ -68,7 +67,7 @@ module Bm25
68
67
  words = []
69
68
  @docs.each do |d|
70
69
  d[:words].each_pair do |k, v|
71
- words.push(k)
70
+ words << k
72
71
  end
73
72
  end
74
73
  words = words.uniq
@@ -91,18 +90,18 @@ module Bm25
91
90
  b = 0.75
92
91
  d[:words].each_pair do |k, v|
93
92
  # [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + (b * NDL(j)) + TF(i,j) ]
94
- new_words.push({
93
+ new_words << {
95
94
  word: k,
96
95
  tf: v[:tf],
97
96
  idf: @idf_map[k][:idf],
98
97
  val: @idf_map[k][:idf] * v[:tf],
99
98
  bm25: (v[:tf] * @idf_map[k][:idf] + (k1 + 1)) / (k1 * (1 - b + (b * d[:dl])) + v[:tf])
100
- })
99
+ }
101
100
  end
102
- data.push({
101
+ data << {
103
102
  document: d[:document],
104
103
  words: new_words.sort_by{|w| -w[:bm25]}
105
- })
104
+ }
106
105
  end
107
106
  return data
108
107
  end
@@ -123,26 +122,5 @@ module Bm25
123
122
  return word_map.sort {|(k1, v1), (k2, v2)| v2 <=> v1 }
124
123
  end
125
124
 
126
- def separate_words(document)
127
- nm = Natto::MeCab.new
128
- data = []
129
- nm.parse(document) do |n|
130
- if (n.is_bos? || n.is_eos?) ||
131
- n.feature.scan(/#{@scopes}/).length === 0 ||
132
- n.surface.match(/[\/\d]/) ||
133
- Bm25::Utils.is_stopword?(n.surface) ||
134
- Bm25::Utils.is_onechar?(n.surface)
135
- next
136
- end
137
- data.push(n.surface)
138
- end
139
- return data
140
- end
141
-
142
- def separate_document(document)
143
- docs = document.scan(/[^。^\.^\,\r\n|\n|\r]+/)
144
- return docs
145
- end
146
-
147
125
  end
148
126
  end
@@ -102,6 +102,7 @@
102
102
  わたし
103
103
  けど
104
104
  ので
105
+ です
105
106
  ハイ
106
107
 
107
108
 
@@ -1,3 +1,4 @@
1
+ require 'natto'
1
2
  module Bm25
2
3
 
3
4
  module Utils
@@ -23,6 +24,27 @@ module Bm25
23
24
  return word.size == 1
24
25
  end
25
26
 
27
+ def separate_words(document)
28
+ nm = Natto::MeCab.new
29
+ data = []
30
+ nm.parse(document) do |n|
31
+ if (n.is_bos? || n.is_eos?) ||
32
+ n.feature.scan(/名詞|固有名詞/).length === 0 ||
33
+ n.surface.match(/[\/\d]/) ||
34
+ self.is_stopword?(n.surface) ||
35
+ self.is_onechar?(n.surface)
36
+ next
37
+ end
38
+ data << n.surface
39
+ end
40
+ return data
41
+ end
42
+
43
+ def separate_document(document)
44
+ docs = document.scan(/[^。^\.^\,\r\n|\n|\r]+/)
45
+ return docs
46
+ end
47
+
26
48
  end
27
49
 
28
50
  end
@@ -1,3 +1,3 @@
1
1
  module Bm25
2
- VERSION = "0.1.4"
2
+ VERSION = "0.1.5"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bm25
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Masayuki Komatsu
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-03-28 00:00:00.000000000 Z
11
+ date: 2018-03-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler