RubyGems - bm25 - Versions diffs - 0.1.4 → 0.1.5 - Mend

bm25 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3cb66896c835cefd13368e83f874fb6a0d3667bade74c27e3f069685fade9abc
-  data.tar.gz: 40b158e69b60560e880fc40ffab05b01066fe3e49ade003b1bda51bb920fd09e
+  metadata.gz: '039fdd5965e1f170b441115ddce07581551e4b210e5ff7814f34e409a836bbad'
+  data.tar.gz: cc171ad4db4e7a925c897d2ece2235089fd812a7e51cd663dc241aeb9d739716
 SHA512:
-  metadata.gz: 301214a74bb46d76161264e1f767dd34d09216734b17cb55ad955eda0e741a648dc259778b24eb9bcfa51b8659a60a7bf310044992e626ceaa3e5187fa405bd9
-  data.tar.gz: 3b6b7938854cd8eba7581599f107f17f7fc7234923c36c96fc07f1fb800f178bdbbce08710692481ef561859475bbd59b8eddc59a1a5135aa6667f47b4ce9302
+  metadata.gz: b1bd14b98cc0d801a4692471c9f5efae4b36cef4ed8d9928395ea4c2b93e4c17691564fd7a7a239b5024a2a06d0b46a243ac2436168ab93fc264700ec3b1f94f
+  data.tar.gz: 1bd71a4c0c9437122ff25c7184da839414b13dcccdef673b5fa103235e7c9cd5350584ae7025c67d5c92bb9051e5ca7532b6023f5b699e5055fc8bd747c71dc7

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    bm25 (0.1.4)
+    bm25 (0.1.5)
       natto
 GEM

data/lib/bm25/parser.rb CHANGED

@@ -4,12 +4,11 @@ require 'pp'
 module Bm25
   class Parser
-    def initialize(scopes = [])
+    def initialize()
       @base_document = ''
       @docs = []
       @idf_map = {}
       @all_word_length = 0
-      @scopes = scopes.join('|')
     end
     def create_data
@@ -28,7 +27,7 @@ module Bm25
       @docs = []
       @base_document = document
-      @all_word_length = self.separate_words(document).length
+      @all_word_length = Bm25::Utils.separate_words(document).length
       data = self.create_data
       data = self.get_important_keyword(data)
@@ -37,10 +36,10 @@ module Bm25
     def create_docs
       nm = Natto::MeCab.new
-      doc_list = self.separate_document(@base_document)
+      doc_list = Bm25::Utils.separate_document(@base_document)
       doc_list.each do |d|
-        total_words = separate_words(d)
+        total_words = Bm25::Utils.separate_words(d)
         word_map = {}
         total_words.each do |w|
           count = 0
@@ -55,12 +54,12 @@ module Bm25
         end
         avarage_word_length = @all_word_length / doc_list.length
         # NDL(j) = 文書Djの総単語数 / (すべての文書の平均DL)
-        @docs.push({
+        @docs << {
           document: d,
           words: word_map,
           words_length: total_words.length,
           dl: total_words.length / avarage_word_length.to_f
-        })
+        }
       end
     end
@@ -68,7 +67,7 @@ module Bm25
       words = []
       @docs.each do |d|
         d[:words].each_pair do |k, v|
-          words.push(k)
+          words << k
         end
       end
       words = words.uniq
@@ -91,18 +90,18 @@ module Bm25
         b = 0.75
         d[:words].each_pair do |k, v|
           # [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + (b * NDL(j)) + TF(i,j) ]
-          new_words.push({
+          new_words << {
             word: k,
             tf: v[:tf],
             idf: @idf_map[k][:idf],
             val: @idf_map[k][:idf] * v[:tf],
             bm25: (v[:tf] * @idf_map[k][:idf] + (k1 + 1)) / (k1 * (1 - b + (b * d[:dl])) + v[:tf])
-          })
+          }
         end
-        data.push({
+        data << {
           document: d[:document],
           words: new_words.sort_by{|w| -w[:bm25]}
-        })
+        }
       end
       return data
     end
@@ -123,26 +122,5 @@ module Bm25
       return word_map.sort {|(k1, v1), (k2, v2)| v2 <=> v1 }
     end
-    def separate_words(document)
-      nm = Natto::MeCab.new
-      data = []
-      nm.parse(document) do |n|
-        if (n.is_bos? || n.is_eos?) ||
-            n.feature.scan(/#{@scopes}/).length === 0 ||
-            n.surface.match(/[\/\d]/) ||
-            Bm25::Utils.is_stopword?(n.surface) ||
-            Bm25::Utils.is_onechar?(n.surface)
-          next
-        end
-        data.push(n.surface)
-      end
-      return data
-    end
-    def separate_document(document)
-      docs = document.scan(/[^。^\.^\,\r\n|\n|\r]+/)
-      return docs
-    end
   end
 end

data/lib/bm25/stopword.txt CHANGED

@@ -102,6 +102,7 @@
 わたし
 けど
 ので
+です
 ハイ
 上
 中

data/lib/bm25/utils.rb CHANGED

@@ -1,3 +1,4 @@
+require 'natto'
 module Bm25
   module Utils
@@ -23,6 +24,27 @@ module Bm25
         return word.size == 1
       end
+      def separate_words(document)
+        nm = Natto::MeCab.new
+        data = []
+        nm.parse(document) do |n|
+          if (n.is_bos? || n.is_eos?) ||
+              n.feature.scan(/名詞|固有名詞/).length === 0 ||
+              n.surface.match(/[\/\d]/) ||
+              self.is_stopword?(n.surface) ||
+              self.is_onechar?(n.surface)
+            next
+          end
+          data << n.surface
+        end
+        return data
+      end
+      def separate_document(document)
+        docs = document.scan(/[^。^\.^\,\r\n|\n|\r]+/)
+        return docs
+      end
     end
   end

data/lib/bm25/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Bm25
-  VERSION = "0.1.4"
+  VERSION = "0.1.5"
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bm25
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.1.5
 platform: ruby
 authors:
 - Masayuki Komatsu
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-03-28 00:00:00.000000000 Z
+date: 2018-03-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler