RubyGems - bm25 - Versions diffs - 0.1.2 → 0.1.3 - Mend

bm25 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 0ef7fac79a6db09014f07c985635114c9daf1d93
-  data.tar.gz: 3e027d22087aeec8eca9aae97216d42fa0516ec1
+SHA256:
+  metadata.gz: 1a99f7e2ba2f3c27e3683119915f8d0f8bfe2d876ac872d1b9798727f31deefa
+  data.tar.gz: c5c3f13b6d5cd86baa620d16f0594ff4257562d259432c1b3695b9c3ba72f4d7
 SHA512:
-  metadata.gz: 112549c9a347cfdd8e6fedd50bbf30799d62a66ade8835b527797596eb80f0c2a1b4e4204e47d11374c1cde0b6d0fd5f915abf0600ee718be391af3634332471
-  data.tar.gz: 06370fa88c366ade0265f361b71963fc67b16cf474820323effbb77cc6b17993dd1c514c5c06aab1cdba8616a472c48b5ba893362b35c43d17394ea293537d45
+  metadata.gz: 74aecf440fe5ad44c6e01d4f59b6dd2c082558a466431928c91d9051a2c63d7eb14eb9c5b7bc88be502b38c1dc344a970e1dd329354956e29d0e6f3598b8521f
+  data.tar.gz: e7e10625e4ff8d6f1d237712a8c5381b75f1c8a9fb70fc475b55d17783263455722d5b438080681c7dbf317b4960e6bd89e8e8fe88922b57455ae24ffd91abb6

data/.gitignore CHANGED

@@ -9,5 +9,5 @@
 # rspec failure tracking
 .rspec_status
+*.gem
 test.rb

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    bm25 (0.1.2)
+    bm25 (0.1.3)
       natto
 GEM

data/README.md CHANGED

@@ -2,10 +2,8 @@
 Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/bm25`. To experiment with that code, run `bin/console` for an interactive prompt.
-TODO: Delete this and the text above, and describe your gem
 ## Installation
+Step. 1
 Add this line to your application's Gemfile:
 ```ruby
@@ -20,9 +18,27 @@ Or install it yourself as:
     $ gem install bm25
+Step. 2 Install Mecab and setup mecab-ipadic-neologd
+mecab: http://taku910.github.io/mecab/
+mecab-ipadic-neologd: https://github.com/neologd/mecab-ipadic-neologd
 ## Usage
+```
+require 'bm25'
+paser = Bm25::Parser.new(['名詞'])
+words = paser.execute("プログラマーだけど肩こりがひどいので懸垂バーを買って背中を鍛えることにした")
+# words = [
+#   ['プログラマー', 1.7142857142857146],
+#   ['肩こり', 1.7142857142857146],
+#   ['懸垂', 1.7142857142857146],
+#   ['バー', 1.7142857142857146],
+#   ['背中', 1.7142857142857146]
+# ]
+```
-TODO: Write usage instructions here
 ## Development

data/bm25.gemspec CHANGED

@@ -26,7 +26,6 @@ Gem::Specification.new do |spec|
   spec.files         = `git ls-files -z`.split("\x0").reject do |f|
     f.match(%r{^(test|spec|features)/})
   end
-  spec.bindir        = "exe"
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]

data/lib/bm25.rb CHANGED

@@ -1,6 +1,5 @@
-require "bm25/version"
 require 'bm25/parser'
+require 'bm25/utils'
 module Bm25
 end

data/lib/bm25/parser.rb CHANGED

@@ -1,140 +1,145 @@
+require_relative 'utils'
 require 'natto'
+require 'pp'
 module Bm25
   class Parser
+    def initialize(scopes = [])
+      @base_document = ''
+      @docs = []
+      @idf_map = {}
+      @all_word_length = 0
+      @scopes = scopes.join('|')
+    end
-      def initialize(scopes = [])
-        @base_document = ''
-        @docs = []
-        @idf_map = {}
-        @all_word_length = 0
-        @scopes = scopes.join('|')
-      end
-      def create_data
-        self.create_docs
-        self.create_idf_map
-        dataset = self.get_dataset
-        return dataset
-      end
+    def create_data
+      self.create_docs
+      self.create_idf_map
+      dataset = self.get_dataset
+      return dataset
+    end
-      def execute(document)
-        @allword_length = 0
-        @idf_map = {}
-        @docs = []
+    def execute(document)
+      @allword_length = 0
+      @idf_map = {}
+      @docs = []
-        @base_document = document
-        @all_word_length = self.separate_words(document).length
+      @base_document = document
+      @all_word_length = self.separate_words(document).length
-        data = self.create_data
-        data = self.get_important_keyword(data)
-        return data
-      end
+      data = self.create_data
+      data = self.get_important_keyword(data)
+      return data
+    end
-      def create_docs
-        nm = Natto::MeCab.new
-        doc_list = self.separate_document(@base_document)
+    def create_docs
+      nm = Natto::MeCab.new
+      doc_list = self.separate_document(@base_document)
-        doc_list.each do |d|
-          total_words = separate_words(d)
-          word_map = {}
-          total_words.each do |w|
-            count = 0
-            #単語数
-            count = d.scan(/#{Regexp.escape(w)}/).length
-            if word_map[w].nil?
-              word_map[w] = {
-                count: count,
-                tf: count.to_f / total_words.length
-              }
-            end
+      doc_list.each do |d|
+        total_words = separate_words(d)
+        word_map = {}
+        total_words.each do |w|
+          count = 0
+          #単語数
+          count = d.scan(/#{Regexp.escape(w)}/).length
+          if word_map[w].nil?
+            word_map[w] = {
+              count: count,
+              tf: count.to_f / total_words.length
+            }
           end
-          avarage_word_length = @all_word_length / doc_list.length
-          # NDL(j) = 文書Djの総単語数 / (すべての文書の平均DL)
-          @docs.push({
-            document: d,
-            words: word_map,
-            words_length: total_words.length,
-            dl: total_words.length / avarage_word_length.to_f
-          })
         end
+        avarage_word_length = @all_word_length / doc_list.length
+        # NDL(j) = 文書Djの総単語数 / (すべての文書の平均DL)
+        @docs.push({
+          document: d,
+          words: word_map,
+          words_length: total_words.length,
+          dl: total_words.length / avarage_word_length.to_f
+        })
       end
+    end
-      def create_idf_map
-        words = []
-        @docs.each do |d|
-          d[:words].each_pair do |k, v|
-            words.push(k)
-          end
-        end
-        words = words.uniq
-        words.each do |word|
-          f = 0
-          @docs.each{|d| f = f + 1 if d[:words][word]}
-          idf = f === 0 ? 0 : @docs.length / f
-          @idf_map[word] = {
-            df: f,
-            idf: Math.log(idf) + 1
-          }
+    def create_idf_map
+      words = []
+      @docs.each do |d|
+        d[:words].each_pair do |k, v|
+          words.push(k)
         end
       end
+      words = words.uniq
+      words.each do |word|
+        f = 0
+        @docs.each{|d| f = f + 1 if d[:words][word]}
+        idf = f === 0 ? 0 : @docs.length / f
+        @idf_map[word] = {
+          df: f,
+          idf: Math.log(idf) + 1
+        }
+      end
+    end
-      def get_dataset
-        data = []
-        @docs.each do |d|
-          new_words = []
-          k1 = 1.2
-          b = 0.75
-          d[:words].each_pair do |k, v|
-            # [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + (b * NDL(j)) + TF(i,j) ]
-            new_words.push({
-              word: k,
-              tf: v[:tf],
-              idf: @idf_map[k][:idf],
-              val: @idf_map[k][:idf] * v[:tf],
-              bm25: (v[:tf] * @idf_map[k][:idf] + (k1 + 1)) / (k1 * (1 - b + (b * d[:dl])) + v[:tf])
-            })
-          end
-          data.push({
-            document: d[:document],
-            words: new_words.sort_by{|w| -w[:bm25]}
+    def get_dataset
+      data = []
+      @docs.each do |d|
+        new_words = []
+        k1 = 1.2
+        b = 0.75
+        d[:words].each_pair do |k, v|
+          # [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + (b * NDL(j)) + TF(i,j) ]
+          new_words.push({
+            word: k,
+            tf: v[:tf],
+            idf: @idf_map[k][:idf],
+            val: @idf_map[k][:idf] * v[:tf],
+            bm25: (v[:tf] * @idf_map[k][:idf] + (k1 + 1)) / (k1 * (1 - b + (b * d[:dl])) + v[:tf])
           })
         end
-        return data
+        data.push({
+          document: d[:document],
+          words: new_words.sort_by{|w| -w[:bm25]}
+        })
       end
+      return data
+    end
-      def get_important_keyword(dataset)
-        word_map = {}
-        dataset.each do |data|
-          data[:words].each do |val|
-            k = val[:word]
-            bm25 = val[:bm25]
-            if word_map[k]
-              word_map[k] = word_map[k] + bm25
-            else
-              word_map[k] = bm25
-            end
+    def get_important_keyword(dataset)
+      word_map = {}
+      dataset.each do |data|
+        data[:words].each do |val|
+          k = val[:word]
+          bm25 = val[:bm25]
+          if word_map[k]
+            word_map[k] = word_map[k] + bm25
+          else
+            word_map[k] = bm25
           end
         end
-        return word_map.sort {|(k1, v1), (k2, v2)| v2 <=> v1 }
       end
+      return word_map.sort {|(k1, v1), (k2, v2)| v2 <=> v1 }
+    end
-      def separate_words(document)
-        nm = Natto::MeCab.new
-        data = []
-        nm.parse(document) do |n|
-          if (n.is_bos? || n.is_eos?) || n.feature.scan(/#{@scopes}/).length === 0 || n.surface.match(/[\/\d]/)
-            next
-          end
-          data.push(n.surface)
+    def separate_words(document)
+      nm = Natto::MeCab.new
+      data = []
+      nm.parse(document) do |n|
+        if (n.is_bos? || n.is_eos?) ||
+            n.feature.scan(/#{@scopes}/).length === 0 ||
+            n.surface.match(/[\/\d]/) ||
+            Bm25::Utils.is_stopword?(n.surface) ||
+            Bm25::Utils.is_onechar?(n.surface)
+          next
         end
-        return data
+        data.push(n.surface)
       end
+      return data
+    end
-      def separate_document(document)
-        docs = document.scan(/[^。^\.^\,\r\n|\n|\r]+/)
-        return docs
-      end
+    def separate_document(document)
+      docs = document.scan(/[^。^\.^\,\r\n|\n|\r]+/)
+      return docs
+    end
   end
 end

data/lib/bm25/stopword.txt ADDED

@@ -0,0 +1,328 @@
+あそこ
+あたり
+あちら
+あっち
+あと
+あな
+あなた
+あれ
+いくつ
+いつ
+いま
+いや
+いろいろ
+うち
+おおまか
+おまえ
+おれ
+がい
+かく
+かたち
+かやの
+から
+がら
+きた
+くせ
+ここ
+こっち
+こと
+ごと
+こちら
+ごっちゃ
+これ
+これら
+ごろ
+さまざま
+さらい
+さん
+しかた
+しよう
+すか
+ずつ
+すね
+すべて
+ぜんぶ
+そう
+そこ
+そちら
+そっち
+そで
+それ
+それぞれ
+それなり
+たくさん
+たち
+たび
+ため
+だめ
+ちゃ
+ちゃん
+てん
+とおり
+とき
+どこ
+どこか
+ところ
+どちら
+どっか
+どっち
+どれ
+なか
+なかば
+なに
+など
+なん
+はじめ
+はず
+はるか
+ひと
+ひとつ
+ふく
+ぶり
+べつ
+へん
+ぺん
+ほう
+ほか
+まさ
+まし
+まとも
+まま
+みたい
+みつ
+みなさん
+みんな
+もと
+もの
+もん
+やつ
+よう
+よそ
+わけ
+わたし
+ハイ
+上
+中
+下
+字
+年
+月
+日
+時
+分
+秒
+週
+火
+水
+木
+金
+土
+国
+都
+道
+府
+県
+市
+区
+町
+村
+各
+第
+方
+何
+的
+度
+文
+者
+性
+体
+人
+他
+今
+部
+課
+係
+外
+類
+達
+気
+室
+口
+誰
+用
+界
+会
+首
+男
+女
+別
+話
+私
+屋
+店
+家
+場
+等
+見
+際
+観
+段
+略
+例
+系
+論
+形
+間
+地
+員
+線
+点
+書
+品
+力
+法
+感
+作
+元
+手
+数
+彼
+彼女
+子
+内
+楽
+喜
+怒
+哀
+輪
+頃
+化
+境
+俺
+奴
+高
+校
+婦
+伸
+紀
+誌
+レ
+行
+列
+事
+士
+台
+集
+様
+所
+歴
+器
+名
+情
+連
+毎
+式
+簿
+回
+匹
+個
+席
+束
+歳
+目
+通
+面
+円
+玉
+枚
+前
+後
+左
+右
+次
+先
+春
+夏
+秋
+冬
+一
+二
+三
+四
+五
+六
+七
+八
+九
+十
+百
+千
+万
+億
+兆
+下記
+上記
+時間
+今回
+前回
+場合
+一つ
+年生
+自分
+ヶ所
+ヵ所
+カ所
+箇所
+ヶ月
+ヵ月
+カ月
+箇月
+名前
+本当
+確か
+時点
+全部
+関係
+近く
+方法
+我々
+違い
+多く
+扱い
+新た
+その後
+半ば
+結局
+様々
+以前
+以後
+以降
+未満
+以上
+以下
+幾つ
+毎日
+自体
+向こう
+何人
+手段
+同じ
+感じ

data/lib/bm25/utils.rb ADDED

@@ -0,0 +1,30 @@
+module Bm25
+  module Utils
+    class << self
+      def is_stopword? (word)
+        match = false
+        File.open("lib/bm25/stopword.txt", "r") do |f|
+          f.each_line do |t|
+            if t.chomp === word
+              match = true
+              break
+            end
+          end
+        end
+        return match
+      end
+      def is_onechar?(word)
+        return word.size == 1
+      end
+    end
+  end
+end

data/lib/bm25/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Bm25
-  VERSION = "0.1.2"
+  VERSION = "0.1.3"
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bm25
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - Masayuki Komatsu
 autorequire:
-bindir: exe
+bindir: bin
 cert_chain: []
-date: 2018-03-24 00:00:00.000000000 Z
+date: 2018-03-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -87,6 +87,8 @@ files:
 - bm25.gemspec
 - lib/bm25.rb
 - lib/bm25/parser.rb
+- lib/bm25/stopword.txt
+- lib/bm25/utils.rb
 - lib/bm25/version.rb
 homepage: https://github.com/Bit-Pumpkin/bm25
 licenses:
@@ -108,7 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.5.1
+rubygems_version: 2.7.4
 signing_key:
 specification_version: 4
 summary: Okapi Bm25 for Japanese