RubyGems - bm25f - Versions diffs - 0.2.3 → 0.2.5 - Mend

bm25f 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: b00f60fdde35d0565f169e9486d2935e5ddcef9bbd8084d60d9228364c9ebed7
-  data.tar.gz: 7249983dda5a101fb1b2ddb646414b58776ae9a29feb4f563707e9a654d45094
+  metadata.gz: 9b141349387662e73de071841670357c61b61d775cb61501d84dc7ccc801e6de
+  data.tar.gz: ad7bb2c6223a5f125224b0a5d0afdb9c3950811c9c09cd1a39ae943b38a39eb9
 SHA512:
-  metadata.gz: 16231fa9ba99f19cff2a6e9e801b734b53effa584da0bc8fcc80000774bbe61c75f1994bfbdc39b3cdac8d427b5049ff56dd1dbcc669eed3b836d910e45f5547
-  data.tar.gz: eea02e6dc1989ff69baa6097997da274149083a87cbbc7355101e613103f09fa52886718422cf68b9cb9afadb2e4ed9ce54be8117cba536a99647db01acb24f0
+  metadata.gz: 3662bed23e2c1f041766b29ffe2f87d13ec18e107870c4948fc85851bbc1640a569a9f831f67f46bb8035577fc8a33654461bd2f4d2fe4e324ef3b302f66f665
+  data.tar.gz: e27b5d15af219b3dfc41e675015bacd5c5c1d930c0f70df844f8341fab0062670eb3690bdc41163ed628e2e90f340c1fb9814ce6dd9c30fc30daf77056b4b89a

data/lib/bm25f.rb CHANGED Viewed

@@ -1,8 +1,7 @@
-require 'treat'
+require 'uea-stemmer'
+require 'pragmatic_tokenizer'
 class BM25F
-  include Treat::Core::DSL
   # Initializes a BM25F model.
   #
   # @param term_freq_weight [Float] Weight for term frequency.
@@ -10,6 +9,9 @@ class BM25F
   def initialize(term_freq_weight: 1.33, doc_length_weight: 0.8)
     @term_freq_weight = term_freq_weight
     @doc_length_weight = doc_length_weight
+    @tokenizer = PragmaticTokenizer::Tokenizer.new
+    @stemmer = UEAStemmer.new
   end
   # Fits the model to a set of documents.
@@ -56,7 +58,7 @@ class BM25F
     documents.each do |k, v|
       next unless v.instance_of? String
-      documents[k] = sentence(v).map(&:stem).join(' ')
+      documents[k] = v.map { |t| @stemmer.stem t }.join(' ')
     end
     documents
   end
@@ -88,7 +90,7 @@ class BM25F
   def calculate_idf
     idf = {}
     @field_weights.each_key do |field|
-      field_doc_count = @documents.count { |doc| !doc[field].empty? }
+      field_doc_count = @documents.count { |doc| !doc[field]&.empty? }
       idf[field] = Math.log((@total_docs - field_doc_count + 0.5) / (field_doc_count + 0.5) + 1.0)
     end
     idf
@@ -99,7 +101,7 @@ class BM25F
   # @param query [String] The query to preprocess.
   # @return [Array<String>] An array of preprocessed query terms.
   def preprocess_query(query)
-    sentence(query).tokenize.map(&:stem)
+    @tokenizer.tokenize(query).map { |t| @stemmer.stem t }
   end
   # Calculates the score of a document using an array of query terms.
@@ -114,7 +116,10 @@ class BM25F
         tf = field_term_frequency(field, term, doc_id)
         idf = @idf[field]
         field_length_norm = field_length_norm(field, doc_id)
-        doc_score += @field_weights[field] * ((tf * (@term_freq_weight + 1)) / (tf + @term_freq_weight * field_length_norm) * idf)
+        val = @field_weights[field] * ((tf * (@term_freq_weight + 1)) / (tf + @term_freq_weight * field_length_norm) * idf)
+        val = 0 if val.nan?
+        doc_score += val
       end
     end
     doc_score
@@ -127,7 +132,10 @@ class BM25F
   # @param doc_id [Integer] The document ID.
   # @return [Integer] The term frequency.
   def field_term_frequency(field, term, doc_id)
-    @documents[doc_id][field].scan(term).count
+    val = @documents[doc_id][field]
+    return 0 if val.nil?
+    val.scan(term).count
   end
   # Calculates the field length normalization factor of a document.
@@ -136,6 +144,9 @@ class BM25F
   # @param doc_id [Integer] The document ID.
   # @return [Float] The field length normalization factor.
   def field_length_norm(field, doc_id)
-    1.0 - @doc_length_weight + @doc_length_weight * (@doc_lengths[doc_id][field] / @avg_doc_length)
+    val = @doc_lengths[doc_id][field]
+    return 0 if val.nil?
+    1.0 - @doc_length_weight + @doc_length_weight * (val / @avg_doc_length)
   end
 end

data/test/test_bm25f.rb CHANGED Viewed

@@ -5,19 +5,20 @@ class BM25FTest < Minitest::Test
   def setup
     @bm25f = BM25F.new
     @documents = [
-      { title: 'hello world', content: 'foo bar baz' },
-      { title: 'foo bar', content: 'goodbye, world!' }
+      { url: 'https://wikimedia.org', title: 'Wikimedia',
+        content: 'Wikimedia. Wikimedia is a global movement whose mission is to bring free educational content to the world. Through various projects, chapters and the support structure of the ...' },
+      { url: 'https://twitter.com/Wikipedia', title: 'Wikipedia (@Wikipedia) · Twitter', content: nil },
+      { url: 'https://play.google.com/store/apps/details', title: 'Wikipedia - Apps on Google Play',
+        content: 'The best Wikipedia experience on your Mobile device. Ad-free and free of charge, forever. With the official Wikipedia app, you can search and explore 40+ ...' },
+      { url: 'https://www.wikipedia.org', title: 'Wikipedia',
+        content: 'Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation.' }
     ]
   end
   def test_score
     @bm25f.fit @documents
-    scores = @bm25f.score 'hello world foo bar baz'
+    scores = @bm25f.score 'wikipedia'
-    # Sort
-    scores = scores.to_a.sort_by { |_, v| v.to_i }
-    # Checks if the most matching element is the first element
-    assert scores.last[0].zero?
+    puts scores.inspect
   end
 end

metadata CHANGED Viewed

@@ -1,29 +1,43 @@
 --- !ruby/object:Gem::Specification
 name: bm25f
 version: !ruby/object:Gem::Version
-  version: 0.2.3
+  version: 0.2.5
 platform: ruby
 authors:
 - catflip
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-09-10 00:00:00.000000000 Z
+date: 2023-09-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: treat
+  name: pragmatic_tokenizer
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: '2.1'
+        version: 3.2.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: '2.1'
+        version: 3.2.0
+- !ruby/object:Gem::Dependency
+  name: uea-stemmer
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.10.3
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.10.3
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement