bm25f 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4):
  1. checksums.yaml +4 -4
  2. data/lib/bm25f.rb +13 -4
  3. data/test/test_bm25f.rb +1 -1
  4. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b00f60fdde35d0565f169e9486d2935e5ddcef9bbd8084d60d9228364c9ebed7
4
- data.tar.gz: 7249983dda5a101fb1b2ddb646414b58776ae9a29feb4f563707e9a654d45094
3
+ metadata.gz: 7c7af39f073c1fc55d42199f9c882caa5adf672d5c94c715fe379c58cd4a4a19
4
+ data.tar.gz: 1ebb197c89da4b70780d3631658a1bb3d0ce5c244abe8b5875c41ee1632f63a3
5
5
  SHA512:
6
- metadata.gz: 16231fa9ba99f19cff2a6e9e801b734b53effa584da0bc8fcc80000774bbe61c75f1994bfbdc39b3cdac8d427b5049ff56dd1dbcc669eed3b836d910e45f5547
7
- data.tar.gz: eea02e6dc1989ff69baa6097997da274149083a87cbbc7355101e613103f09fa52886718422cf68b9cb9afadb2e4ed9ce54be8117cba536a99647db01acb24f0
6
+ metadata.gz: 816270030afd15716f8b89f4c74d5c11dbc32f169c7a10a1eca931782e21e320017aa95bab66e32c6b500d407922e289edce8db396c956a62c245871744bb79b
7
+ data.tar.gz: 0ef709327eaa9c5b14e1132bafc199c4d66759b509c85d8bf3dcb23ba5b8dbe526b698c0ab1051486741fa3a3d4d21d073f8d4e3d12ced9128ec16fedd6a39e4
data/lib/bm25f.rb CHANGED
@@ -88,7 +88,7 @@ class BM25F
88
88
  def calculate_idf
89
89
  idf = {}
90
90
  @field_weights.each_key do |field|
91
- field_doc_count = @documents.count { |doc| !doc[field].empty? }
91
+ field_doc_count = @documents.count { |doc| !doc[field]&.empty? }
92
92
  idf[field] = Math.log((@total_docs - field_doc_count + 0.5) / (field_doc_count + 0.5) + 1.0)
93
93
  end
94
94
  idf
@@ -114,7 +114,10 @@ class BM25F
114
114
  tf = field_term_frequency(field, term, doc_id)
115
115
  idf = @idf[field]
116
116
  field_length_norm = field_length_norm(field, doc_id)
117
- doc_score += @field_weights[field] * ((tf * (@term_freq_weight + 1)) / (tf + @term_freq_weight * field_length_norm) * idf)
117
+ val = @field_weights[field] * ((tf * (@term_freq_weight + 1)) / (tf + @term_freq_weight * field_length_norm) * idf)
118
+ val = 0 if val.nan?
119
+
120
+ doc_score += val
118
121
  end
119
122
  end
120
123
  doc_score
@@ -127,7 +130,10 @@ class BM25F
127
130
  # @param doc_id [Integer] The document ID.
128
131
  # @return [Integer] The term frequency.
129
132
  def field_term_frequency(field, term, doc_id)
130
- @documents[doc_id][field].scan(term).count
133
+ val = @documents[doc_id][field]
134
+ return 0 if val.nil?
135
+
136
+ val.scan(term).count
131
137
  end
132
138
 
133
139
  # Calculates the field length normalization factor of a document.
@@ -136,6 +142,9 @@ class BM25F
136
142
  # @param doc_id [Integer] The document ID.
137
143
  # @return [Float] The field length normalization factor.
138
144
  def field_length_norm(field, doc_id)
139
- 1.0 - @doc_length_weight + @doc_length_weight * (@doc_lengths[doc_id][field] / @avg_doc_length)
145
+ val = @doc_lengths[doc_id][field]
146
+ return 0 if val.nil?
147
+
148
+ 1.0 - @doc_length_weight + @doc_length_weight * (val / @avg_doc_length)
140
149
  end
141
150
  end
data/test/test_bm25f.rb CHANGED
@@ -6,7 +6,7 @@ class BM25FTest < Minitest::Test
6
6
  @bm25f = BM25F.new
7
7
  @documents = [
8
8
  { title: 'hello world', content: 'foo bar baz' },
9
- { title: 'foo bar', content: 'goodbye, world!' }
9
+ { title: 'foo bar', content: 'goodbye, world!', test: nil }
10
10
  ]
11
11
  end
12
12
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bm25f
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - catflip