bm25f 0.2.2 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/bm25f.rb +14 -5
  3. data/test/test_bm25f.rb +1 -1
  4. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bc84b7827c64bd77694f548ac479113906f4a5173b50fab2c479079beec3bc41
4
- data.tar.gz: 8adf6005477365e0b827e93b67ae8b4bbc99c85db73e1b274005ce043659d690
3
+ metadata.gz: 7c7af39f073c1fc55d42199f9c882caa5adf672d5c94c715fe379c58cd4a4a19
4
+ data.tar.gz: 1ebb197c89da4b70780d3631658a1bb3d0ce5c244abe8b5875c41ee1632f63a3
5
5
  SHA512:
6
- metadata.gz: 17f1f1e79d4fa265610c85dbc52ae949de236252d96796beca5a08c80c4035947b56dc04607ca00aa5e9e2eaac41cb9f5a645f78415c66c1b91aaa5a77c25e60
7
- data.tar.gz: fbc526d3080a2672b41da06b78c7df363138bbc62df91bc8c065133ec2e807432c19ea108fdc14fb3bb2852973bfefdb18d0a0f98a4d4a7742eac23a1d7afdbb
6
+ metadata.gz: 816270030afd15716f8b89f4c74d5c11dbc32f169c7a10a1eca931782e21e320017aa95bab66e32c6b500d407922e289edce8db396c956a62c245871744bb79b
7
+ data.tar.gz: 0ef709327eaa9c5b14e1132bafc199c4d66759b509c85d8bf3dcb23ba5b8dbe526b698c0ab1051486741fa3a3d4d21d073f8d4e3d12ced9128ec16fedd6a39e4
data/lib/bm25f.rb CHANGED
@@ -77,7 +77,7 @@ class BM25F
77
77
  def calculate_document_lengths(documents)
78
78
  doc_lengths = {}
79
79
  documents.each_with_index do |doc, i|
80
- doc_lengths[i] = doc.transform_values { |v| v.nil ? 0 : v.length }
80
+ doc_lengths[i] = doc.transform_values { |v| v.nil? ? 0 : v.length }
81
81
  end
82
82
  doc_lengths
83
83
  end
@@ -88,7 +88,7 @@ class BM25F
88
88
  def calculate_idf
89
89
  idf = {}
90
90
  @field_weights.each_key do |field|
91
- field_doc_count = @documents.count { |doc| !doc[field].empty? }
91
+ field_doc_count = @documents.count { |doc| !doc[field]&.empty? }
92
92
  idf[field] = Math.log((@total_docs - field_doc_count + 0.5) / (field_doc_count + 0.5) + 1.0)
93
93
  end
94
94
  idf
@@ -114,7 +114,10 @@ class BM25F
114
114
  tf = field_term_frequency(field, term, doc_id)
115
115
  idf = @idf[field]
116
116
  field_length_norm = field_length_norm(field, doc_id)
117
- doc_score += @field_weights[field] * ((tf * (@term_freq_weight + 1)) / (tf + @term_freq_weight * field_length_norm) * idf)
117
+ val = @field_weights[field] * ((tf * (@term_freq_weight + 1)) / (tf + @term_freq_weight * field_length_norm) * idf)
118
+ val = 0 if val.nan?
119
+
120
+ doc_score += val
118
121
  end
119
122
  end
120
123
  doc_score
@@ -127,7 +130,10 @@ class BM25F
127
130
  # @param doc_id [Integer] The document ID.
128
131
  # @return [Integer] The term frequency.
129
132
  def field_term_frequency(field, term, doc_id)
130
- @documents[doc_id][field].scan(term).count
133
+ val = @documents[doc_id][field]
134
+ return 0 if val.nil?
135
+
136
+ val.scan(term).count
131
137
  end
132
138
 
133
139
  # Calculates the field length normalization factor of a document.
@@ -136,6 +142,9 @@ class BM25F
136
142
  # @param doc_id [Integer] The document ID.
137
143
  # @return [Float] The field length normalization factor.
138
144
  def field_length_norm(field, doc_id)
139
- 1.0 - @doc_length_weight + @doc_length_weight * (@doc_lengths[doc_id][field] / @avg_doc_length)
145
+ val = @doc_lengths[doc_id][field]
146
+ return 0 if val.nil?
147
+
148
+ 1.0 - @doc_length_weight + @doc_length_weight * (val / @avg_doc_length)
140
149
  end
141
150
  end
data/test/test_bm25f.rb CHANGED
@@ -6,7 +6,7 @@ class BM25FTest < Minitest::Test
6
6
  @bm25f = BM25F.new
7
7
  @documents = [
8
8
  { title: 'hello world', content: 'foo bar baz' },
9
- { title: 'foo bar', content: 'goodbye, world!' }
9
+ { title: 'foo bar', content: 'goodbye, world!', test: nil }
10
10
  ]
11
11
  end
12
12
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bm25f
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - catflip