bm25f 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/bm25f.rb +13 -4
- data/test/test_bm25f.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7c7af39f073c1fc55d42199f9c882caa5adf672d5c94c715fe379c58cd4a4a19
|
4
|
+
data.tar.gz: 1ebb197c89da4b70780d3631658a1bb3d0ce5c244abe8b5875c41ee1632f63a3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 816270030afd15716f8b89f4c74d5c11dbc32f169c7a10a1eca931782e21e320017aa95bab66e32c6b500d407922e289edce8db396c956a62c245871744bb79b
|
7
|
+
data.tar.gz: 0ef709327eaa9c5b14e1132bafc199c4d66759b509c85d8bf3dcb23ba5b8dbe526b698c0ab1051486741fa3a3d4d21d073f8d4e3d12ced9128ec16fedd6a39e4
|
data/lib/bm25f.rb
CHANGED
@@ -88,7 +88,7 @@ class BM25F
|
|
88
88
|
def calculate_idf
|
89
89
|
idf = {}
|
90
90
|
@field_weights.each_key do |field|
|
91
|
-
field_doc_count = @documents.count { |doc| !doc[field]
|
91
|
+
field_doc_count = @documents.count { |doc| !doc[field]&.empty? }
|
92
92
|
idf[field] = Math.log((@total_docs - field_doc_count + 0.5) / (field_doc_count + 0.5) + 1.0)
|
93
93
|
end
|
94
94
|
idf
|
@@ -114,7 +114,10 @@ class BM25F
|
|
114
114
|
tf = field_term_frequency(field, term, doc_id)
|
115
115
|
idf = @idf[field]
|
116
116
|
field_length_norm = field_length_norm(field, doc_id)
|
117
|
-
|
117
|
+
val = @field_weights[field] * ((tf * (@term_freq_weight + 1)) / (tf + @term_freq_weight * field_length_norm) * idf)
|
118
|
+
val = 0 if val.nan?
|
119
|
+
|
120
|
+
doc_score += val
|
118
121
|
end
|
119
122
|
end
|
120
123
|
doc_score
|
@@ -127,7 +130,10 @@ class BM25F
|
|
127
130
|
# @param doc_id [Integer] The document ID.
|
128
131
|
# @return [Integer] The term frequency.
|
129
132
|
def field_term_frequency(field, term, doc_id)
|
130
|
-
@documents[doc_id][field]
|
133
|
+
val = @documents[doc_id][field]
|
134
|
+
return 0 if val.nil?
|
135
|
+
|
136
|
+
val.scan(term).count
|
131
137
|
end
|
132
138
|
|
133
139
|
# Calculates the field length normalization factor of a document.
|
@@ -136,6 +142,9 @@ class BM25F
|
|
136
142
|
# @param doc_id [Integer] The document ID.
|
137
143
|
# @return [Float] The field length normalization factor.
|
138
144
|
def field_length_norm(field, doc_id)
|
139
|
-
|
145
|
+
val = @doc_lengths[doc_id][field]
|
146
|
+
return 0 if val.nil?
|
147
|
+
|
148
|
+
1.0 - @doc_length_weight + @doc_length_weight * (val / @avg_doc_length)
|
140
149
|
end
|
141
150
|
end
|
data/test/test_bm25f.rb
CHANGED