bm25f 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bm25f.rb +13 -4
- data/test/test_bm25f.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7c7af39f073c1fc55d42199f9c882caa5adf672d5c94c715fe379c58cd4a4a19
|
4
|
+
data.tar.gz: 1ebb197c89da4b70780d3631658a1bb3d0ce5c244abe8b5875c41ee1632f63a3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 816270030afd15716f8b89f4c74d5c11dbc32f169c7a10a1eca931782e21e320017aa95bab66e32c6b500d407922e289edce8db396c956a62c245871744bb79b
|
7
|
+
data.tar.gz: 0ef709327eaa9c5b14e1132bafc199c4d66759b509c85d8bf3dcb23ba5b8dbe526b698c0ab1051486741fa3a3d4d21d073f8d4e3d12ced9128ec16fedd6a39e4
|
data/lib/bm25f.rb
CHANGED
@@ -88,7 +88,7 @@ class BM25F
|
|
88
88
|
def calculate_idf
|
89
89
|
idf = {}
|
90
90
|
@field_weights.each_key do |field|
|
91
|
-
field_doc_count = @documents.count { |doc| !doc[field].empty? }
|
91
|
+
field_doc_count = @documents.count { |doc| !doc[field]&.empty? }
|
92
92
|
idf[field] = Math.log((@total_docs - field_doc_count + 0.5) / (field_doc_count + 0.5) + 1.0)
|
93
93
|
end
|
94
94
|
idf
|
@@ -114,7 +114,10 @@ class BM25F
|
|
114
114
|
tf = field_term_frequency(field, term, doc_id)
|
115
115
|
idf = @idf[field]
|
116
116
|
field_length_norm = field_length_norm(field, doc_id)
|
117
|
-
|
117
|
+
val = @field_weights[field] * ((tf * (@term_freq_weight + 1)) / (tf + @term_freq_weight * field_length_norm) * idf)
|
118
|
+
val = 0 if val.nan?
|
119
|
+
|
120
|
+
doc_score += val
|
118
121
|
end
|
119
122
|
end
|
120
123
|
doc_score
|
@@ -127,7 +130,10 @@ class BM25F
|
|
127
130
|
# @param doc_id [Integer] The document ID.
|
128
131
|
# @return [Integer] The term frequency.
|
129
132
|
def field_term_frequency(field, term, doc_id)
|
130
|
-
@documents[doc_id][field].scan(term).count
|
133
|
+
val = @documents[doc_id][field]
|
134
|
+
return 0 if val.nil?
|
135
|
+
|
136
|
+
val.scan(term).count
|
131
137
|
end
|
132
138
|
|
133
139
|
# Calculates the field length normalization factor of a document.
|
@@ -136,6 +142,9 @@ class BM25F
|
|
136
142
|
# @param doc_id [Integer] The document ID.
|
137
143
|
# @return [Float] The field length normalization factor.
|
138
144
|
def field_length_norm(field, doc_id)
|
139
|
-
|
145
|
+
val = @doc_lengths[doc_id][field]
|
146
|
+
return 0 if val.nil?
|
147
|
+
|
148
|
+
1.0 - @doc_length_weight + @doc_length_weight * (val / @avg_doc_length)
|
140
149
|
end
|
141
150
|
end
|
data/test/test_bm25f.rb
CHANGED