bm25f 0.2.3 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/bm25f.rb +20 -9
- data/test/test_bm25f.rb +9 -8
- metadata +19 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9b141349387662e73de071841670357c61b61d775cb61501d84dc7ccc801e6de
|
4
|
+
data.tar.gz: ad7bb2c6223a5f125224b0a5d0afdb9c3950811c9c09cd1a39ae943b38a39eb9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3662bed23e2c1f041766b29ffe2f87d13ec18e107870c4948fc85851bbc1640a569a9f831f67f46bb8035577fc8a33654461bd2f4d2fe4e324ef3b302f66f665
|
7
|
+
data.tar.gz: e27b5d15af219b3dfc41e675015bacd5c5c1d930c0f70df844f8341fab0062670eb3690bdc41163ed628e2e90f340c1fb9814ce6dd9c30fc30daf77056b4b89a
|
data/lib/bm25f.rb
CHANGED
@@ -1,8 +1,7 @@
|
|
1
|
-
require 'treat'
|
1
|
+
require 'uea-stemmer'
|
2
|
+
require 'pragmatic_tokenizer'
|
2
3
|
|
3
4
|
class BM25F
|
4
|
-
include Treat::Core::DSL
|
5
|
-
|
6
5
|
# Initializes a BM25F model.
|
7
6
|
#
|
8
7
|
# @param term_freq_weight [Float] Weight for term frequency.
|
@@ -10,6 +9,9 @@ class BM25F
|
|
10
9
|
def initialize(term_freq_weight: 1.33, doc_length_weight: 0.8)
|
11
10
|
@term_freq_weight = term_freq_weight
|
12
11
|
@doc_length_weight = doc_length_weight
|
12
|
+
|
13
|
+
@tokenizer = PragmaticTokenizer::Tokenizer.new
|
14
|
+
@stemmer = UEAStemmer.new
|
13
15
|
end
|
14
16
|
|
15
17
|
# Fits the model to a set of documents.
|
@@ -56,7 +58,7 @@ class BM25F
|
|
56
58
|
documents.each do |k, v|
|
57
59
|
next unless v.instance_of? String
|
58
60
|
|
59
|
-
documents[k] =
|
61
|
+
documents[k] = v.map { |t| @stemmer.stem t }.join(' ')
|
60
62
|
end
|
61
63
|
documents
|
62
64
|
end
|
@@ -88,7 +90,7 @@ class BM25F
|
|
88
90
|
def calculate_idf
|
89
91
|
idf = {}
|
90
92
|
@field_weights.each_key do |field|
|
91
|
-
field_doc_count = @documents.count { |doc| !doc[field]
|
93
|
+
field_doc_count = @documents.count { |doc| !doc[field]&.empty? }
|
92
94
|
idf[field] = Math.log((@total_docs - field_doc_count + 0.5) / (field_doc_count + 0.5) + 1.0)
|
93
95
|
end
|
94
96
|
idf
|
@@ -99,7 +101,7 @@ class BM25F
|
|
99
101
|
# @param query [String] The query to preprocess.
|
100
102
|
# @return [Array<String>] An array of preprocessed query terms.
|
101
103
|
def preprocess_query(query)
|
102
|
-
|
104
|
+
@tokenizer.tokenize(query).map { |t| @stemmer.stem t }
|
103
105
|
end
|
104
106
|
|
105
107
|
# Calculates the score of a document using an array of query terms.
|
@@ -114,7 +116,10 @@ class BM25F
|
|
114
116
|
tf = field_term_frequency(field, term, doc_id)
|
115
117
|
idf = @idf[field]
|
116
118
|
field_length_norm = field_length_norm(field, doc_id)
|
117
|
-
|
119
|
+
val = @field_weights[field] * ((tf * (@term_freq_weight + 1)) / (tf + @term_freq_weight * field_length_norm) * idf)
|
120
|
+
val = 0 if val.nan?
|
121
|
+
|
122
|
+
doc_score += val
|
118
123
|
end
|
119
124
|
end
|
120
125
|
doc_score
|
@@ -127,7 +132,10 @@ class BM25F
|
|
127
132
|
# @param doc_id [Integer] The document ID.
|
128
133
|
# @return [Integer] The term frequency.
|
129
134
|
def field_term_frequency(field, term, doc_id)
|
130
|
-
@documents[doc_id][field]
|
135
|
+
val = @documents[doc_id][field]
|
136
|
+
return 0 if val.nil?
|
137
|
+
|
138
|
+
val.scan(term).count
|
131
139
|
end
|
132
140
|
|
133
141
|
# Calculates the field length normalization factor of a document.
|
@@ -136,6 +144,9 @@ class BM25F
|
|
136
144
|
# @param doc_id [Integer] The document ID.
|
137
145
|
# @return [Float] The field length normalization factor.
|
138
146
|
def field_length_norm(field, doc_id)
|
139
|
-
|
147
|
+
val = @doc_lengths[doc_id][field]
|
148
|
+
return 0 if val.nil?
|
149
|
+
|
150
|
+
1.0 - @doc_length_weight + @doc_length_weight * (val / @avg_doc_length)
|
140
151
|
end
|
141
152
|
end
|
data/test/test_bm25f.rb
CHANGED
@@ -5,19 +5,20 @@ class BM25FTest < Minitest::Test
|
|
5
5
|
def setup
|
6
6
|
@bm25f = BM25F.new
|
7
7
|
@documents = [
|
8
|
-
{
|
9
|
-
|
8
|
+
{ url: 'https://wikimedia.org', title: 'Wikimedia',
|
9
|
+
content: 'Wikimedia. Wikimedia is a global movement whose mission is to bring free educational content to the world. Through various projects, chapters and the support structure of the ...' },
|
10
|
+
{ url: 'https://twitter.com/Wikipedia', title: 'Wikipedia (@Wikipedia) · Twitter', content: nil },
|
11
|
+
{ url: 'https://play.google.com/store/apps/details', title: 'Wikipedia - Apps on Google Play',
|
12
|
+
content: 'The best Wikipedia experience on your Mobile device. Ad-free and free of charge, forever. With the official Wikipedia app, you can search and explore 40+ ...' },
|
13
|
+
{ url: 'https://www.wikipedia.org', title: 'Wikipedia',
|
14
|
+
content: 'Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation.' }
|
10
15
|
]
|
11
16
|
end
|
12
17
|
|
13
18
|
def test_score
|
14
19
|
@bm25f.fit @documents
|
15
|
-
scores = @bm25f.score '
|
20
|
+
scores = @bm25f.score 'wikipedia'
|
16
21
|
|
17
|
-
|
18
|
-
scores = scores.to_a.sort_by { |_, v| v.to_i }
|
19
|
-
|
20
|
-
# Checks if the most matching element is the first element
|
21
|
-
assert scores.last[0].zero?
|
22
|
+
puts scores.inspect
|
22
23
|
end
|
23
24
|
end
|
metadata
CHANGED
@@ -1,29 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bm25f
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- catflip
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-09-
|
11
|
+
date: 2023-09-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: pragmatic_tokenizer
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 3.2.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 3.2.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: uea-stemmer
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.10.3
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.10.3
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: rake
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|