bm25f 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bm25f.rb +20 -9
- data/test/test_bm25f.rb +9 -8
- metadata +19 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9b141349387662e73de071841670357c61b61d775cb61501d84dc7ccc801e6de
|
4
|
+
data.tar.gz: ad7bb2c6223a5f125224b0a5d0afdb9c3950811c9c09cd1a39ae943b38a39eb9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3662bed23e2c1f041766b29ffe2f87d13ec18e107870c4948fc85851bbc1640a569a9f831f67f46bb8035577fc8a33654461bd2f4d2fe4e324ef3b302f66f665
|
7
|
+
data.tar.gz: e27b5d15af219b3dfc41e675015bacd5c5c1d930c0f70df844f8341fab0062670eb3690bdc41163ed628e2e90f340c1fb9814ce6dd9c30fc30daf77056b4b89a
|
data/lib/bm25f.rb
CHANGED
@@ -1,8 +1,7 @@
|
|
1
|
-
require '
|
1
|
+
require 'uea-stemmer'
|
2
|
+
require 'pragmatic_tokenizer'
|
2
3
|
|
3
4
|
class BM25F
|
4
|
-
include Treat::Core::DSL
|
5
|
-
|
6
5
|
# Initializes a BM25F model.
|
7
6
|
#
|
8
7
|
# @param term_freq_weight [Float] Weight for term frequency.
|
@@ -10,6 +9,9 @@ class BM25F
|
|
10
9
|
def initialize(term_freq_weight: 1.33, doc_length_weight: 0.8)
|
11
10
|
@term_freq_weight = term_freq_weight
|
12
11
|
@doc_length_weight = doc_length_weight
|
12
|
+
|
13
|
+
@tokenizer = PragmaticTokenizer::Tokenizer.new
|
14
|
+
@stemmer = UEAStemmer.new
|
13
15
|
end
|
14
16
|
|
15
17
|
# Fits the model to a set of documents.
|
@@ -56,7 +58,7 @@ class BM25F
|
|
56
58
|
documents.each do |k, v|
|
57
59
|
next unless v.instance_of? String
|
58
60
|
|
59
|
-
documents[k] =
|
61
|
+
documents[k] = @tokenizer.tokenize(v).map { |t| @stemmer.stem t }.join(' ')
|
60
62
|
end
|
61
63
|
documents
|
62
64
|
end
|
@@ -88,7 +90,7 @@ class BM25F
|
|
88
90
|
def calculate_idf
|
89
91
|
idf = {}
|
90
92
|
@field_weights.each_key do |field|
|
91
|
-
field_doc_count = @documents.count { |doc| !doc[field]
|
93
|
+
field_doc_count = @documents.count { |doc| !doc[field]&.empty? }
|
92
94
|
idf[field] = Math.log((@total_docs - field_doc_count + 0.5) / (field_doc_count + 0.5) + 1.0)
|
93
95
|
end
|
94
96
|
idf
|
@@ -99,7 +101,7 @@ class BM25F
|
|
99
101
|
# @param query [String] The query to preprocess.
|
100
102
|
# @return [Array<String>] An array of preprocessed query terms.
|
101
103
|
def preprocess_query(query)
|
102
|
-
|
104
|
+
@tokenizer.tokenize(query).map { |t| @stemmer.stem t }
|
103
105
|
end
|
104
106
|
|
105
107
|
# Calculates the score of a document using an array of query terms.
|
@@ -114,7 +116,10 @@ class BM25F
|
|
114
116
|
tf = field_term_frequency(field, term, doc_id)
|
115
117
|
idf = @idf[field]
|
116
118
|
field_length_norm = field_length_norm(field, doc_id)
|
117
|
-
|
119
|
+
val = @field_weights[field] * ((tf * (@term_freq_weight + 1)) / (tf + @term_freq_weight * field_length_norm) * idf)
|
120
|
+
val = 0 if val.nan?
|
121
|
+
|
122
|
+
doc_score += val
|
118
123
|
end
|
119
124
|
end
|
120
125
|
doc_score
|
@@ -127,7 +132,10 @@ class BM25F
|
|
127
132
|
# @param doc_id [Integer] The document ID.
|
128
133
|
# @return [Integer] The term frequency.
|
129
134
|
def field_term_frequency(field, term, doc_id)
|
130
|
-
@documents[doc_id][field]
|
135
|
+
val = @documents[doc_id][field]
|
136
|
+
return 0 if val.nil?
|
137
|
+
|
138
|
+
val.scan(term).count
|
131
139
|
end
|
132
140
|
|
133
141
|
# Calculates the field length normalization factor of a document.
|
@@ -136,6 +144,9 @@ class BM25F
|
|
136
144
|
# @param doc_id [Integer] The document ID.
|
137
145
|
# @return [Float] The field length normalization factor.
|
138
146
|
def field_length_norm(field, doc_id)
|
139
|
-
|
147
|
+
val = @doc_lengths[doc_id][field]
|
148
|
+
return 0 if val.nil?
|
149
|
+
|
150
|
+
1.0 - @doc_length_weight + @doc_length_weight * (val / @avg_doc_length)
|
140
151
|
end
|
141
152
|
end
|
data/test/test_bm25f.rb
CHANGED
@@ -5,19 +5,20 @@ class BM25FTest < Minitest::Test
|
|
5
5
|
def setup
|
6
6
|
@bm25f = BM25F.new
|
7
7
|
@documents = [
|
8
|
-
{
|
9
|
-
|
8
|
+
{ url: 'https://wikimedia.org', title: 'Wikimedia',
|
9
|
+
content: 'Wikimedia. Wikimedia is a global movement whose mission is to bring free educational content to the world. Through various projects, chapters and the support structure of the ...' },
|
10
|
+
{ url: 'https://twitter.com/Wikipedia', title: 'Wikipedia (@Wikipedia) · Twitter', content: nil },
|
11
|
+
{ url: 'https://play.google.com/store/apps/details', title: 'Wikipedia - Apps on Google Play',
|
12
|
+
content: 'The best Wikipedia experience on your Mobile device. Ad-free and free of charge, forever. With the official Wikipedia app, you can search and explore 40+ ...' },
|
13
|
+
{ url: 'https://www.wikipedia.org', title: 'Wikipedia',
|
14
|
+
content: 'Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation.' }
|
10
15
|
]
|
11
16
|
end
|
12
17
|
|
13
18
|
def test_score
|
14
19
|
@bm25f.fit @documents
|
15
|
-
scores = @bm25f.score '
|
20
|
+
scores = @bm25f.score 'wikipedia'
|
16
21
|
|
17
|
-
|
18
|
-
scores = scores.to_a.sort_by { |_, v| v.to_i }
|
19
|
-
|
20
|
-
# Checks if the most matching element is the first element
|
21
|
-
assert scores.last[0].zero?
|
22
|
+
puts scores.inspect
|
22
23
|
end
|
23
24
|
end
|
metadata
CHANGED
@@ -1,29 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bm25f
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- catflip
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-09-
|
11
|
+
date: 2023-09-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: pragmatic_tokenizer
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 3.2.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 3.2.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: uea-stemmer
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.10.3
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.10.3
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: rake
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|