bm25f 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bm25f.rb +7 -5
- data/test/test_bm25f.rb +9 -8
- metadata +19 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9b141349387662e73de071841670357c61b61d775cb61501d84dc7ccc801e6de
|
4
|
+
data.tar.gz: ad7bb2c6223a5f125224b0a5d0afdb9c3950811c9c09cd1a39ae943b38a39eb9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3662bed23e2c1f041766b29ffe2f87d13ec18e107870c4948fc85851bbc1640a569a9f831f67f46bb8035577fc8a33654461bd2f4d2fe4e324ef3b302f66f665
|
7
|
+
data.tar.gz: e27b5d15af219b3dfc41e675015bacd5c5c1d930c0f70df844f8341fab0062670eb3690bdc41163ed628e2e90f340c1fb9814ce6dd9c30fc30daf77056b4b89a
|
data/lib/bm25f.rb
CHANGED
@@ -1,8 +1,7 @@
|
|
1
|
-
require '
|
1
|
+
require 'uea-stemmer'
|
2
|
+
require 'pragmatic_tokenizer'
|
2
3
|
|
3
4
|
class BM25F
|
4
|
-
include Treat::Core::DSL
|
5
|
-
|
6
5
|
# Initializes a BM25F model.
|
7
6
|
#
|
8
7
|
# @param term_freq_weight [Float] Weight for term frequency.
|
@@ -10,6 +9,9 @@ class BM25F
|
|
10
9
|
def initialize(term_freq_weight: 1.33, doc_length_weight: 0.8)
|
11
10
|
@term_freq_weight = term_freq_weight
|
12
11
|
@doc_length_weight = doc_length_weight
|
12
|
+
|
13
|
+
@tokenizer = PragmaticTokenizer::Tokenizer.new
|
14
|
+
@stemmer = UEAStemmer.new
|
13
15
|
end
|
14
16
|
|
15
17
|
# Fits the model to a set of documents.
|
@@ -56,7 +58,7 @@ class BM25F
|
|
56
58
|
documents.each do |k, v|
|
57
59
|
next unless v.instance_of? String
|
58
60
|
|
59
|
-
documents[k] =
|
61
|
+
documents[k] = v.map { |t| @stemmer.stem t }.oin(' ')
|
60
62
|
end
|
61
63
|
documents
|
62
64
|
end
|
@@ -99,7 +101,7 @@ class BM25F
|
|
99
101
|
# @param query [String] The query to preprocess.
|
100
102
|
# @return [Array<String>] An array of preprocessed query terms.
|
101
103
|
def preprocess_query(query)
|
102
|
-
|
104
|
+
@tokenizer.tokenize(query).map { |t| @stemmer.stem t }
|
103
105
|
end
|
104
106
|
|
105
107
|
# Calculates the score of a document using an array of query terms.
|
data/test/test_bm25f.rb
CHANGED
@@ -5,19 +5,20 @@ class BM25FTest < Minitest::Test
|
|
5
5
|
def setup
|
6
6
|
@bm25f = BM25F.new
|
7
7
|
@documents = [
|
8
|
-
{
|
9
|
-
|
8
|
+
{ url: 'https://wikimedia.org', title: 'Wikimedia',
|
9
|
+
content: 'Wikimedia. Wikimedia is a global movement whose mission is to bring free educational content to the world. Through various projects, chapters and the support structure of the ...' },
|
10
|
+
{ url: 'https://twitter.com/Wikipedia', title: 'Wikipedia (@Wikipedia) · Twitter', content: nil },
|
11
|
+
{ url: 'https://play.google.com/store/apps/details', title: 'Wikipedia - Apps on Google Play',
|
12
|
+
content: 'The best Wikipedia experience on your Mobile device. Ad-free and free of charge, forever. With the official Wikipedia app, you can search and explore 40+ ...' },
|
13
|
+
{ url: 'https://www.wikipedia.org', title: 'Wikipedia',
|
14
|
+
content: 'Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation.' }
|
10
15
|
]
|
11
16
|
end
|
12
17
|
|
13
18
|
def test_score
|
14
19
|
@bm25f.fit @documents
|
15
|
-
scores = @bm25f.score '
|
20
|
+
scores = @bm25f.score 'wikipedia'
|
16
21
|
|
17
|
-
|
18
|
-
scores = scores.to_a.sort_by { |_, v| v.to_i }
|
19
|
-
|
20
|
-
# Checks if the most matching element is the first element
|
21
|
-
assert scores.last[0].zero?
|
22
|
+
puts scores.inspect
|
22
23
|
end
|
23
24
|
end
|
metadata
CHANGED
@@ -1,29 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bm25f
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- catflip
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-09-
|
11
|
+
date: 2023-09-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: pragmatic_tokenizer
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 3.2.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 3.2.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: uea-stemmer
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.10.3
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.10.3
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: rake
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|