bm25f 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/lib/bm25f.rb +20 -9
  3. data/test/test_bm25f.rb +9 -8
  4. metadata +19 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b00f60fdde35d0565f169e9486d2935e5ddcef9bbd8084d60d9228364c9ebed7
4
- data.tar.gz: 7249983dda5a101fb1b2ddb646414b58776ae9a29feb4f563707e9a654d45094
3
+ metadata.gz: 9b141349387662e73de071841670357c61b61d775cb61501d84dc7ccc801e6de
4
+ data.tar.gz: ad7bb2c6223a5f125224b0a5d0afdb9c3950811c9c09cd1a39ae943b38a39eb9
5
5
  SHA512:
6
- metadata.gz: 16231fa9ba99f19cff2a6e9e801b734b53effa584da0bc8fcc80000774bbe61c75f1994bfbdc39b3cdac8d427b5049ff56dd1dbcc669eed3b836d910e45f5547
7
- data.tar.gz: eea02e6dc1989ff69baa6097997da274149083a87cbbc7355101e613103f09fa52886718422cf68b9cb9afadb2e4ed9ce54be8117cba536a99647db01acb24f0
6
+ metadata.gz: 3662bed23e2c1f041766b29ffe2f87d13ec18e107870c4948fc85851bbc1640a569a9f831f67f46bb8035577fc8a33654461bd2f4d2fe4e324ef3b302f66f665
7
+ data.tar.gz: e27b5d15af219b3dfc41e675015bacd5c5c1d930c0f70df844f8341fab0062670eb3690bdc41163ed628e2e90f340c1fb9814ce6dd9c30fc30daf77056b4b89a
data/lib/bm25f.rb CHANGED
@@ -1,8 +1,7 @@
1
- require 'treat'
1
+ require 'uea-stemmer'
2
+ require 'pragmatic_tokenizer'
2
3
 
3
4
  class BM25F
4
- include Treat::Core::DSL
5
-
6
5
  # Initializes a BM25F model.
7
6
  #
8
7
  # @param term_freq_weight [Float] Weight for term frequency.
@@ -10,6 +9,9 @@ class BM25F
10
9
  def initialize(term_freq_weight: 1.33, doc_length_weight: 0.8)
11
10
  @term_freq_weight = term_freq_weight
12
11
  @doc_length_weight = doc_length_weight
12
+
13
+ @tokenizer = PragmaticTokenizer::Tokenizer.new
14
+ @stemmer = UEAStemmer.new
13
15
  end
14
16
 
15
17
  # Fits the model to a set of documents.
@@ -56,7 +58,7 @@ class BM25F
56
58
  documents.each do |k, v|
57
59
  next unless v.instance_of? String
58
60
 
59
- documents[k] = sentence(v).map(&:stem).join(' ')
61
+ documents[k] = v.map { |t| @stemmer.stem t }.join(' ')
60
62
  end
61
63
  documents
62
64
  end
@@ -88,7 +90,7 @@ class BM25F
88
90
  def calculate_idf
89
91
  idf = {}
90
92
  @field_weights.each_key do |field|
91
- field_doc_count = @documents.count { |doc| !doc[field].empty? }
93
+ field_doc_count = @documents.count { |doc| !doc[field]&.empty? }
92
94
  idf[field] = Math.log((@total_docs - field_doc_count + 0.5) / (field_doc_count + 0.5) + 1.0)
93
95
  end
94
96
  idf
@@ -99,7 +101,7 @@ class BM25F
99
101
  # @param query [String] The query to preprocess.
100
102
  # @return [Array<String>] An array of preprocessed query terms.
101
103
  def preprocess_query(query)
102
- sentence(query).tokenize.map(&:stem)
104
+ @tokenizer.tokenize(query).map { |t| @stemmer.stem t }
103
105
  end
104
106
 
105
107
  # Calculates the score of a document using an array of query terms.
@@ -114,7 +116,10 @@ class BM25F
114
116
  tf = field_term_frequency(field, term, doc_id)
115
117
  idf = @idf[field]
116
118
  field_length_norm = field_length_norm(field, doc_id)
117
- doc_score += @field_weights[field] * ((tf * (@term_freq_weight + 1)) / (tf + @term_freq_weight * field_length_norm) * idf)
119
+ val = @field_weights[field] * ((tf * (@term_freq_weight + 1)) / (tf + @term_freq_weight * field_length_norm) * idf)
120
+ val = 0 if val.nan?
121
+
122
+ doc_score += val
118
123
  end
119
124
  end
120
125
  doc_score
@@ -127,7 +132,10 @@ class BM25F
127
132
  # @param doc_id [Integer] The document ID.
128
133
  # @return [Integer] The term frequency.
129
134
  def field_term_frequency(field, term, doc_id)
130
- @documents[doc_id][field].scan(term).count
135
+ val = @documents[doc_id][field]
136
+ return 0 if val.nil?
137
+
138
+ val.scan(term).count
131
139
  end
132
140
 
133
141
  # Calculates the field length normalization factor of a document.
@@ -136,6 +144,9 @@ class BM25F
136
144
  # @param doc_id [Integer] The document ID.
137
145
  # @return [Float] The field length normalization factor.
138
146
  def field_length_norm(field, doc_id)
139
- 1.0 - @doc_length_weight + @doc_length_weight * (@doc_lengths[doc_id][field] / @avg_doc_length)
147
+ val = @doc_lengths[doc_id][field]
148
+ return 0 if val.nil?
149
+
150
+ 1.0 - @doc_length_weight + @doc_length_weight * (val / @avg_doc_length)
140
151
  end
141
152
  end
data/test/test_bm25f.rb CHANGED
@@ -5,19 +5,20 @@ class BM25FTest < Minitest::Test
5
5
  def setup
6
6
  @bm25f = BM25F.new
7
7
  @documents = [
8
- { title: 'hello world', content: 'foo bar baz' },
9
- { title: 'foo bar', content: 'goodbye, world!' }
8
+ { url: 'https://wikimedia.org', title: 'Wikimedia',
9
+ content: 'Wikimedia. Wikimedia is a global movement whose mission is to bring free educational content to the world. Through various projects, chapters and the support structure of the ...' },
10
+ { url: 'https://twitter.com/Wikipedia', title: 'Wikipedia (@Wikipedia) · Twitter', content: nil },
11
+ { url: 'https://play.google.com/store/apps/details', title: 'Wikipedia - Apps on Google Play',
12
+ content: 'The best Wikipedia experience on your Mobile device. Ad-free and free of charge, forever. With the official Wikipedia app, you can search and explore 40+ ...' },
13
+ { url: 'https://www.wikipedia.org', title: 'Wikipedia',
14
+ content: 'Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation.' }
10
15
  ]
11
16
  end
12
17
 
13
18
  def test_score
14
19
  @bm25f.fit @documents
15
- scores = @bm25f.score 'hello world foo bar baz'
20
+ scores = @bm25f.score 'wikipedia'
16
21
 
17
- # Sort
18
- scores = scores.to_a.sort_by { |_, v| v.to_i }
19
-
20
- # Checks if the most matching element is the first element
21
- assert scores.last[0].zero?
22
+ puts scores.inspect
22
23
  end
23
24
  end
metadata CHANGED
@@ -1,29 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bm25f
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - catflip
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-09-10 00:00:00.000000000 Z
11
+ date: 2023-09-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: treat
14
+ name: pragmatic_tokenizer
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '2.1'
19
+ version: 3.2.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '2.1'
26
+ version: 3.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: uea-stemmer
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.10.3
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.10.3
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: rake
29
43
  requirement: !ruby/object:Gem::Requirement