bm25f 0.2.3 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/lib/bm25f.rb +20 -9
  3. data/test/test_bm25f.rb +9 -8
  4. metadata +19 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b00f60fdde35d0565f169e9486d2935e5ddcef9bbd8084d60d9228364c9ebed7
4
- data.tar.gz: 7249983dda5a101fb1b2ddb646414b58776ae9a29feb4f563707e9a654d45094
3
+ metadata.gz: 9b141349387662e73de071841670357c61b61d775cb61501d84dc7ccc801e6de
4
+ data.tar.gz: ad7bb2c6223a5f125224b0a5d0afdb9c3950811c9c09cd1a39ae943b38a39eb9
5
5
  SHA512:
6
- metadata.gz: 16231fa9ba99f19cff2a6e9e801b734b53effa584da0bc8fcc80000774bbe61c75f1994bfbdc39b3cdac8d427b5049ff56dd1dbcc669eed3b836d910e45f5547
7
- data.tar.gz: eea02e6dc1989ff69baa6097997da274149083a87cbbc7355101e613103f09fa52886718422cf68b9cb9afadb2e4ed9ce54be8117cba536a99647db01acb24f0
6
+ metadata.gz: 3662bed23e2c1f041766b29ffe2f87d13ec18e107870c4948fc85851bbc1640a569a9f831f67f46bb8035577fc8a33654461bd2f4d2fe4e324ef3b302f66f665
7
+ data.tar.gz: e27b5d15af219b3dfc41e675015bacd5c5c1d930c0f70df844f8341fab0062670eb3690bdc41163ed628e2e90f340c1fb9814ce6dd9c30fc30daf77056b4b89a
data/lib/bm25f.rb CHANGED
@@ -1,8 +1,7 @@
1
- require 'treat'
1
+ require 'uea-stemmer'
2
+ require 'pragmatic_tokenizer'
2
3
 
3
4
  class BM25F
4
- include Treat::Core::DSL
5
-
6
5
  # Initializes a BM25F model.
7
6
  #
8
7
  # @param term_freq_weight [Float] Weight for term frequency.
@@ -10,6 +9,9 @@ class BM25F
10
9
  def initialize(term_freq_weight: 1.33, doc_length_weight: 0.8)
11
10
  @term_freq_weight = term_freq_weight
12
11
  @doc_length_weight = doc_length_weight
12
+
13
+ @tokenizer = PragmaticTokenizer::Tokenizer.new
14
+ @stemmer = UEAStemmer.new
13
15
  end
14
16
 
15
17
  # Fits the model to a set of documents.
@@ -56,7 +58,7 @@ class BM25F
56
58
  documents.each do |k, v|
57
59
  next unless v.instance_of? String
58
60
 
59
- documents[k] = sentence(v).map(&:stem).join(' ')
61
+ documents[k] = v.map { |t| @stemmer.stem t }.join(' ')
60
62
  end
61
63
  documents
62
64
  end
@@ -88,7 +90,7 @@ class BM25F
88
90
  def calculate_idf
89
91
  idf = {}
90
92
  @field_weights.each_key do |field|
91
- field_doc_count = @documents.count { |doc| !doc[field].empty? }
93
+ field_doc_count = @documents.count { |doc| !doc[field]&.empty? }
92
94
  idf[field] = Math.log((@total_docs - field_doc_count + 0.5) / (field_doc_count + 0.5) + 1.0)
93
95
  end
94
96
  idf
@@ -99,7 +101,7 @@ class BM25F
99
101
  # @param query [String] The query to preprocess.
100
102
  # @return [Array<String>] An array of preprocessed query terms.
101
103
  def preprocess_query(query)
102
- sentence(query).tokenize.map(&:stem)
104
+ @tokenizer.tokenize(query).map { |t| @stemmer.stem t }
103
105
  end
104
106
 
105
107
  # Calculates the score of a document using an array of query terms.
@@ -114,7 +116,10 @@ class BM25F
114
116
  tf = field_term_frequency(field, term, doc_id)
115
117
  idf = @idf[field]
116
118
  field_length_norm = field_length_norm(field, doc_id)
117
- doc_score += @field_weights[field] * ((tf * (@term_freq_weight + 1)) / (tf + @term_freq_weight * field_length_norm) * idf)
119
+ val = @field_weights[field] * ((tf * (@term_freq_weight + 1)) / (tf + @term_freq_weight * field_length_norm) * idf)
120
+ val = 0 if val.nan?
121
+
122
+ doc_score += val
118
123
  end
119
124
  end
120
125
  doc_score
@@ -127,7 +132,10 @@ class BM25F
127
132
  # @param doc_id [Integer] The document ID.
128
133
  # @return [Integer] The term frequency.
129
134
  def field_term_frequency(field, term, doc_id)
130
- @documents[doc_id][field].scan(term).count
135
+ val = @documents[doc_id][field]
136
+ return 0 if val.nil?
137
+
138
+ val.scan(term).count
131
139
  end
132
140
 
133
141
  # Calculates the field length normalization factor of a document.
@@ -136,6 +144,9 @@ class BM25F
136
144
  # @param doc_id [Integer] The document ID.
137
145
  # @return [Float] The field length normalization factor.
138
146
  def field_length_norm(field, doc_id)
139
- 1.0 - @doc_length_weight + @doc_length_weight * (@doc_lengths[doc_id][field] / @avg_doc_length)
147
+ val = @doc_lengths[doc_id][field]
148
+ return 0 if val.nil?
149
+
150
+ 1.0 - @doc_length_weight + @doc_length_weight * (val / @avg_doc_length)
140
151
  end
141
152
  end
data/test/test_bm25f.rb CHANGED
@@ -5,19 +5,20 @@ class BM25FTest < Minitest::Test
5
5
  def setup
6
6
  @bm25f = BM25F.new
7
7
  @documents = [
8
- { title: 'hello world', content: 'foo bar baz' },
9
- { title: 'foo bar', content: 'goodbye, world!' }
8
+ { url: 'https://wikimedia.org', title: 'Wikimedia',
9
+ content: 'Wikimedia. Wikimedia is a global movement whose mission is to bring free educational content to the world. Through various projects, chapters and the support structure of the ...' },
10
+ { url: 'https://twitter.com/Wikipedia', title: 'Wikipedia (@Wikipedia) · Twitter', content: nil },
11
+ { url: 'https://play.google.com/store/apps/details', title: 'Wikipedia - Apps on Google Play',
12
+ content: 'The best Wikipedia experience on your Mobile device. Ad-free and free of charge, forever. With the official Wikipedia app, you can search and explore 40+ ...' },
13
+ { url: 'https://www.wikipedia.org', title: 'Wikipedia',
14
+ content: 'Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation.' }
10
15
  ]
11
16
  end
12
17
 
13
18
  def test_score
14
19
  @bm25f.fit @documents
15
- scores = @bm25f.score 'hello world foo bar baz'
20
+ scores = @bm25f.score 'wikipedia'
16
21
 
17
- # Sort
18
- scores = scores.to_a.sort_by { |_, v| v.to_i }
19
-
20
- # Checks if the most matching element is the first element
21
- assert scores.last[0].zero?
22
+ puts scores.inspect
22
23
  end
23
24
  end
metadata CHANGED
@@ -1,29 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bm25f
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - catflip
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-09-10 00:00:00.000000000 Z
11
+ date: 2023-09-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: treat
14
+ name: pragmatic_tokenizer
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '2.1'
19
+ version: 3.2.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '2.1'
26
+ version: 3.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: uea-stemmer
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.10.3
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.10.3
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: rake
29
43
  requirement: !ruby/object:Gem::Requirement