bm25f 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/lib/bm25f.rb +7 -5
  3. data/test/test_bm25f.rb +9 -8
  4. metadata +19 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7c7af39f073c1fc55d42199f9c882caa5adf672d5c94c715fe379c58cd4a4a19
4
- data.tar.gz: 1ebb197c89da4b70780d3631658a1bb3d0ce5c244abe8b5875c41ee1632f63a3
3
+ metadata.gz: 9b141349387662e73de071841670357c61b61d775cb61501d84dc7ccc801e6de
4
+ data.tar.gz: ad7bb2c6223a5f125224b0a5d0afdb9c3950811c9c09cd1a39ae943b38a39eb9
5
5
  SHA512:
6
- metadata.gz: 816270030afd15716f8b89f4c74d5c11dbc32f169c7a10a1eca931782e21e320017aa95bab66e32c6b500d407922e289edce8db396c956a62c245871744bb79b
7
- data.tar.gz: 0ef709327eaa9c5b14e1132bafc199c4d66759b509c85d8bf3dcb23ba5b8dbe526b698c0ab1051486741fa3a3d4d21d073f8d4e3d12ced9128ec16fedd6a39e4
6
+ metadata.gz: 3662bed23e2c1f041766b29ffe2f87d13ec18e107870c4948fc85851bbc1640a569a9f831f67f46bb8035577fc8a33654461bd2f4d2fe4e324ef3b302f66f665
7
+ data.tar.gz: e27b5d15af219b3dfc41e675015bacd5c5c1d930c0f70df844f8341fab0062670eb3690bdc41163ed628e2e90f340c1fb9814ce6dd9c30fc30daf77056b4b89a
data/lib/bm25f.rb CHANGED
@@ -1,8 +1,7 @@
1
- require 'treat'
1
+ require 'uea-stemmer'
2
+ require 'pragmatic_tokenizer'
2
3
 
3
4
  class BM25F
4
- include Treat::Core::DSL
5
-
6
5
  # Initializes a BM25F model.
7
6
  #
8
7
  # @param term_freq_weight [Float] Weight for term frequency.
@@ -10,6 +9,9 @@ class BM25F
10
9
  def initialize(term_freq_weight: 1.33, doc_length_weight: 0.8)
11
10
  @term_freq_weight = term_freq_weight
12
11
  @doc_length_weight = doc_length_weight
12
+
13
+ @tokenizer = PragmaticTokenizer::Tokenizer.new
14
+ @stemmer = UEAStemmer.new
13
15
  end
14
16
 
15
17
  # Fits the model to a set of documents.
@@ -56,7 +58,7 @@ class BM25F
56
58
  documents.each do |k, v|
57
59
  next unless v.instance_of? String
58
60
 
59
- documents[k] = sentence(v).map(&:stem).join(' ')
61
+ documents[k] = @tokenizer.tokenize(v).map { |t| @stemmer.stem t }.join(' ')
60
62
  end
61
63
  documents
62
64
  end
@@ -99,7 +101,7 @@ class BM25F
99
101
  # @param query [String] The query to preprocess.
100
102
  # @return [Array<String>] An array of preprocessed query terms.
101
103
  def preprocess_query(query)
102
- sentence(query).tokenize.map(&:stem)
104
+ @tokenizer.tokenize(query).map { |t| @stemmer.stem t }
103
105
  end
104
106
 
105
107
  # Calculates the score of a document using an array of query terms.
data/test/test_bm25f.rb CHANGED
@@ -5,19 +5,20 @@ class BM25FTest < Minitest::Test
5
5
  def setup
6
6
  @bm25f = BM25F.new
7
7
  @documents = [
8
- { title: 'hello world', content: 'foo bar baz' },
9
- { title: 'foo bar', content: 'goodbye, world!', test: nil }
8
+ { url: 'https://wikimedia.org', title: 'Wikimedia',
9
+ content: 'Wikimedia. Wikimedia is a global movement whose mission is to bring free educational content to the world. Through various projects, chapters and the support structure of the ...' },
10
+ { url: 'https://twitter.com/Wikipedia', title: 'Wikipedia (@Wikipedia) · Twitter', content: nil },
11
+ { url: 'https://play.google.com/store/apps/details', title: 'Wikipedia - Apps on Google Play',
12
+ content: 'The best Wikipedia experience on your Mobile device. Ad-free and free of charge, forever. With the official Wikipedia app, you can search and explore 40+ ...' },
13
+ { url: 'https://www.wikipedia.org', title: 'Wikipedia',
14
+ content: 'Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation.' }
10
15
  ]
11
16
  end
12
17
 
13
18
  def test_score
14
19
  @bm25f.fit @documents
15
- scores = @bm25f.score 'hello world foo bar baz'
20
+ scores = @bm25f.score 'wikipedia'
16
21
 
17
- # Sort
18
- scores = scores.to_a.sort_by { |_, v| v.to_i }
19
-
20
- # Checks if the most matching element is the first element
21
- assert scores.last[0].zero?
22
+ puts scores.inspect
22
23
  end
23
24
  end
metadata CHANGED
@@ -1,29 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bm25f
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - catflip
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-09-10 00:00:00.000000000 Z
11
+ date: 2023-09-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: treat
14
+ name: pragmatic_tokenizer
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '2.1'
19
+ version: 3.2.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '2.1'
26
+ version: 3.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: uea-stemmer
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.10.3
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.10.3
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: rake
29
43
  requirement: !ruby/object:Gem::Requirement