bm25f 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/lib/bm25f.rb +7 -5
  3. data/test/test_bm25f.rb +9 -8
  4. metadata +19 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7c7af39f073c1fc55d42199f9c882caa5adf672d5c94c715fe379c58cd4a4a19
4
- data.tar.gz: 1ebb197c89da4b70780d3631658a1bb3d0ce5c244abe8b5875c41ee1632f63a3
3
+ metadata.gz: 9b141349387662e73de071841670357c61b61d775cb61501d84dc7ccc801e6de
4
+ data.tar.gz: ad7bb2c6223a5f125224b0a5d0afdb9c3950811c9c09cd1a39ae943b38a39eb9
5
5
  SHA512:
6
- metadata.gz: 816270030afd15716f8b89f4c74d5c11dbc32f169c7a10a1eca931782e21e320017aa95bab66e32c6b500d407922e289edce8db396c956a62c245871744bb79b
7
- data.tar.gz: 0ef709327eaa9c5b14e1132bafc199c4d66759b509c85d8bf3dcb23ba5b8dbe526b698c0ab1051486741fa3a3d4d21d073f8d4e3d12ced9128ec16fedd6a39e4
6
+ metadata.gz: 3662bed23e2c1f041766b29ffe2f87d13ec18e107870c4948fc85851bbc1640a569a9f831f67f46bb8035577fc8a33654461bd2f4d2fe4e324ef3b302f66f665
7
+ data.tar.gz: e27b5d15af219b3dfc41e675015bacd5c5c1d930c0f70df844f8341fab0062670eb3690bdc41163ed628e2e90f340c1fb9814ce6dd9c30fc30daf77056b4b89a
data/lib/bm25f.rb CHANGED
@@ -1,8 +1,7 @@
1
- require 'treat'
1
+ require 'uea-stemmer'
2
+ require 'pragmatic_tokenizer'
2
3
 
3
4
  class BM25F
4
- include Treat::Core::DSL
5
-
6
5
  # Initializes a BM25F model.
7
6
  #
8
7
  # @param term_freq_weight [Float] Weight for term frequency.
@@ -10,6 +9,9 @@ class BM25F
10
9
  def initialize(term_freq_weight: 1.33, doc_length_weight: 0.8)
11
10
  @term_freq_weight = term_freq_weight
12
11
  @doc_length_weight = doc_length_weight
12
+
13
+ @tokenizer = PragmaticTokenizer::Tokenizer.new
14
+ @stemmer = UEAStemmer.new
13
15
  end
14
16
 
15
17
  # Fits the model to a set of documents.
@@ -56,7 +58,7 @@ class BM25F
56
58
  documents.each do |k, v|
57
59
  next unless v.instance_of? String
58
60
 
59
- documents[k] = sentence(v).map(&:stem).join(' ')
61
+ documents[k] = v.map { |t| @stemmer.stem t }.join(' ')
60
62
  end
61
63
  documents
62
64
  end
@@ -99,7 +101,7 @@ class BM25F
99
101
  # @param query [String] The query to preprocess.
100
102
  # @return [Array<String>] An array of preprocessed query terms.
101
103
  def preprocess_query(query)
102
- sentence(query).tokenize.map(&:stem)
104
+ @tokenizer.tokenize(query).map { |t| @stemmer.stem t }
103
105
  end
104
106
 
105
107
  # Calculates the score of a document using an array of query terms.
data/test/test_bm25f.rb CHANGED
@@ -5,19 +5,20 @@ class BM25FTest < Minitest::Test
5
5
  def setup
6
6
  @bm25f = BM25F.new
7
7
  @documents = [
8
- { title: 'hello world', content: 'foo bar baz' },
9
- { title: 'foo bar', content: 'goodbye, world!', test: nil }
8
+ { url: 'https://wikimedia.org', title: 'Wikimedia',
9
+ content: 'Wikimedia. Wikimedia is a global movement whose mission is to bring free educational content to the world. Through various projects, chapters and the support structure of the ...' },
10
+ { url: 'https://twitter.com/Wikipedia', title: 'Wikipedia (@Wikipedia) · Twitter', content: nil },
11
+ { url: 'https://play.google.com/store/apps/details', title: 'Wikipedia - Apps on Google Play',
12
+ content: 'The best Wikipedia experience on your Mobile device. Ad-free and free of charge, forever. With the official Wikipedia app, you can search and explore 40+ ...' },
13
+ { url: 'https://www.wikipedia.org', title: 'Wikipedia',
14
+ content: 'Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation.' }
10
15
  ]
11
16
  end
12
17
 
13
18
  def test_score
14
19
  @bm25f.fit @documents
15
- scores = @bm25f.score 'hello world foo bar baz'
20
+ scores = @bm25f.score 'wikipedia'
16
21
 
17
- # Sort
18
- scores = scores.to_a.sort_by { |_, v| v.to_i }
19
-
20
- # Checks if the most matching element is the first element
21
- assert scores.last[0].zero?
22
+ puts scores.inspect
22
23
  end
23
24
  end
metadata CHANGED
@@ -1,29 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bm25f
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - catflip
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-09-10 00:00:00.000000000 Z
11
+ date: 2023-09-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: treat
14
+ name: pragmatic_tokenizer
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '2.1'
19
+ version: 3.2.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '2.1'
26
+ version: 3.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: uea-stemmer
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.10.3
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.10.3
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: rake
29
43
  requirement: !ruby/object:Gem::Requirement