bayesian_bm25_rs 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bayesian_bm25_rs-0.1.1/.DS_Store +0 -0
- bayesian_bm25_rs-0.1.1/.gitignore +28 -0
- bayesian_bm25_rs-0.1.1/Cargo.lock +133 -0
- bayesian_bm25_rs-0.1.1/Cargo.toml +23 -0
- bayesian_bm25_rs-0.1.1/Makefile +7 -0
- bayesian_bm25_rs-0.1.1/PKG-INFO +112 -0
- bayesian_bm25_rs-0.1.1/README.md +88 -0
- bayesian_bm25_rs-0.1.1/docs/pyodide.md +26 -0
- bayesian_bm25_rs-0.1.1/pyproject.toml +35 -0
- bayesian_bm25_rs-0.1.1/scripts/build_pyodide.sh +12 -0
- bayesian_bm25_rs-0.1.1/src/bayesian_scorer.rs +79 -0
- bayesian_bm25_rs-0.1.1/src/bm25_scorer.rs +65 -0
- bayesian_bm25_rs-0.1.1/src/corpus.rs +85 -0
- bayesian_bm25_rs-0.1.1/src/defaults.rs +171 -0
- bayesian_bm25_rs-0.1.1/src/experiments.rs +603 -0
- bayesian_bm25_rs-0.1.1/src/hybrid_scorer.rs +71 -0
- bayesian_bm25_rs-0.1.1/src/lib.rs +34 -0
- bayesian_bm25_rs-0.1.1/src/main.rs +46 -0
- bayesian_bm25_rs-0.1.1/src/math_utils.rs +46 -0
- bayesian_bm25_rs-0.1.1/src/parameter_learner.rs +90 -0
- bayesian_bm25_rs-0.1.1/src/pybindings.rs +570 -0
- bayesian_bm25_rs-0.1.1/src/tokenizer.rs +32 -0
- bayesian_bm25_rs-0.1.1/src/vector_scorer.rs +20 -0
- bayesian_bm25_rs-0.1.1/tests/test_smoke.py +29 -0
|
Binary file
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Rust
|
|
2
|
+
/target/
|
|
3
|
+
|
|
4
|
+
# Python
|
|
5
|
+
__pycache__/
|
|
6
|
+
*.py[cod]
|
|
7
|
+
*.pyd
|
|
8
|
+
*.so
|
|
9
|
+
*.dylib
|
|
10
|
+
|
|
11
|
+
# Packaging
|
|
12
|
+
/dist/
|
|
13
|
+
/build/
|
|
14
|
+
*.egg-info/
|
|
15
|
+
|
|
16
|
+
# Virtualenv
|
|
17
|
+
/.venv/
|
|
18
|
+
/venv/
|
|
19
|
+
|
|
20
|
+
# Cargo cache
|
|
21
|
+
/.cargo/
|
|
22
|
+
|
|
23
|
+
# Pyodide build env
|
|
24
|
+
/.pyodide-xbuildenv-*/
|
|
25
|
+
/.pyodide-venv/
|
|
26
|
+
|
|
27
|
+
# OS
|
|
28
|
+
.DS_Store
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# This file is automatically @generated by Cargo.
|
|
2
|
+
# It is not intended for manual editing.
|
|
3
|
+
version = 4
|
|
4
|
+
|
|
5
|
+
[[package]]
|
|
6
|
+
name = "bayesian_bm25"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
dependencies = [
|
|
9
|
+
"pyo3",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
[[package]]
|
|
13
|
+
name = "heck"
|
|
14
|
+
version = "0.5.0"
|
|
15
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
16
|
+
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
|
17
|
+
|
|
18
|
+
[[package]]
|
|
19
|
+
name = "libc"
|
|
20
|
+
version = "0.2.180"
|
|
21
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
22
|
+
checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc"
|
|
23
|
+
|
|
24
|
+
[[package]]
|
|
25
|
+
name = "once_cell"
|
|
26
|
+
version = "1.21.3"
|
|
27
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
28
|
+
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
|
29
|
+
|
|
30
|
+
[[package]]
|
|
31
|
+
name = "portable-atomic"
|
|
32
|
+
version = "1.13.1"
|
|
33
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
34
|
+
checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
|
|
35
|
+
|
|
36
|
+
[[package]]
|
|
37
|
+
name = "proc-macro2"
|
|
38
|
+
version = "1.0.106"
|
|
39
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
40
|
+
checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
|
|
41
|
+
dependencies = [
|
|
42
|
+
"unicode-ident",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[[package]]
|
|
46
|
+
name = "pyo3"
|
|
47
|
+
version = "0.28.0"
|
|
48
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
49
|
+
checksum = "fcf3ccafdf54c050be48a3a086d372f77ba6615f5057211607cd30e5ac5cec6d"
|
|
50
|
+
dependencies = [
|
|
51
|
+
"libc",
|
|
52
|
+
"once_cell",
|
|
53
|
+
"portable-atomic",
|
|
54
|
+
"pyo3-build-config",
|
|
55
|
+
"pyo3-ffi",
|
|
56
|
+
"pyo3-macros",
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
[[package]]
|
|
60
|
+
name = "pyo3-build-config"
|
|
61
|
+
version = "0.28.0"
|
|
62
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
63
|
+
checksum = "972720a441c91fd9c49f212a1d2d74c6e3803b231ebc8d66c51efbd7ccab11c8"
|
|
64
|
+
dependencies = [
|
|
65
|
+
"target-lexicon",
|
|
66
|
+
]
|
|
67
|
+
|
|
68
|
+
[[package]]
|
|
69
|
+
name = "pyo3-ffi"
|
|
70
|
+
version = "0.28.0"
|
|
71
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
72
|
+
checksum = "5994456d9dab8934d600d3867571b6410f24fbd6002570ad56356733eb54859b"
|
|
73
|
+
dependencies = [
|
|
74
|
+
"libc",
|
|
75
|
+
"pyo3-build-config",
|
|
76
|
+
]
|
|
77
|
+
|
|
78
|
+
[[package]]
|
|
79
|
+
name = "pyo3-macros"
|
|
80
|
+
version = "0.28.0"
|
|
81
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
82
|
+
checksum = "11ce9cc8d81b3c4969748807604d92b4eef363c5bb82b1a1bdb34ec6f1093a18"
|
|
83
|
+
dependencies = [
|
|
84
|
+
"proc-macro2",
|
|
85
|
+
"pyo3-macros-backend",
|
|
86
|
+
"quote",
|
|
87
|
+
"syn",
|
|
88
|
+
]
|
|
89
|
+
|
|
90
|
+
[[package]]
|
|
91
|
+
name = "pyo3-macros-backend"
|
|
92
|
+
version = "0.28.0"
|
|
93
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
94
|
+
checksum = "eaf4b60036a154d23282679b658e3cc7d88d3b8c9a40b43824785f232d2e1b98"
|
|
95
|
+
dependencies = [
|
|
96
|
+
"heck",
|
|
97
|
+
"proc-macro2",
|
|
98
|
+
"pyo3-build-config",
|
|
99
|
+
"quote",
|
|
100
|
+
"syn",
|
|
101
|
+
]
|
|
102
|
+
|
|
103
|
+
[[package]]
|
|
104
|
+
name = "quote"
|
|
105
|
+
version = "1.0.44"
|
|
106
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
107
|
+
checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4"
|
|
108
|
+
dependencies = [
|
|
109
|
+
"proc-macro2",
|
|
110
|
+
]
|
|
111
|
+
|
|
112
|
+
[[package]]
|
|
113
|
+
name = "syn"
|
|
114
|
+
version = "2.0.114"
|
|
115
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
116
|
+
checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a"
|
|
117
|
+
dependencies = [
|
|
118
|
+
"proc-macro2",
|
|
119
|
+
"quote",
|
|
120
|
+
"unicode-ident",
|
|
121
|
+
]
|
|
122
|
+
|
|
123
|
+
[[package]]
|
|
124
|
+
name = "target-lexicon"
|
|
125
|
+
version = "0.13.4"
|
|
126
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
127
|
+
checksum = "b1dd07eb858a2067e2f3c7155d54e929265c264e6f37efe3ee7a8d1b5a1dd0ba"
|
|
128
|
+
|
|
129
|
+
[[package]]
|
|
130
|
+
name = "unicode-ident"
|
|
131
|
+
version = "1.0.22"
|
|
132
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
133
|
+
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "bayesian_bm25"
|
|
3
|
+
version = "0.1.1"
|
|
4
|
+
edition = "2021"
|
|
5
|
+
description = "Bayesian BM25 scoring and experimental validation (Rust core + Python bindings)"
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
license = "UNLICENSED"
|
|
8
|
+
repository = "https://github.com/sigridjineth/bayesian_bm25_rs"
|
|
9
|
+
|
|
10
|
+
[lib]
|
|
11
|
+
name = "bayesian_bm25"
|
|
12
|
+
path = "src/lib.rs"
|
|
13
|
+
crate-type = ["rlib", "cdylib"]
|
|
14
|
+
|
|
15
|
+
[[bin]]
|
|
16
|
+
name = "run_experiments"
|
|
17
|
+
path = "src/main.rs"
|
|
18
|
+
|
|
19
|
+
[dependencies]
|
|
20
|
+
pyo3 = { version = "0.28", optional = true, features = ["extension-module"] }
|
|
21
|
+
|
|
22
|
+
[features]
|
|
23
|
+
python = ["pyo3"]
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bayesian_bm25_rs
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Classifier: License :: Other/Proprietary License
|
|
5
|
+
Classifier: Programming Language :: Rust
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Summary: Bayesian BM25 scoring and experimental validation (Rust core + Python bindings)
|
|
16
|
+
Keywords: bm25,information-retrieval,search,ranking,bayesian,hybrid-search
|
|
17
|
+
License: UNLICENSED
|
|
18
|
+
Requires-Python: >=3.8
|
|
19
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
20
|
+
Project-URL: Homepage, https://github.com/sigridjineth/bayesian_bm25_rs
|
|
21
|
+
Project-URL: Issues, https://github.com/sigridjineth/bayesian_bm25_rs/issues
|
|
22
|
+
Project-URL: Repository, https://github.com/sigridjineth/bayesian_bm25_rs
|
|
23
|
+
|
|
24
|
+
# bb25 (Bayesian BM25)
|
|
25
|
+
|
|
26
|
+
bb25 is a fast, self-contained BM25 + Bayesian calibration implementation with a minimal Python API. It also includes a small reference corpus and experiment suite so you can validate the expected numerical properties.
|
|
27
|
+
|
|
28
|
+
- PyPI package name: `bayesian_bm25_rs`
|
|
29
|
+
- Python import name: `bb25`
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
pip install bayesian_bm25_rs
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Quick start
|
|
38
|
+
|
|
39
|
+
### Use the built-in corpus and queries
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
import bb25 as bb
|
|
43
|
+
|
|
44
|
+
corpus = bb.build_default_corpus()
|
|
45
|
+
docs = corpus.documents()
|
|
46
|
+
queries = bb.build_default_queries()
|
|
47
|
+
|
|
48
|
+
bm25 = bb.BM25Scorer(corpus, 1.2, 0.75)
|
|
49
|
+
score = bm25.score(queries[0].terms, docs[0])
|
|
50
|
+
print("score0", score)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Build your own corpus
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
import bb25 as bb
|
|
57
|
+
|
|
58
|
+
corpus = bb.Corpus()
|
|
59
|
+
corpus.add_document("d1", "neural networks for ranking", [0.1] * 8)
|
|
60
|
+
corpus.add_document("d2", "bm25 is a strong baseline", [0.2] * 8)
|
|
61
|
+
corpus.build_index() # must be called before creating scorers
|
|
62
|
+
|
|
63
|
+
bm25 = bb.BM25Scorer(corpus, 1.2, 0.75)
|
|
64
|
+
print(bm25.idf("bm25"))
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Bayesian calibration + hybrid fusion
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
import bb25 as bb
|
|
71
|
+
|
|
72
|
+
corpus = bb.build_default_corpus()
|
|
73
|
+
docs = corpus.documents()
|
|
74
|
+
queries = bb.build_default_queries()
|
|
75
|
+
|
|
76
|
+
bm25 = bb.BM25Scorer(corpus, 1.2, 0.75)
|
|
77
|
+
bayes = bb.BayesianBM25Scorer(bm25, 1.0, 0.5)
|
|
78
|
+
vector = bb.VectorScorer()
|
|
79
|
+
hybrid = bb.HybridScorer(bayes, vector)
|
|
80
|
+
|
|
81
|
+
q = queries[0]
|
|
82
|
+
prob_or = hybrid.score_or(q.terms, q.embedding, docs[0])
|
|
83
|
+
prob_and = hybrid.score_and(q.terms, q.embedding, docs[0])
|
|
84
|
+
print("OR", prob_or, "AND", prob_and)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Run the experiments
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
import bb25 as bb
|
|
91
|
+
|
|
92
|
+
results = bb.run_experiments()
|
|
93
|
+
print(all(r.passed for r in results))
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Build from source (Rust)
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
make build
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## PyPI publishing
|
|
103
|
+
|
|
104
|
+
Build a wheel with maturin:
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
python -m pip install maturin
|
|
108
|
+
maturin build --release
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
For Pyodide builds, see `docs/pyodide.md`.
|
|
112
|
+
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# bb25 (Bayesian BM25)
|
|
2
|
+
|
|
3
|
+
bb25 is a fast, self-contained BM25 + Bayesian calibration implementation with a minimal Python API. It also includes a small reference corpus and experiment suite so you can validate the expected numerical properties.
|
|
4
|
+
|
|
5
|
+
- PyPI package name: `bayesian_bm25_rs`
|
|
6
|
+
- Python import name: `bb25`
|
|
7
|
+
|
|
8
|
+
## Install
|
|
9
|
+
|
|
10
|
+
```
|
|
11
|
+
pip install bayesian_bm25_rs
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Quick start
|
|
15
|
+
|
|
16
|
+
### Use the built-in corpus and queries
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
import bb25 as bb
|
|
20
|
+
|
|
21
|
+
corpus = bb.build_default_corpus()
|
|
22
|
+
docs = corpus.documents()
|
|
23
|
+
queries = bb.build_default_queries()
|
|
24
|
+
|
|
25
|
+
bm25 = bb.BM25Scorer(corpus, 1.2, 0.75)
|
|
26
|
+
score = bm25.score(queries[0].terms, docs[0])
|
|
27
|
+
print("score0", score)
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### Build your own corpus
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
import bb25 as bb
|
|
34
|
+
|
|
35
|
+
corpus = bb.Corpus()
|
|
36
|
+
corpus.add_document("d1", "neural networks for ranking", [0.1] * 8)
|
|
37
|
+
corpus.add_document("d2", "bm25 is a strong baseline", [0.2] * 8)
|
|
38
|
+
corpus.build_index() # must be called before creating scorers
|
|
39
|
+
|
|
40
|
+
bm25 = bb.BM25Scorer(corpus, 1.2, 0.75)
|
|
41
|
+
print(bm25.idf("bm25"))
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Bayesian calibration + hybrid fusion
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
import bb25 as bb
|
|
48
|
+
|
|
49
|
+
corpus = bb.build_default_corpus()
|
|
50
|
+
docs = corpus.documents()
|
|
51
|
+
queries = bb.build_default_queries()
|
|
52
|
+
|
|
53
|
+
bm25 = bb.BM25Scorer(corpus, 1.2, 0.75)
|
|
54
|
+
bayes = bb.BayesianBM25Scorer(bm25, 1.0, 0.5)
|
|
55
|
+
vector = bb.VectorScorer()
|
|
56
|
+
hybrid = bb.HybridScorer(bayes, vector)
|
|
57
|
+
|
|
58
|
+
q = queries[0]
|
|
59
|
+
prob_or = hybrid.score_or(q.terms, q.embedding, docs[0])
|
|
60
|
+
prob_and = hybrid.score_and(q.terms, q.embedding, docs[0])
|
|
61
|
+
print("OR", prob_or, "AND", prob_and)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Run the experiments
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
import bb25 as bb
|
|
68
|
+
|
|
69
|
+
results = bb.run_experiments()
|
|
70
|
+
print(all(r.passed for r in results))
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Build from source (Rust)
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
make build
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## PyPI publishing
|
|
80
|
+
|
|
81
|
+
Build a wheel with maturin:
|
|
82
|
+
|
|
83
|
+
```
|
|
84
|
+
python -m pip install maturin
|
|
85
|
+
maturin build --release
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
For Pyodide builds, see `docs/pyodide.md`.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Pyodide Build Notes
|
|
2
|
+
|
|
3
|
+
This project targets Pyodide by keeping the Python API pure (no file I/O, no OS dependencies) and using PyO3 for bindings. The actual build depends on your Pyodide toolchain setup.
|
|
4
|
+
|
|
5
|
+
The Python import name is `bb25`.
|
|
6
|
+
|
|
7
|
+
## Minimal Flow
|
|
8
|
+
|
|
9
|
+
1) Install a Pyodide build toolchain (pyodide-build + Emscripten).
|
|
10
|
+
2) From `bayesian_bm25_rs/`, run:
|
|
11
|
+
|
|
12
|
+
```
|
|
13
|
+
./scripts/build_pyodide.sh
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
This will place a wheel in `dist/`, which can be loaded in Pyodide.
|
|
17
|
+
|
|
18
|
+
## Notes
|
|
19
|
+
|
|
20
|
+
- If your Pyodide Python version is newer than your PyO3 version supports, set the following environment variable before building:
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
- For reproducibility, pin your Pyodide toolchain and Python version.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["maturin>=1.5,<2"]
|
|
3
|
+
build-backend = "maturin"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "bayesian_bm25_rs"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "Bayesian BM25 scoring and experimental validation (Rust core + Python bindings)"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "UNLICENSED" }
|
|
12
|
+
keywords = ["bm25", "information-retrieval", "search", "ranking", "bayesian", "hybrid-search"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"License :: Other/Proprietary License",
|
|
15
|
+
"Programming Language :: Rust",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
18
|
+
"Programming Language :: Python :: 3.8",
|
|
19
|
+
"Programming Language :: Python :: 3.9",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
24
|
+
"Operating System :: OS Independent",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
Homepage = "https://github.com/sigridjineth/bayesian_bm25_rs"
|
|
29
|
+
Repository = "https://github.com/sigridjineth/bayesian_bm25_rs"
|
|
30
|
+
Issues = "https://github.com/sigridjineth/bayesian_bm25_rs/issues"
|
|
31
|
+
|
|
32
|
+
[tool.maturin]
|
|
33
|
+
bindings = "pyo3"
|
|
34
|
+
module-name = "bb25"
|
|
35
|
+
features = ["python"]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Builds a Pyodide-compatible wheel into dist/ using the `pyodide` CLI.
# Exits with status 1 if the CLI is not available on PATH.
set -euo pipefail

if ! command -v pyodide >/dev/null 2>&1; then
  # Diagnostics go to stderr so they are never mixed into captured stdout.
  echo "pyodide CLI not found. Install pyodide-build and retry." >&2
  echo "Example: python -m pip install pyodide-build" >&2
  exit 1
fi

# Builds a wheel suitable for Pyodide.
# You may need to set up a Pyodide toolchain and Emscripten beforehand.
pyodide build --wheel -o dist
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
use std::rc::Rc;
|
|
2
|
+
|
|
3
|
+
use crate::bm25_scorer::BM25Scorer;
|
|
4
|
+
use crate::corpus::Document;
|
|
5
|
+
use crate::math_utils::{clamp, safe_log, safe_prob, sigmoid};
|
|
6
|
+
|
|
7
|
+
pub struct BayesianBM25Scorer {
|
|
8
|
+
bm25: Rc<BM25Scorer>,
|
|
9
|
+
alpha: f64,
|
|
10
|
+
beta: f64,
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
impl BayesianBM25Scorer {
|
|
14
|
+
pub fn new(bm25: Rc<BM25Scorer>, alpha: f64, beta: f64) -> Self {
|
|
15
|
+
Self { bm25, alpha, beta }
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
pub fn likelihood(&self, score: f64) -> f64 {
|
|
19
|
+
sigmoid(self.alpha * (score - self.beta))
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
pub fn tf_prior(&self, tf: usize) -> f64 {
|
|
23
|
+
0.2 + 0.7 * (tf as f64 / 10.0).min(1.0)
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
pub fn norm_prior(&self, doc_length: usize, avg_doc_length: f64) -> f64 {
|
|
27
|
+
if avg_doc_length < 1.0 {
|
|
28
|
+
return 0.5;
|
|
29
|
+
}
|
|
30
|
+
let ratio = doc_length as f64 / avg_doc_length;
|
|
31
|
+
let prior = 1.0 / (1.0 + ratio);
|
|
32
|
+
clamp(prior, 0.1, 0.9)
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
pub fn composite_prior(&self, tf: usize, doc_length: usize, avg_doc_length: f64) -> f64 {
|
|
36
|
+
let p_tf = self.tf_prior(tf);
|
|
37
|
+
let p_norm = self.norm_prior(doc_length, avg_doc_length);
|
|
38
|
+
clamp(0.7 * p_tf + 0.3 * p_norm, 0.1, 0.9)
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
pub fn posterior(&self, score: f64, prior: f64) -> f64 {
|
|
42
|
+
let mut lik = self.likelihood(score);
|
|
43
|
+
lik = safe_prob(lik);
|
|
44
|
+
let prior = safe_prob(prior);
|
|
45
|
+
let numerator = lik * prior;
|
|
46
|
+
let denominator = numerator + (1.0 - lik) * (1.0 - prior);
|
|
47
|
+
numerator / denominator
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
pub fn score_term(&self, term: &str, doc: &Document) -> f64 {
|
|
51
|
+
let raw_score = self.bm25.score_term_standard(term, doc);
|
|
52
|
+
if raw_score == 0.0 {
|
|
53
|
+
return 0.0;
|
|
54
|
+
}
|
|
55
|
+
let tf = *doc.term_freq.get(term).unwrap_or(&0);
|
|
56
|
+
let prior = self.composite_prior(tf, doc.length, self.bm25.avgdl());
|
|
57
|
+
self.posterior(raw_score, prior)
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
pub fn score(&self, query_terms: &[String], doc: &Document) -> f64 {
|
|
61
|
+
let mut log_complement_sum = 0.0;
|
|
62
|
+
let mut has_match = false;
|
|
63
|
+
|
|
64
|
+
for term in query_terms {
|
|
65
|
+
let p = self.score_term(term, doc);
|
|
66
|
+
if p > 0.0 {
|
|
67
|
+
has_match = true;
|
|
68
|
+
let p = safe_prob(p);
|
|
69
|
+
log_complement_sum += safe_log(1.0 - p);
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
if !has_match {
|
|
74
|
+
return 0.0;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
1.0 - log_complement_sum.exp()
|
|
78
|
+
}
|
|
79
|
+
}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
use std::rc::Rc;
|
|
2
|
+
|
|
3
|
+
use crate::corpus::{Corpus, Document};
|
|
4
|
+
|
|
5
|
+
pub struct BM25Scorer {
|
|
6
|
+
corpus: Rc<Corpus>,
|
|
7
|
+
k1: f64,
|
|
8
|
+
b: f64,
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
impl BM25Scorer {
|
|
12
|
+
pub fn new(corpus: Rc<Corpus>, k1: f64, b: f64) -> Self {
|
|
13
|
+
Self { corpus, k1, b }
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
pub fn idf(&self, term: &str) -> f64 {
|
|
17
|
+
let n = self.corpus.n as f64;
|
|
18
|
+
let df_t = *self.corpus.df.get(term).unwrap_or(&0) as f64;
|
|
19
|
+
((n - df_t + 0.5) / (df_t + 0.5)).ln()
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
fn length_norm(&self, doc: &Document) -> f64 {
|
|
23
|
+
1.0 - self.b + self.b * (doc.length as f64) / self.corpus.avgdl
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
pub fn score_term_standard(&self, term: &str, doc: &Document) -> f64 {
|
|
27
|
+
let tf = *doc.term_freq.get(term).unwrap_or(&0) as f64;
|
|
28
|
+
if tf == 0.0 {
|
|
29
|
+
return 0.0;
|
|
30
|
+
}
|
|
31
|
+
let norm = self.length_norm(doc);
|
|
32
|
+
let idf_val = self.idf(term);
|
|
33
|
+
idf_val * (self.k1 + 1.0) * tf / (self.k1 * norm + tf)
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
pub fn score_term_rewritten(&self, term: &str, doc: &Document) -> f64 {
|
|
37
|
+
let tf = *doc.term_freq.get(term).unwrap_or(&0) as f64;
|
|
38
|
+
if tf == 0.0 {
|
|
39
|
+
return 0.0;
|
|
40
|
+
}
|
|
41
|
+
let norm = self.length_norm(doc);
|
|
42
|
+
let boost = (self.k1 + 1.0) * tf / (self.k1 * norm + tf);
|
|
43
|
+
let idf_val = self.idf(term);
|
|
44
|
+
idf_val * boost
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
pub fn score(&self, query_terms: &[String], doc: &Document) -> f64 {
|
|
48
|
+
query_terms
|
|
49
|
+
.iter()
|
|
50
|
+
.map(|term| self.score_term_standard(term, doc))
|
|
51
|
+
.sum()
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
pub fn upper_bound(&self, term: &str) -> f64 {
|
|
55
|
+
let idf_val = self.idf(term);
|
|
56
|
+
if idf_val <= 0.0 {
|
|
57
|
+
return 0.0;
|
|
58
|
+
}
|
|
59
|
+
(self.k1 + 1.0) * idf_val
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
pub fn avgdl(&self) -> f64 {
|
|
63
|
+
self.corpus.avgdl
|
|
64
|
+
}
|
|
65
|
+
}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
use std::collections::HashMap;
|
|
2
|
+
|
|
3
|
+
use crate::tokenizer::Tokenizer;
|
|
4
|
+
|
|
5
|
+
/// A single indexed document together with its tokenization statistics.
#[derive(Debug, Clone)]
pub struct Document {
    /// Caller-supplied identifier.
    pub id: String,
    /// Original, untokenized text.
    pub text: String,
    /// Embedding vector supplied alongside the text.
    pub embedding: Vec<f64>,
    /// Tokens produced from `text`.
    pub tokens: Vec<String>,
    /// Number of tokens in the document.
    pub length: usize,
    /// Occurrence count of each distinct token.
    pub term_freq: HashMap<String, usize>,
}
|
|
14
|
+
|
|
15
|
+
pub struct Corpus {
|
|
16
|
+
tokenizer: Tokenizer,
|
|
17
|
+
documents: Vec<Document>,
|
|
18
|
+
doc_by_id: HashMap<String, usize>,
|
|
19
|
+
pub n: usize,
|
|
20
|
+
pub avgdl: f64,
|
|
21
|
+
pub df: HashMap<String, usize>,
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
impl Corpus {
|
|
25
|
+
pub fn new(tokenizer: Tokenizer) -> Self {
|
|
26
|
+
Self {
|
|
27
|
+
tokenizer,
|
|
28
|
+
documents: Vec::new(),
|
|
29
|
+
doc_by_id: HashMap::new(),
|
|
30
|
+
n: 0,
|
|
31
|
+
avgdl: 0.0,
|
|
32
|
+
df: HashMap::new(),
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
pub fn add_document(&mut self, doc_id: &str, text: &str, embedding: Vec<f64>) {
|
|
37
|
+
let tokens = self.tokenizer.tokenize(text);
|
|
38
|
+
let mut term_freq = HashMap::new();
|
|
39
|
+
for token in &tokens {
|
|
40
|
+
*term_freq.entry(token.clone()).or_insert(0) += 1;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
let doc = Document {
|
|
44
|
+
id: doc_id.to_string(),
|
|
45
|
+
text: text.to_string(),
|
|
46
|
+
embedding,
|
|
47
|
+
length: tokens.len(),
|
|
48
|
+
tokens,
|
|
49
|
+
term_freq,
|
|
50
|
+
};
|
|
51
|
+
|
|
52
|
+
let idx = self.documents.len();
|
|
53
|
+
self.documents.push(doc);
|
|
54
|
+
self.doc_by_id.insert(doc_id.to_string(), idx);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
pub fn build_index(&mut self) {
|
|
58
|
+
self.n = self.documents.len();
|
|
59
|
+
self.df.clear();
|
|
60
|
+
let mut total_length = 0usize;
|
|
61
|
+
|
|
62
|
+
for doc in &self.documents {
|
|
63
|
+
total_length += doc.length;
|
|
64
|
+
for term in doc.term_freq.keys() {
|
|
65
|
+
*self.df.entry(term.clone()).or_insert(0) += 1;
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
self.avgdl = if self.n > 0 {
|
|
70
|
+
total_length as f64 / self.n as f64
|
|
71
|
+
} else {
|
|
72
|
+
0.0
|
|
73
|
+
};
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
pub fn get_document(&self, doc_id: &str) -> Option<&Document> {
|
|
77
|
+
self.doc_by_id
|
|
78
|
+
.get(doc_id)
|
|
79
|
+
.and_then(|idx| self.documents.get(*idx))
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
pub fn documents(&self) -> &[Document] {
|
|
83
|
+
&self.documents
|
|
84
|
+
}
|
|
85
|
+
}
|