sg-reranker 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sg_reranker-0.1.0/PKG-INFO +214 -0
- sg_reranker-0.1.0/README.md +202 -0
- sg_reranker-0.1.0/pyproject.toml +25 -0
- sg_reranker-0.1.0/setup.cfg +4 -0
- sg_reranker-0.1.0/src/sg/reranker/__init__.py +7 -0
- sg_reranker-0.1.0/src/sg/reranker/_base.py +47 -0
- sg_reranker-0.1.0/src/sg/reranker/_client.py +87 -0
- sg_reranker-0.1.0/src/sg/reranker/_models.py +17 -0
- sg_reranker-0.1.0/src/sg_reranker.egg-info/PKG-INFO +214 -0
- sg_reranker-0.1.0/src/sg_reranker.egg-info/SOURCES.txt +12 -0
- sg_reranker-0.1.0/src/sg_reranker.egg-info/dependency_links.txt +1 -0
- sg_reranker-0.1.0/src/sg_reranker.egg-info/requires.txt +5 -0
- sg_reranker-0.1.0/src/sg_reranker.egg-info/top_level.txt +1 -0
- sg_reranker-0.1.0/tests/test_reranker.py +87 -0
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sg-reranker
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight client for the SG reranker service
|
|
5
|
+
License: Proprietary
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: requests>=2.28
|
|
9
|
+
Provides-Extra: dev
|
|
10
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
11
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
12
|
+
|
|
13
|
+
# BGE Reranker v2-m3 — Docker Service
|
|
14
|
+
|
|
15
|
+
A production-ready CPU reranker microservice running
|
|
16
|
+
[BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)
|
|
17
|
+
via ONNX Runtime (INT8 dynamic quantization).
|
|
18
|
+
|
|
19
|
+
Tuned for a GCP **n2-highmem-2** VM: 2 vCPUs, 16 GB RAM.
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Quick start
|
|
24
|
+
|
|
25
|
+
### 1 — Build
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
docker compose build
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
The build downloads the model from Hugging Face and bakes the quantized ONNX
|
|
32
|
+
file into the image (≈ 5–15 min, depending on network speed and CPU).
|
|
33
|
+
Subsequent rebuilds are fast because Docker caches the conversion layer as long
|
|
34
|
+
as `requirements.txt` and `app/convert.py` are unchanged.
|
|
35
|
+
|
|
36
|
+
### 2 — Configure (optional)
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
cp .env.example .env
|
|
40
|
+
# Edit .env to set API_KEY, MAX_LENGTH, thread counts, etc.
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### 3 — Run
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
docker compose up -d
|
|
47
|
+
docker compose logs -f # watch startup + warmup
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
The service is ready when you see **"Model ready — serving on port 8000"** in
|
|
51
|
+
the logs (usually 10–30 s after container start).
|
|
52
|
+
|
|
53
|
+
### 4 — Health check
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
curl http://localhost:8000/health
|
|
57
|
+
# {"status":"ok","model":"BAAI/bge-reranker-v2-m3","max_length":512}
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### 5 — Rerank
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
curl -s -X POST http://localhost:8000/rerank \
|
|
64
|
+
-H "Content-Type: application/json" \
|
|
65
|
+
-d '{
|
|
66
|
+
"query": "What is machine learning?",
|
|
67
|
+
"documents": [
|
|
68
|
+
"Machine learning enables systems to learn from data without explicit programming.",
|
|
69
|
+
"Python is a popular general-purpose programming language.",
|
|
70
|
+
"Deep learning models complex patterns using multi-layer neural networks."
|
|
71
|
+
],
|
|
72
|
+
"top_n": 2,
|
|
73
|
+
"return_documents": true
|
|
74
|
+
}' | python3 -m json.tool
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
With bearer-token auth enabled (`API_KEY` set in `.env`):
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
curl -s -X POST http://localhost:8000/rerank \
|
|
81
|
+
-H "Content-Type: application/json" \
|
|
82
|
+
-H "Authorization: Bearer $API_KEY" \
|
|
83
|
+
-d '{...}'
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### 6 — Python client
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# against local container
|
|
90
|
+
python client.py
|
|
91
|
+
|
|
92
|
+
# against a remote VM
|
|
93
|
+
RERANKER_URL=http://10.0.0.5:8000 API_KEY=secret python client.py
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## API reference
|
|
99
|
+
|
|
100
|
+
### `GET /health`
|
|
101
|
+
|
|
102
|
+
```json
|
|
103
|
+
{"status": "ok", "model": "BAAI/bge-reranker-v2-m3", "max_length": 512}
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### `POST /rerank`
|
|
107
|
+
|
|
108
|
+
**Request body**
|
|
109
|
+
|
|
110
|
+
| Field | Type | Default | Description |
|
|
111
|
+
|--------------------|----------------|---------|------------------------------------------|
|
|
112
|
+
| `query` | string | — | The search query |
|
|
113
|
+
| `documents` | list of string | — | Candidate documents to score |
|
|
114
|
+
| `top_n` | int \| null | null | Return only the top N results |
|
|
115
|
+
| `max_length` | int \| null | null | Token limit per pair (env var fallback) |
|
|
116
|
+
| `return_documents` | bool | true | Include document text in the response |
|
|
117
|
+
|
|
118
|
+
**Response**
|
|
119
|
+
|
|
120
|
+
```json
|
|
121
|
+
{
|
|
122
|
+
"results": [
|
|
123
|
+
{"index": 0, "score": 0.9821, "document": "Machine learning enables …"},
|
|
124
|
+
{"index": 2, "score": 0.7634, "document": "Deep learning models …"}
|
|
125
|
+
],
|
|
126
|
+
"model": "BAAI/bge-reranker-v2-m3"
|
|
127
|
+
}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Results are sorted by `score` descending. `index` refers to the original
|
|
131
|
+
position in the `documents` list so you can map back to your data.
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## Reaching the service from another machine
|
|
136
|
+
|
|
137
|
+
There are three common approaches; choose based on your threat model.
|
|
138
|
+
|
|
139
|
+
### Option A — Same-VPC internal IP (recommended for GCP)
|
|
140
|
+
|
|
141
|
+
If your client runs in the same GCP VPC as the VM, use the VM's **internal
|
|
142
|
+
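IP** directly. The `default` network's pre-populated `default-allow-internal` rule already permits this; a custom VPC needs an ingress rule for tcp:8000, because firewall rules still apply on the private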
|
|
143
|
+
network.
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
RERANKER_URL=http://10.128.0.X:8000 python client.py
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Find the internal IP: `gcloud compute instances describe <VM_NAME> --format='get(networkInterfaces[0].networkIP)'`
|
|
150
|
+
|
|
151
|
+
### Option B — External IP with a locked-down firewall rule
|
|
152
|
+
|
|
153
|
+
Create a GCP firewall rule that allows TCP port 8000 only from specific source
|
|
154
|
+
IP ranges (your office CIDR, a bastion IP, etc.):
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
gcloud compute firewall-rules create allow-reranker \
|
|
158
|
+
--direction=INGRESS \
|
|
159
|
+
--action=ALLOW \
|
|
160
|
+
--rules=tcp:8000 \
|
|
161
|
+
--source-ranges=<YOUR_CIDR> \
|
|
162
|
+
--target-tags=reranker-vm
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
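Then add the `reranker-vm` network tag to your VM (`gcloud compute instances add-tags <VM_NAME> --tags=reranker-vm`) and call the service on its external IP: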
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
RERANKER_URL=http://<EXTERNAL_IP>:8000 python client.py
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
> **Warning** — plain HTTP is unencrypted. Anyone on the network path can
|
|
172
|
+
> read requests and responses, including the documents you are reranking.
|
|
173
|
+
> Use this option only within a trusted network or with TLS termination
|
|
174
|
+
> (e.g. a load balancer or nginx with a certificate).
|
|
175
|
+
|
|
176
|
+
### Option C — SSH tunnel (for local development)
|
|
177
|
+
|
|
178
|
+
Forward a local port to the service through an encrypted SSH session. No
|
|
179
|
+
firewall rule for port 8000 is needed (only SSH access to the VM), and no traffic leaves the tunnel unencrypted.
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
# In one terminal — keep this open
|
|
183
|
+
gcloud compute ssh <VM_NAME> -- -N -L 8000:localhost:8000
|
|
184
|
+
|
|
185
|
+
# In another terminal
|
|
186
|
+
curl http://localhost:8000/health
|
|
187
|
+
python client.py # uses http://localhost:8000 by default
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
## Environment variables
|
|
193
|
+
|
|
194
|
+
| Variable | Default | Description |
|
|
195
|
+
|-------------------|---------------------------------|------------------------------------------|
|
|
196
|
+
| `MODEL_DIR` | `/models/onnx_reranker_quant` | Path to the quantized ONNX model dir |
|
|
197
|
+
| `MAX_LENGTH` | `512` | Default token limit per query+doc pair |
|
|
198
|
+
| `OMP_NUM_THREADS` | `2` | PyTorch / OpenBLAS thread count |
|
|
199
|
+
| `ORT_NUM_THREADS` | `2` | ONNX Runtime intra-op thread count |
|
|
200
|
+
| `API_KEY` | _(unset = auth disabled)_ | Bearer token for `/rerank` |
|
|
201
|
+
|
|
202
|
+
---
|
|
203
|
+
|
|
204
|
+
## Notes
|
|
205
|
+
|
|
206
|
+
- **Single Uvicorn worker** is intentional. Each worker loads a full copy of
|
|
207
|
+
the model. On a 2-vCPU VM a second worker would double RAM usage and cause
|
|
208
|
+
thread contention rather than improve throughput.
|
|
209
|
+
- **INT8 quantization** cuts model size by ~3× and speeds up matrix
|
|
210
|
+
multiplications on CPUs that support AVX2 or AVX-512 VNNI (most modern Intel
|
|
211
|
+
and AMD cores). Accuracy loss on typical reranking benchmarks is < 1%.
|
|
212
|
+
- **TLS** — this service speaks plain HTTP. For production deployments on the
|
|
213
|
+
public internet, terminate TLS at a load balancer or a reverse proxy (nginx,
|
|
214
|
+
Caddy) and keep the container on an internal network.
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
# BGE Reranker v2-m3 — Docker Service
|
|
2
|
+
|
|
3
|
+
A production-ready CPU reranker microservice running
|
|
4
|
+
[BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)
|
|
5
|
+
via ONNX Runtime (INT8 dynamic quantization).
|
|
6
|
+
|
|
7
|
+
Tuned for a GCP **n2-highmem-2** VM: 2 vCPUs, 16 GB RAM.
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Quick start
|
|
12
|
+
|
|
13
|
+
### 1 — Build
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
docker compose build
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
The build downloads the model from Hugging Face and bakes the quantized ONNX
|
|
20
|
+
file into the image (≈ 5–15 min, depending on network speed and CPU).
|
|
21
|
+
Subsequent rebuilds are fast because Docker caches the conversion layer as long
|
|
22
|
+
as `requirements.txt` and `app/convert.py` are unchanged.
|
|
23
|
+
|
|
24
|
+
### 2 — Configure (optional)
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
cp .env.example .env
|
|
28
|
+
# Edit .env to set API_KEY, MAX_LENGTH, thread counts, etc.
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### 3 — Run
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
docker compose up -d
|
|
35
|
+
docker compose logs -f # watch startup + warmup
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
The service is ready when you see **"Model ready — serving on port 8000"** in
|
|
39
|
+
the logs (usually 10–30 s after container start).
|
|
40
|
+
|
|
41
|
+
### 4 — Health check
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
curl http://localhost:8000/health
|
|
45
|
+
# {"status":"ok","model":"BAAI/bge-reranker-v2-m3","max_length":512}
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### 5 — Rerank
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
curl -s -X POST http://localhost:8000/rerank \
|
|
52
|
+
-H "Content-Type: application/json" \
|
|
53
|
+
-d '{
|
|
54
|
+
"query": "What is machine learning?",
|
|
55
|
+
"documents": [
|
|
56
|
+
"Machine learning enables systems to learn from data without explicit programming.",
|
|
57
|
+
"Python is a popular general-purpose programming language.",
|
|
58
|
+
"Deep learning models complex patterns using multi-layer neural networks."
|
|
59
|
+
],
|
|
60
|
+
"top_n": 2,
|
|
61
|
+
"return_documents": true
|
|
62
|
+
}' | python3 -m json.tool
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
With bearer-token auth enabled (`API_KEY` set in `.env`):
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
curl -s -X POST http://localhost:8000/rerank \
|
|
69
|
+
-H "Content-Type: application/json" \
|
|
70
|
+
-H "Authorization: Bearer $API_KEY" \
|
|
71
|
+
-d '{...}'
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### 6 — Python client
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
# against local container
|
|
78
|
+
python client.py
|
|
79
|
+
|
|
80
|
+
# against a remote VM
|
|
81
|
+
RERANKER_URL=http://10.0.0.5:8000 API_KEY=secret python client.py
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## API reference
|
|
87
|
+
|
|
88
|
+
### `GET /health`
|
|
89
|
+
|
|
90
|
+
```json
|
|
91
|
+
{"status": "ok", "model": "BAAI/bge-reranker-v2-m3", "max_length": 512}
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### `POST /rerank`
|
|
95
|
+
|
|
96
|
+
**Request body**
|
|
97
|
+
|
|
98
|
+
| Field | Type | Default | Description |
|
|
99
|
+
|--------------------|----------------|---------|------------------------------------------|
|
|
100
|
+
| `query` | string | — | The search query |
|
|
101
|
+
| `documents` | list of string | — | Candidate documents to score |
|
|
102
|
+
| `top_n` | int \| null | null | Return only the top N results |
|
|
103
|
+
| `max_length` | int \| null | null | Token limit per pair (env var fallback) |
|
|
104
|
+
| `return_documents` | bool | true | Include document text in the response |
|
|
105
|
+
|
|
106
|
+
**Response**
|
|
107
|
+
|
|
108
|
+
```json
|
|
109
|
+
{
|
|
110
|
+
"results": [
|
|
111
|
+
{"index": 0, "score": 0.9821, "document": "Machine learning enables …"},
|
|
112
|
+
{"index": 2, "score": 0.7634, "document": "Deep learning models …"}
|
|
113
|
+
],
|
|
114
|
+
"model": "BAAI/bge-reranker-v2-m3"
|
|
115
|
+
}
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Results are sorted by `score` descending. `index` refers to the original
|
|
119
|
+
position in the `documents` list so you can map back to your data.
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Reaching the service from another machine
|
|
124
|
+
|
|
125
|
+
There are three common approaches; choose based on your threat model.
|
|
126
|
+
|
|
127
|
+
### Option A — Same-VPC internal IP (recommended for GCP)
|
|
128
|
+
|
|
129
|
+
If your client runs in the same GCP VPC as the VM, use the VM's **internal
|
|
130
|
+
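IP** directly. The `default` network's pre-populated `default-allow-internal` rule already permits this; a custom VPC needs an ingress rule for tcp:8000, because firewall rules still apply on the private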
|
|
131
|
+
network.
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
RERANKER_URL=http://10.128.0.X:8000 python client.py
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Find the internal IP: `gcloud compute instances describe <VM_NAME> --format='get(networkInterfaces[0].networkIP)'`
|
|
138
|
+
|
|
139
|
+
### Option B — External IP with a locked-down firewall rule
|
|
140
|
+
|
|
141
|
+
Create a GCP firewall rule that allows TCP port 8000 only from specific source
|
|
142
|
+
IP ranges (your office CIDR, a bastion IP, etc.):
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
gcloud compute firewall-rules create allow-reranker \
|
|
146
|
+
--direction=INGRESS \
|
|
147
|
+
--action=ALLOW \
|
|
148
|
+
--rules=tcp:8000 \
|
|
149
|
+
--source-ranges=<YOUR_CIDR> \
|
|
150
|
+
--target-tags=reranker-vm
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
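Then add the `reranker-vm` network tag to your VM (`gcloud compute instances add-tags <VM_NAME> --tags=reranker-vm`) and call the service on its external IP: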
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
RERANKER_URL=http://<EXTERNAL_IP>:8000 python client.py
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
> **Warning** — plain HTTP is unencrypted. Anyone on the network path can
|
|
160
|
+
> read requests and responses, including the documents you are reranking.
|
|
161
|
+
> Use this option only within a trusted network or with TLS termination
|
|
162
|
+
> (e.g. a load balancer or nginx with a certificate).
|
|
163
|
+
|
|
164
|
+
### Option C — SSH tunnel (for local development)
|
|
165
|
+
|
|
166
|
+
Forward a local port to the service through an encrypted SSH session. No
|
|
167
|
+
firewall rule for port 8000 is needed (only SSH access to the VM), and no traffic leaves the tunnel unencrypted.
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
# In one terminal — keep this open
|
|
171
|
+
gcloud compute ssh <VM_NAME> -- -N -L 8000:localhost:8000
|
|
172
|
+
|
|
173
|
+
# In another terminal
|
|
174
|
+
curl http://localhost:8000/health
|
|
175
|
+
python client.py # uses http://localhost:8000 by default
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Environment variables
|
|
181
|
+
|
|
182
|
+
| Variable | Default | Description |
|
|
183
|
+
|-------------------|---------------------------------|------------------------------------------|
|
|
184
|
+
| `MODEL_DIR` | `/models/onnx_reranker_quant` | Path to the quantized ONNX model dir |
|
|
185
|
+
| `MAX_LENGTH` | `512` | Default token limit per query+doc pair |
|
|
186
|
+
| `OMP_NUM_THREADS` | `2` | PyTorch / OpenBLAS thread count |
|
|
187
|
+
| `ORT_NUM_THREADS` | `2` | ONNX Runtime intra-op thread count |
|
|
188
|
+
| `API_KEY` | _(unset = auth disabled)_ | Bearer token for `/rerank` |
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
## Notes
|
|
193
|
+
|
|
194
|
+
- **Single Uvicorn worker** is intentional. Each worker loads a full copy of
|
|
195
|
+
the model. On a 2-vCPU VM a second worker would double RAM usage and cause
|
|
196
|
+
thread contention rather than improve throughput.
|
|
197
|
+
- **INT8 quantization** cuts model size by ~3× and speeds up matrix
|
|
198
|
+
multiplications on CPUs that support AVX2 or AVX-512 VNNI (most modern Intel
|
|
199
|
+
and AMD cores). Accuracy loss on typical reranking benchmarks is < 1%.
|
|
200
|
+
- **TLS** — this service speaks plain HTTP. For production deployments on the
|
|
201
|
+
public internet, terminate TLS at a load balancer or a reverse proxy (nginx,
|
|
202
|
+
Caddy) and keep the container on an internal network.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sg-reranker"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Lightweight client for the SG reranker service"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "Proprietary" }
|
|
12
|
+
dependencies = ["requests>=2.28"]
|
|
13
|
+
|
|
14
|
+
[project.optional-dependencies]
|
|
15
|
+
dev = ["pytest>=7", "pytest-cov"]
|
|
16
|
+
|
|
17
|
+
[tool.setuptools.packages.find]
|
|
18
|
+
where = ["src"]
|
|
19
|
+
namespaces = true # enables implicit namespace packages (sg.reranker, sg.embeddings, ...)
|
|
20
|
+
|
|
21
|
+
[tool.pytest.ini_options]
|
|
22
|
+
testpaths = ["tests"]
|
|
23
|
+
|
|
24
|
+
[tool.coverage.run]
|
|
25
|
+
source = ["src"]
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import TYPE_CHECKING, Optional
|
|
5
|
+
|
|
6
|
+
import requests
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from ._models import RerankResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SGServiceClient(ABC):
    """Base for all SG service clients.

    Stores the connection settings shared by every client: the service base
    URL, an optional bearer token, and the ``requests`` session used for HTTP
    calls. Inject a custom session for testing or connection pooling.

    Args:
        base_url: Root URL of the service. Trailing slashes are stripped so
            endpoint paths can be appended with a single ``/``.
        api_key: Optional bearer token. When it is ``None`` or empty, no
            ``Authorization`` header is sent.
        session: Optional pre-configured session. A new ``requests.Session``
            is created only when this is ``None``.
    """

    def __init__(
        self,
        base_url: str,
        api_key: Optional[str] = None,
        session: Optional[requests.Session] = None,
    ) -> None:
        self._base_url = base_url.rstrip("/")
        self._api_key = api_key
        # Compare against None explicitly: an injected session that happens to
        # be falsy (for example a wrapper defining __len__ or __bool__) must
        # not be silently replaced by a fresh one.
        self._session = session if session is not None else requests.Session()

    def _headers(self) -> dict[str, str]:
        """Return the headers attached to every request.

        Always includes a JSON ``Content-Type``; adds a bearer
        ``Authorization`` header only when an API key was supplied.
        """
        headers = {"Content-Type": "application/json"}
        if self._api_key:
            headers["Authorization"] = f"Bearer {self._api_key}"
        return headers

    @abstractmethod
    def health(self) -> dict:
        """Report service health; concrete clients must implement this."""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class InferenceInterface(ABC):
    """Abstract contract for inference services.

    Subclass this and implement ``rerank`` to add a new inference backend.
    """

    @abstractmethod
    def rerank(
        self,
        model: str,
        query: str,
        documents: list[str],
        top_n: Optional[int] = None,
        return_documents: bool = True,
        max_length: Optional[int] = None,
    ) -> "RerankResult":
        """Score ``documents`` against ``query`` and return the ranking.

        Args:
            model: Name of the model the caller wants to use.
            query: The search query.
            documents: Candidate documents to score.
            top_n: Return only the top N results when given.
            return_documents: Include document text in the result.
            max_length: Token limit per query/document pair when given.

        Returns:
            A ``RerankResult`` produced by the concrete backend.
        """
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
from ._base import InferenceInterface, SGServiceClient
|
|
8
|
+
from ._models import RankedDocument, RerankResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class _Inference(InferenceInterface):
    """HTTP-backed implementation of the inference contract.

    Bound to a single ``Reranker`` client and reuses that client's session,
    base URL and headers for every call.
    """

    def __init__(self, client: Reranker) -> None:
        self._client = client

    def rerank(
        self,
        model: str,
        query: str,
        documents: list[str],
        top_n: Optional[int] = None,
        return_documents: bool = True,
        max_length: Optional[int] = None,
    ) -> RerankResult:
        """POST a rerank request to ``/rerank`` and parse the response.

        Args:
            model: Accepted for interface compatibility. NOTE: it is not
                included in the request payload; the model name in the result
                is taken from the service response.
            query: The search query.
            documents: Candidate documents to score.
            top_n: Sent as ``top_n`` only when not ``None``.
            return_documents: Always sent; asks the service to include the
                document text in each result.
            max_length: Sent as ``max_length`` only when not ``None``.

        Returns:
            A ``RerankResult`` holding the response's ``model`` and one
            ``RankedDocument`` per entry of ``results``, in response order.

        Raises:
            KeyError: If the response body lacks ``model`` or ``results``, or
                a result entry lacks ``index`` or ``score``.
            Exception: Whatever ``raise_for_status()`` raises for an
                unsuccessful HTTP response.
        """
        payload: dict = {
            "query": query,
            "documents": documents,
            "return_documents": return_documents,
        }
        # Optional fields are omitted entirely so the service applies its own
        # defaults instead of receiving explicit nulls.
        if top_n is not None:
            payload["top_n"] = top_n
        if max_length is not None:
            payload["max_length"] = max_length

        response = self._client._session.post(
            f"{self._client._base_url}/rerank",
            json=payload,
            headers=self._client._headers(),
            timeout=30,
        )
        response.raise_for_status()
        body = response.json()

        return RerankResult(
            model=body["model"],
            data=[
                RankedDocument(
                    index=item["index"],
                    score=item["score"],
                    # Test for None rather than truthiness: an empty-string
                    # document is still a returned document and must not be
                    # collapsed into "no document".
                    document=(
                        {"text": item["document"]}
                        if item.get("document") is not None
                        else None
                    ),
                )
                for item in body["results"]
            ],
        )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class Reranker(SGServiceClient):
    """Client for the BGE reranker service.

    Example::

        client = Reranker("http://reranker.example.internal:8004", api_key="secret")
        results = client.inference.rerank(
            model="bge-reranker-v2-m3",
            query="What is ML?",
            documents=[...],
            top_n=3,
        )
        for r in results.data:
            print(r.index, r.score, r.document["text"])
    """

    def __init__(
        self,
        base_url: str = "http://localhost:8004",
        api_key: Optional[str] = None,
        session: Optional[requests.Session] = None,
    ) -> None:
        """Set up the shared connection state and the ``inference`` namespace."""
        super().__init__(base_url, api_key, session)
        # Rerank calls are exposed as ``client.inference.rerank(...)``.
        self.inference = _Inference(self)

    def health(self) -> dict:
        """GET ``/health`` and return the decoded JSON body.

        Uses a 10-second timeout and calls ``raise_for_status()`` before
        decoding, so an unsuccessful response raises instead of returning.
        """
        url = f"{self._base_url}/health"
        response = self._session.get(url, headers=self._headers(), timeout=10)
        response.raise_for_status()
        return response.json()
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class RankedDocument:
    """One scored entry of a rerank response."""

    # Position of the document in the ``documents`` list sent with the request.
    index: int
    # Score reported by the service for this document.
    score: float
    # {"text": "..."} when return_documents=True, else None. Defaults to None
    # so results without document text can be built without passing it.
    document: Optional[dict] = None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class RerankResult:
    """Parsed result of a rerank call."""

    # Ranked entries, kept in the order in which they were supplied.
    data: list[RankedDocument]
    # Model identifier associated with this result.
    model: str
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sg-reranker
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight client for the SG reranker service
|
|
5
|
+
License: Proprietary
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: requests>=2.28
|
|
9
|
+
Provides-Extra: dev
|
|
10
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
11
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
12
|
+
|
|
13
|
+
# BGE Reranker v2-m3 — Docker Service
|
|
14
|
+
|
|
15
|
+
A production-ready CPU reranker microservice running
|
|
16
|
+
[BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)
|
|
17
|
+
via ONNX Runtime (INT8 dynamic quantization).
|
|
18
|
+
|
|
19
|
+
Tuned for a GCP **n2-highmem-2** VM: 2 vCPUs, 16 GB RAM.
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Quick start
|
|
24
|
+
|
|
25
|
+
### 1 — Build
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
docker compose build
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
The build downloads the model from Hugging Face and bakes the quantized ONNX
|
|
32
|
+
file into the image (≈ 5–15 min, depending on network speed and CPU).
|
|
33
|
+
Subsequent rebuilds are fast because Docker caches the conversion layer as long
|
|
34
|
+
as `requirements.txt` and `app/convert.py` are unchanged.
|
|
35
|
+
|
|
36
|
+
### 2 — Configure (optional)
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
cp .env.example .env
|
|
40
|
+
# Edit .env to set API_KEY, MAX_LENGTH, thread counts, etc.
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### 3 — Run
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
docker compose up -d
|
|
47
|
+
docker compose logs -f # watch startup + warmup
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
The service is ready when you see **"Model ready — serving on port 8000"** in
|
|
51
|
+
the logs (usually 10–30 s after container start).
|
|
52
|
+
|
|
53
|
+
### 4 — Health check
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
curl http://localhost:8000/health
|
|
57
|
+
# {"status":"ok","model":"BAAI/bge-reranker-v2-m3","max_length":512}
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### 5 — Rerank
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
curl -s -X POST http://localhost:8000/rerank \
|
|
64
|
+
-H "Content-Type: application/json" \
|
|
65
|
+
-d '{
|
|
66
|
+
"query": "What is machine learning?",
|
|
67
|
+
"documents": [
|
|
68
|
+
"Machine learning enables systems to learn from data without explicit programming.",
|
|
69
|
+
"Python is a popular general-purpose programming language.",
|
|
70
|
+
"Deep learning models complex patterns using multi-layer neural networks."
|
|
71
|
+
],
|
|
72
|
+
"top_n": 2,
|
|
73
|
+
"return_documents": true
|
|
74
|
+
}' | python3 -m json.tool
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
With bearer-token auth enabled (`API_KEY` set in `.env`):
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
curl -s -X POST http://localhost:8000/rerank \
|
|
81
|
+
-H "Content-Type: application/json" \
|
|
82
|
+
-H "Authorization: Bearer $API_KEY" \
|
|
83
|
+
-d '{...}'
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### 6 — Python client
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# against local container
|
|
90
|
+
python client.py
|
|
91
|
+
|
|
92
|
+
# against a remote VM
|
|
93
|
+
RERANKER_URL=http://10.0.0.5:8000 API_KEY=secret python client.py
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## API reference
|
|
99
|
+
|
|
100
|
+
### `GET /health`
|
|
101
|
+
|
|
102
|
+
```json
|
|
103
|
+
{"status": "ok", "model": "BAAI/bge-reranker-v2-m3", "max_length": 512}
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### `POST /rerank`
|
|
107
|
+
|
|
108
|
+
**Request body**
|
|
109
|
+
|
|
110
|
+
| Field | Type | Default | Description |
|
|
111
|
+
|--------------------|----------------|---------|------------------------------------------|
|
|
112
|
+
| `query` | string | — | The search query |
|
|
113
|
+
| `documents` | list of string | — | Candidate documents to score |
|
|
114
|
+
| `top_n` | int \| null | null | Return only the top N results |
|
|
115
|
+
| `max_length` | int \| null | null | Token limit per pair (env var fallback) |
|
|
116
|
+
| `return_documents` | bool | true | Include document text in the response |
|
|
117
|
+
|
|
118
|
+
**Response**
|
|
119
|
+
|
|
120
|
+
```json
|
|
121
|
+
{
|
|
122
|
+
"results": [
|
|
123
|
+
{"index": 0, "score": 0.9821, "document": "Machine learning enables …"},
|
|
124
|
+
{"index": 2, "score": 0.7634, "document": "Deep learning models …"}
|
|
125
|
+
],
|
|
126
|
+
"model": "BAAI/bge-reranker-v2-m3"
|
|
127
|
+
}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Results are sorted by `score` descending. `index` refers to the original
|
|
131
|
+
position in the `documents` list so you can map back to your data.
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## Reaching the service from another machine
|
|
136
|
+
|
|
137
|
+
There are three common approaches; choose based on your threat model.
|
|
138
|
+
|
|
139
|
+
### Option A — Same-VPC internal IP (recommended for GCP)
|
|
140
|
+
|
|
141
|
+
If your client runs in the same GCP VPC as the VM, use the VM's **internal
|
|
142
|
+
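IP** directly. The `default` network's pre-populated `default-allow-internal` rule already permits this; a custom VPC needs an ingress rule for tcp:8000, because firewall rules still apply on the private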
|
|
143
|
+
network.
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
RERANKER_URL=http://10.128.0.X:8000 python client.py
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Find the internal IP: `gcloud compute instances describe <VM_NAME> --format='get(networkInterfaces[0].networkIP)'`
|
|
150
|
+
|
|
151
|
+
### Option B — External IP with a locked-down firewall rule
|
|
152
|
+
|
|
153
|
+
Create a GCP firewall rule that allows TCP port 8000 only from specific source
|
|
154
|
+
IP ranges (your office CIDR, a bastion IP, etc.):
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
gcloud compute firewall-rules create allow-reranker \
|
|
158
|
+
--direction=INGRESS \
|
|
159
|
+
--action=ALLOW \
|
|
160
|
+
--rules=tcp:8000 \
|
|
161
|
+
--source-ranges=<YOUR_CIDR> \
|
|
162
|
+
--target-tags=reranker-vm
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
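Then add the `reranker-vm` network tag to your VM (`gcloud compute instances add-tags <VM_NAME> --tags=reranker-vm`) and call the service on its external IP: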
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
RERANKER_URL=http://<EXTERNAL_IP>:8000 python client.py
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
> **Warning** — plain HTTP is unencrypted. Anyone on the network path can
|
|
172
|
+
> read requests and responses, including the documents you are reranking.
|
|
173
|
+
> Use this option only within a trusted network or with TLS termination
|
|
174
|
+
> (e.g. a load balancer or nginx with a certificate).
|
|
175
|
+
|
|
176
|
+
### Option C — SSH tunnel (for local development)
|
|
177
|
+
|
|
178
|
+
Forward a local port to the service through an encrypted SSH session. No
|
|
179
|
+
firewall rule for port 8000 is needed (only SSH access to the VM), and no traffic leaves the tunnel unencrypted.
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
# In one terminal — keep this open
|
|
183
|
+
gcloud compute ssh <VM_NAME> -- -N -L 8000:localhost:8000
|
|
184
|
+
|
|
185
|
+
# In another terminal
|
|
186
|
+
curl http://localhost:8000/health
|
|
187
|
+
python client.py # uses http://localhost:8000 by default
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
## Environment variables
|
|
193
|
+
|
|
194
|
+
| Variable | Default | Description |
|
|
195
|
+
|-------------------|---------------------------------|------------------------------------------|
|
|
196
|
+
| `MODEL_DIR` | `/models/onnx_reranker_quant` | Path to the quantized ONNX model dir |
|
|
197
|
+
| `MAX_LENGTH` | `512` | Default token limit per query+doc pair |
|
|
198
|
+
| `OMP_NUM_THREADS` | `2` | PyTorch / OpenBLAS thread count |
|
|
199
|
+
| `ORT_NUM_THREADS` | `2` | ONNX Runtime intra-op thread count |
|
|
200
|
+
| `API_KEY` | _(unset = auth disabled)_ | Bearer token for `/rerank` |
|
|
201
|
+
|
|
202
|
+
---
|
|
203
|
+
|
|
204
|
+
## Notes
|
|
205
|
+
|
|
206
|
+
- **Single Uvicorn worker** is intentional. Each worker loads a full copy of
|
|
207
|
+
the model. On a 2-vCPU VM a second worker would double RAM usage and cause
|
|
208
|
+
thread contention rather than improve throughput.
|
|
209
|
+
- **INT8 quantization** cuts model size by ~3× and speeds up matrix
|
|
210
|
+
multiplications on CPUs that support AVX2 or AVX-512 VNNI (most modern Intel
|
|
211
|
+
and AMD cores). Accuracy loss on typical reranking benchmarks is < 1%.
|
|
212
|
+
- **TLS** — this service speaks plain HTTP. For production deployments on the
|
|
213
|
+
public internet, terminate TLS at a load balancer or a reverse proxy (nginx,
|
|
214
|
+
Caddy) and keep the container on an internal network.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/sg/reranker/__init__.py
|
|
4
|
+
src/sg/reranker/_base.py
|
|
5
|
+
src/sg/reranker/_client.py
|
|
6
|
+
src/sg/reranker/_models.py
|
|
7
|
+
src/sg_reranker.egg-info/PKG-INFO
|
|
8
|
+
src/sg_reranker.egg-info/SOURCES.txt
|
|
9
|
+
src/sg_reranker.egg-info/dependency_links.txt
|
|
10
|
+
src/sg_reranker.egg-info/requires.txt
|
|
11
|
+
src/sg_reranker.egg-info/top_level.txt
|
|
12
|
+
tests/test_reranker.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
sg
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from unittest.mock import patch
|
|
2
|
+
import pytest
|
|
3
|
+
from sg.reranker import Reranker, RankedDocument, RerankResult
|
|
4
|
+
from conftest import mock_response, RERANK_RESPONSE, HEALTH_RESPONSE
|
|
5
|
+
|
|
6
|
+
# Query string passed to the rerank calls in this module.
QUERY = "What is machine learning?"

# Candidate documents passed to the rerank calls alongside QUERY.
DOCS = [
    "Machine learning is a subset of AI.",
    "Paris is the capital of France.",
    "Deep learning uses neural networks.",
]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TestHealth:
    """Tests for ``client.health()`` with the session's ``get`` patched out."""

    def test_returns_status_ok(self, client):
        """The canned health response yields a ``status`` of ``"ok"``."""
        fake_reply = mock_response(HEALTH_RESPONSE)
        with patch.object(client._session, "get", return_value=fake_reply):
            payload = client.health()
        assert payload["status"] == "ok"

    def test_calls_correct_endpoint(self, client):
        """``health()`` issues exactly one GET with the expected URL, headers and timeout."""
        fake_reply = mock_response(HEALTH_RESPONSE)
        with patch.object(client._session, "get", return_value=fake_reply) as session_get:
            client.health()
        session_get.assert_called_once_with(
            "http://test-server:8004/health",
            timeout=10,
            headers={"Content-Type": "application/json"},
        )

    def test_raises_on_server_error(self, client):
        """An error raised by ``raise_for_status`` on a 503 reply propagates to the caller."""
        failing_reply = mock_response({}, status_code=503)
        failing_reply.raise_for_status.side_effect = Exception("Service unavailable")
        with patch.object(client._session, "get", return_value=failing_reply), pytest.raises(
            Exception, match="Service unavailable"
        ):
            client.health()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class TestRerank:
    """Tests for ``client.inference.rerank()`` with the session's ``post`` patched out."""

    @staticmethod
    def _stub_post(target, payload):
        """Return a patcher that makes ``target._session.post`` answer with *payload*."""
        return patch.object(target._session, "post", return_value=mock_response(payload))

    @staticmethod
    def _rerank(target, **extra):
        """Call ``rerank`` on *target* with the shared model, query and documents plus *extra*."""
        return target.inference.rerank(
            model="bge-reranker-v2-m3", query=QUERY, documents=DOCS, **extra
        )

    def test_returns_rerank_result(self, client):
        """The call returns a ``RerankResult`` carrying the model name from the response."""
        with self._stub_post(client, RERANK_RESPONSE):
            outcome = self._rerank(client)
        assert isinstance(outcome, RerankResult)
        assert outcome.model == "BAAI/bge-reranker-v2-m3"

    def test_data_contains_ranked_documents(self, client):
        """``data`` holds three entries, each a ``RankedDocument``."""
        with self._stub_post(client, RERANK_RESPONSE):
            outcome = self._rerank(client)
        assert len(outcome.data) == 3
        for entry in outcome.data:
            assert isinstance(entry, RankedDocument)

    def test_document_text_wrapped_in_dict(self, client):
        """The first entry's ``document`` is a dict with the text under ``"text"``."""
        expected = {"text": "Machine learning is a subset of AI."}
        with self._stub_post(client, RERANK_RESPONSE):
            outcome = self._rerank(client)
        assert outcome.data[0].document == expected

    def test_top_n_sent_in_payload(self, client):
        """``top_n`` is forwarded in the JSON body of the POST."""
        with self._stub_post(client, RERANK_RESPONSE) as session_post:
            self._rerank(client, top_n=2)
        sent_body = session_post.call_args.kwargs["json"]
        assert sent_body["top_n"] == 2

    def test_max_length_sent_in_payload(self, client):
        """``max_length`` is forwarded in the JSON body of the POST."""
        with self._stub_post(client, RERANK_RESPONSE) as session_post:
            self._rerank(client, max_length=256)
        sent_body = session_post.call_args.kwargs["json"]
        assert sent_body["max_length"] == 256

    def test_return_documents_false_omits_text(self, client):
        """A response entry whose ``document`` is ``None`` is surfaced as ``None``."""
        canned = {
            "model": "BAAI/bge-reranker-v2-m3",
            "results": [{"index": 0, "score": 0.9, "document": None}],
        }
        with self._stub_post(client, canned):
            outcome = self._rerank(client, return_documents=False)
        assert outcome.data[0].document is None

    def test_api_key_sent_as_bearer_token(self, auth_client):
        """The ``auth_client`` fixture sends ``Authorization: Bearer secret``."""
        with self._stub_post(auth_client, RERANK_RESPONSE) as session_post:
            self._rerank(auth_client)
        sent_headers = session_post.call_args.kwargs["headers"]
        assert sent_headers["Authorization"] == "Bearer secret"

    def test_no_api_key_omits_auth_header(self, client):
        """The plain ``client`` fixture sends no ``Authorization`` header."""
        with self._stub_post(client, RERANK_RESPONSE) as session_post:
            self._rerank(client)
        sent_headers = session_post.call_args.kwargs["headers"]
        assert "Authorization" not in sent_headers

    def test_raises_on_401(self, client):
        """An error raised by ``raise_for_status`` on a 401 reply propagates to the caller."""
        rejected_reply = mock_response({}, status_code=401)
        rejected_reply.raise_for_status.side_effect = Exception("Unauthorized")
        with patch.object(client._session, "post", return_value=rejected_reply), pytest.raises(
            Exception, match="Unauthorized"
        ):
            self._rerank(client)
|