embedprobe 0.0.0__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- embedprobe-0.1.0/.github/workflows/ci.yml +22 -0
- embedprobe-0.1.0/.gitignore +12 -0
- embedprobe-0.1.0/PKG-INFO +105 -0
- embedprobe-0.1.0/README.md +77 -0
- embedprobe-0.1.0/pyproject.toml +45 -0
- embedprobe-0.1.0/src/embedprobe/__init__.py +19 -0
- embedprobe-0.1.0/src/embedprobe/cli.py +85 -0
- embedprobe-0.1.0/src/embedprobe/data.py +67 -0
- embedprobe-0.1.0/src/embedprobe/embeddings.py +77 -0
- embedprobe-0.1.0/src/embedprobe/levels/__init__.py +15 -0
- embedprobe-0.1.0/src/embedprobe/levels/level0.py +72 -0
- embedprobe-0.1.0/src/embedprobe/levels/level1.py +58 -0
- embedprobe-0.1.0/src/embedprobe/levels/level2.py +84 -0
- embedprobe-0.1.0/src/embedprobe/levels/level3.py +93 -0
- embedprobe-0.1.0/src/embedprobe/probe.py +99 -0
- embedprobe-0.1.0/src/embedprobe/report/__init__.py +3 -0
- embedprobe-0.1.0/src/embedprobe/report/html.py +136 -0
- embedprobe-0.1.0/src/embedprobe/report/plots.py +116 -0
- embedprobe-0.1.0/src/embedprobe/report/result.py +74 -0
- embedprobe-0.1.0/src/embedprobe/taxonomy.py +75 -0
- embedprobe-0.1.0/tests/test_data.py +36 -0
- embedprobe-0.1.0/tests/test_levels.py +95 -0
- embedprobe-0.1.0/tests/test_probe_report_cli.py +63 -0
- embedprobe-0.0.0/PKG-INFO +0 -43
- embedprobe-0.0.0/README.MD +0 -27
- embedprobe-0.0.0/README.md +0 -27
- embedprobe-0.0.0/pyproject.toml +0 -25
- embedprobe-0.0.0/src/embedprobe/__init__.py +0 -3
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
pull_request:
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
test:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
strategy:
|
|
11
|
+
fail-fast: false
|
|
12
|
+
matrix:
|
|
13
|
+
python-version: ["3.10", "3.12"]
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- uses: actions/setup-python@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: ${{ matrix.python-version }}
|
|
19
|
+
- name: Install package
|
|
20
|
+
run: python -m pip install -e ".[dev]"
|
|
21
|
+
- name: Run tests
|
|
22
|
+
run: pytest
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: embedprobe
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A diagnostic toolkit for evaluating and selecting multilingual embedding models on your data.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Sainath26/embedprobe
|
|
6
|
+
Author-email: Harish Sainath S <harishsainth036@gmail.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Keywords: diagnostics,embeddings,evaluation,model-selection,nlp,transformers
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Requires-Dist: matplotlib>=3.6
|
|
16
|
+
Requires-Dist: numpy>=1.22
|
|
17
|
+
Requires-Dist: pandas>=1.5
|
|
18
|
+
Requires-Dist: rich>=13
|
|
19
|
+
Requires-Dist: scikit-learn>=1.1
|
|
20
|
+
Requires-Dist: scipy>=1.9
|
|
21
|
+
Requires-Dist: sentence-transformers>=2.2
|
|
22
|
+
Requires-Dist: typer>=0.9
|
|
23
|
+
Requires-Dist: umap-learn>=0.5
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: build; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest>=7; extra == 'dev'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# embedprobe
|
|
30
|
+
|
|
31
|
+
**A diagnostic toolkit for evaluating and selecting multilingual embedding models on _your_ data.**
|
|
32
|
+
|
|
33
|
+
Leaderboards like [MTEB](https://github.com/embeddings-benchmark/mteb) tell you _which_ embedding
|
|
34
|
+
model ranks higher on average. **embedprobe tells you _why_ a model fails on your domain and
|
|
35
|
+
language pair,** so you can pick a compact encoder for your task without fine-tuning every
|
|
36
|
+
candidate.
|
|
37
|
+
|
|
38
|
+
Given a parallel dataset (source text, target text, topic), embedprobe dissects each candidate
|
|
39
|
+
model across four diagnostic levels:
|
|
40
|
+
|
|
41
|
+
| Level | Question it answers | Signals |
|
|
42
|
+
| ------------------------ | ----------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- |
|
|
43
|
+
| **0 — Signal vs. noise** | Does the model separate true pairs from random ones at all? | true-vs-random cosine distributions, SNR, Kolmogorov–Smirnov test |
|
|
44
|
+
| **1 — Retrieval** | How reliably does it retrieve the true counterpart? | Recall@k, Precision@k, MRR, CMC curve |
|
|
45
|
+
| **2 — Topic structure** | Is the space organized by meaning or leaking across topics? | retrieval-based topic confusion, topic-pair average cosine, UMAP projections |
|
|
46
|
+
| **3 — Error taxonomy** | _Why_ do retrievals miss? | Jaccard-based categorization of misses into **lexical confusion**, **semantic confusion**, and **topic-boundary fuzziness** |
|
|
47
|
+
|
|
48
|
+
The Level 3 error taxonomy is the headline: instead of a single aggregate score, each retrieval
|
|
49
|
+
miss is classified by token-overlap between the retrieved and the true target, telling you whether
|
|
50
|
+
a model is being fooled by surface overlap, drifting semantically, or blurring topic boundaries.
|
|
51
|
+
|
|
52
|
+
## Install
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install embedprobe
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
UMAP projection support is included for Level 2 visual diagnostics.
|
|
59
|
+
|
|
60
|
+
## Quickstart
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
import pandas as pd
|
|
64
|
+
from embedprobe import probe
|
|
65
|
+
|
|
66
|
+
# parallel data: one row per pair, plus a topic column
|
|
67
|
+
df = pd.read_csv("pairs.csv") # columns: en, es, topic
|
|
68
|
+
|
|
69
|
+
report = probe(
|
|
70
|
+
models=["sentence-transformers/LaBSE",
|
|
71
|
+
"sentence-transformers/distiluse-base-multilingual-cased-v2"],
|
|
72
|
+
data=df,
|
|
73
|
+
src_col="en",
|
|
74
|
+
tgt_col="es",
|
|
75
|
+
seed=42,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
report.summary() # cross-model DataFrame of all metrics
|
|
79
|
+
report.to_json("report.json")
|
|
80
|
+
report.to_html("report.html") # self-contained diagnostic dashboard
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Or from the command line:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
embedprobe run --models sentence-transformers/LaBSE --data pairs.csv \
|
|
87
|
+
--src en --tgt es --out report
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Status
|
|
91
|
+
|
|
92
|
+
Pre-release (`0.x`). The toolkit originates from an MSc dissertation study of 21
|
|
93
|
+
sentence-transformer models across EN–ES, EN–FR and EN–ZH parallel data; the packaged version is
|
|
94
|
+
being hardened for a 2026 workshop submission. APIs may change until `1.0`.
|
|
95
|
+
|
|
96
|
+
## Roadmap
|
|
97
|
+
|
|
98
|
+
- [ ] Empirical validation of the Jaccard taxonomy thresholds
|
|
99
|
+
- [ ] Selection-prediction study: do these diagnostics predict downstream task ranking?
|
|
100
|
+
- [ ] MTEB adapter: shortlist from the leaderboard → diagnose on your data
|
|
101
|
+
- [ ] Decoder-only LLM support (MEXA-style alignment probing)
|
|
102
|
+
|
|
103
|
+
## License
|
|
104
|
+
|
|
105
|
+
MIT
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# embedprobe
|
|
2
|
+
|
|
3
|
+
**A diagnostic toolkit for evaluating and selecting multilingual embedding models on _your_ data.**
|
|
4
|
+
|
|
5
|
+
Leaderboards like [MTEB](https://github.com/embeddings-benchmark/mteb) tell you _which_ embedding
|
|
6
|
+
model ranks higher on average. **embedprobe tells you _why_ a model fails on your domain and
|
|
7
|
+
language pair,** so you can pick a compact encoder for your task without fine-tuning every
|
|
8
|
+
candidate.
|
|
9
|
+
|
|
10
|
+
Given a parallel dataset (source text, target text, topic), embedprobe dissects each candidate
|
|
11
|
+
model across four diagnostic levels:
|
|
12
|
+
|
|
13
|
+
| Level | Question it answers | Signals |
|
|
14
|
+
| ------------------------ | ----------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- |
|
|
15
|
+
| **0 — Signal vs. noise** | Does the model separate true pairs from random ones at all? | true-vs-random cosine distributions, SNR, Kolmogorov–Smirnov test |
|
|
16
|
+
| **1 — Retrieval** | How reliably does it retrieve the true counterpart? | Recall@k, Precision@k, MRR, CMC curve |
|
|
17
|
+
| **2 — Topic structure** | Is the space organized by meaning or leaking across topics? | retrieval-based topic confusion, topic-pair average cosine, UMAP projections |
|
|
18
|
+
| **3 — Error taxonomy** | _Why_ do retrievals miss? | Jaccard-based categorization of misses into **lexical confusion**, **semantic confusion**, and **topic-boundary fuzziness** |
|
|
19
|
+
|
|
20
|
+
The Level 3 error taxonomy is the headline: instead of a single aggregate score, each retrieval
|
|
21
|
+
miss is classified by token-overlap between the retrieved and the true target, telling you whether
|
|
22
|
+
a model is being fooled by surface overlap, drifting semantically, or blurring topic boundaries.
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install embedprobe
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
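UMAP projection support for the Level 2 visual diagnostics is installed by default; projections are only computed on request (`--umap` on the command line, or `compute_umap=True` in `probe`).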
|
|
31
|
+
|
|
32
|
+
## Quickstart
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
import pandas as pd
|
|
36
|
+
from embedprobe import probe
|
|
37
|
+
|
|
38
|
+
# parallel data: one row per pair, plus a topic column
|
|
39
|
+
df = pd.read_csv("pairs.csv") # columns: en, es, topic
|
|
40
|
+
|
|
41
|
+
report = probe(
|
|
42
|
+
models=["sentence-transformers/LaBSE",
|
|
43
|
+
"sentence-transformers/distiluse-base-multilingual-cased-v2"],
|
|
44
|
+
data=df,
|
|
45
|
+
src_col="en",
|
|
46
|
+
tgt_col="es",
|
|
47
|
+
seed=42,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
report.summary() # cross-model DataFrame of all metrics
|
|
51
|
+
report.to_json("report.json")
|
|
52
|
+
report.to_html("report.html") # self-contained diagnostic dashboard
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Or from the command line:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
embedprobe run --models sentence-transformers/LaBSE --data pairs.csv \
|
|
59
|
+
--src en --tgt es --out report
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Status
|
|
63
|
+
|
|
64
|
+
Pre-release (`0.x`). The toolkit originates from an MSc dissertation study of 21
|
|
65
|
+
sentence-transformer models across EN–ES, EN–FR and EN–ZH parallel data; the packaged version is
|
|
66
|
+
being hardened for a 2026 workshop submission. APIs may change until `1.0`.
|
|
67
|
+
|
|
68
|
+
## Roadmap
|
|
69
|
+
|
|
70
|
+
- [ ] Empirical validation of the Jaccard taxonomy thresholds
|
|
71
|
+
- [ ] Selection-prediction study: do these diagnostics predict downstream task ranking?
|
|
72
|
+
- [ ] MTEB adapter: shortlist from the leaderboard → diagnose on your data
|
|
73
|
+
- [ ] Decoder-only LLM support (MEXA-style alignment probing)
|
|
74
|
+
|
|
75
|
+
## License
|
|
76
|
+
|
|
77
|
+
MIT
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "embedprobe"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A diagnostic toolkit for evaluating and selecting multilingual embedding models on your data."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Harish Sainath S", email = "harishsainth036@gmail.com" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["embeddings", "nlp", "evaluation", "diagnostics", "transformers", "model-selection"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"numpy>=1.22",
|
|
25
|
+
"pandas>=1.5",
|
|
26
|
+
"scipy>=1.9",
|
|
27
|
+
"scikit-learn>=1.1",
|
|
28
|
+
"matplotlib>=3.6",
|
|
29
|
+
"sentence-transformers>=2.2",
|
|
30
|
+
"typer>=0.9",
|
|
31
|
+
"rich>=13",
|
|
32
|
+
"umap-learn>=0.5",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.optional-dependencies]
|
|
36
|
+
dev = ["pytest>=7", "build"]
|
|
37
|
+
|
|
38
|
+
[project.scripts]
|
|
39
|
+
embedprobe = "embedprobe.cli:app"
|
|
40
|
+
|
|
41
|
+
[project.urls]
|
|
42
|
+
Homepage = "https://github.com/Sainath26/embedprobe"
|
|
43
|
+
|
|
44
|
+
[tool.hatch.build.targets.wheel]
|
|
45
|
+
packages = ["src/embedprobe"]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""embedprobe — a diagnostic toolkit for evaluating language-model embedding spaces."""
|
|
2
|
+
|
|
3
|
+
# Package version string; mirrors the ``version`` field declared in pyproject.toml.
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def __getattr__(name):
    """Resolve the package's public names lazily on first attribute access.

    ``probe`` is fetched from ``embedprobe.probe``; ``ProbeReport`` and
    ``ModelDiagnostics`` are fetched from ``embedprobe.report``. Any other
    name raises ``AttributeError``.
    """
    # Deferred imports keep `import embedprobe` cheap and avoid circular imports.
    if name in ("ProbeReport", "ModelDiagnostics"):
        from embedprobe import report as _report

        return getattr(_report, name)

    if name != "probe":
        raise AttributeError(f"module 'embedprobe' has no attribute {name!r}")

    from embedprobe.probe import probe as _probe

    return _probe
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Names exported by ``from embedprobe import *``; everything except
# ``__version__`` is resolved lazily through the module-level ``__getattr__``.
__all__ = ["probe", "ProbeReport", "ModelDiagnostics", "__version__"]
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Command-line interface: ``embedprobe run`` and ``embedprobe compare``."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
import typer
|
|
11
|
+
|
|
12
|
+
# Typer application; functions decorated with ``@app.command()`` in this module
# become its sub-commands.
app = typer.Typer(help="Diagnose multilingual embedding models on your own data.")
# Shared Rich console used by the commands to print summaries and status lines.
console = Console()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@app.command()
def run(
    models: List[str] = typer.Option(..., "--models", "-m", help="Hub model name (repeatable)."),
    data: Path = typer.Option(..., "--data", "-d", help="CSV/Parquet with parallel pairs."),
    src_col: str = typer.Option(..., "--src", "--src-col", help="Source-language text column."),
    tgt_col: str = typer.Option(..., "--tgt", "--tgt-col", help="Target-language text column."),
    topic_col: str = typer.Option("topic", help="Topic column (Level 2 is skipped if absent)."),
    out: Path = typer.Option(Path("embedprobe_report"), "--out", "-o", help="Output basename."),
    seed: int = typer.Option(42, help="Random seed."),
    batch_size: int = typer.Option(32, help="Encoding batch size."),
    device: Optional[str] = typer.Option(None, help="Torch device, e.g. cuda."),
    max_pairs: Optional[int] = typer.Option(None, help="Subsample the dataset to this many pairs."),
    umap: bool = typer.Option(False, "--umap", help="Compute UMAP projections (needs umap-learn)."),
):
    """Run the four-level diagnostic and write <out>.json and <out>.html."""
    # Imported here so that building the CLI does not import the probing code.
    from embedprobe.probe import probe

    # ``--models`` may be repeated and/or comma-separated; flatten it first.
    probe_kwargs = dict(
        models=_normalize_models(models),
        data=data,
        src_col=src_col,
        tgt_col=tgt_col,
        topic_col=topic_col,
        seed=seed,
        batch_size=batch_size,
        device=device,
        max_pairs=max_pairs,
        compute_umap=umap,
    )
    report = probe(**probe_kwargs)

    # Both artifacts share the ``out`` basename and differ only in suffix.
    out.parent.mkdir(parents=True, exist_ok=True)
    json_path = out.with_suffix(".json")
    html_path = out.with_suffix(".html")
    report.to_json(json_path)
    report.to_html(html_path)

    summary_text = report.summary().round(4).to_string()
    console.print(summary_text)
    console.print(f"\n[green]Wrote[/green] {json_path} and {html_path}")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@app.command()
def compare(
    reports: List[Path] = typer.Argument(..., help="embedprobe JSON reports to merge."),
    out: Optional[Path] = typer.Option(None, "--out", "-o", help="Write merged HTML here."),
):
    """Merge previously saved JSON reports into one comparison."""
    # Imported here so that building the CLI does not import the report code.
    from embedprobe.report import ModelDiagnostics, ProbeReport

    diagnostics, run_meta = [], {}
    for path in reports:
        payload = json.loads(path.read_text(encoding="utf-8"))
        # Run metadata is taken from the first report with a non-empty "run" section.
        run_meta = run_meta or payload.get("run", {})
        # Entries without a "model" key fall back to the report's file stem.
        diagnostics.extend(
            ModelDiagnostics(
                model_name=entry.get("model", path.stem),
                meta=entry.get("meta", {}),
                level0=entry.get("level0"),
                level1=entry.get("level1"),
                level2=entry.get("level2"),
                level3=entry.get("level3"),
            )
            for entry in payload.get("models", [])
        )

    merged = ProbeReport(models=diagnostics, run_meta=run_meta)
    console.print(merged.summary().round(4).to_string())
    if out is not None:
        # Same as the ``run`` command: make sure the destination directory
        # exists so writing into a not-yet-created folder does not fail.
        out.parent.mkdir(parents=True, exist_ok=True)
        merged.to_html(out)
        console.print(f"\n[green]Wrote[/green] {out}")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _normalize_models(models: List[str]) -> List[str]:
    """Flatten repeated and comma-separated model values into one list.

    Each item is split on commas, every piece is stripped of surrounding
    whitespace, and empty pieces are dropped. Input order is preserved.
    """
    pieces = (chunk.strip() for raw in models for chunk in raw.split(","))
    return [piece for piece in pieces if piece]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# Run the Typer application when this module is executed directly as a script.
if __name__ == "__main__":
    app()
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Loading and validating parallel-pair datasets."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional, Union
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
# Default name of the optional topic-label column.
TOPIC_COL = "topic"


def load_pairs(
    source: Union[str, Path, pd.DataFrame],
    src_col: str,
    tgt_col: str,
    topic_col: Optional[str] = TOPIC_COL,
    max_pairs: Optional[int] = None,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """Load a parallel dataset from a DataFrame, CSV or Parquet file.

    Args:
        source: An in-memory DataFrame (copied, never mutated) or a path.
            Paths ending in ``.parquet`` are read with ``pd.read_parquet``;
            every other path is read with ``pd.read_csv``.
        src_col: Name of the source-language text column (required).
        tgt_col: Name of the target-language text column (required).
        topic_col: Name of the topic column. It is kept only when it is not
            ``None`` and actually exists in the data.
        max_pairs: If given and the cleaned data has more rows than this,
            sample exactly this many rows.
        seed: ``random_state`` used for the sampling, for reproducibility.

    Returns:
        A DataFrame with the source, target and (if present) topic columns,
        rows with missing or whitespace-only source/target text dropped, and
        the index reset to ``0..n-1``.

    Raises:
        FileNotFoundError: ``source`` is a path that does not exist.
        ValueError: ``src_col`` or ``tgt_col`` is not a column of the data.
    """
    if isinstance(source, pd.DataFrame):
        df = source.copy()
    else:
        path = Path(source)
        if not path.exists():
            raise FileNotFoundError(f"Dataset not found: {path}")
        if path.suffix.lower() == ".parquet":
            df = pd.read_parquet(path)
        else:
            df = pd.read_csv(path)

    missing = [c for c in (src_col, tgt_col) if c not in df.columns]
    if missing:
        raise ValueError(
            f"Dataset is missing required column(s) {missing}; found {list(df.columns)}"
        )

    # Keep only the columns the diagnostics use; the topic column is optional.
    cols = [src_col, tgt_col]
    if topic_col is not None and topic_col in df.columns:
        cols.append(topic_col)
    df = df[cols]

    # A pair is usable only if both sides contain non-blank text.
    for col in (src_col, tgt_col):
        df = df[df[col].notna() & (df[col].astype(str).str.strip() != "")]

    if max_pairs is not None and len(df) > max_pairs:
        df = df.sample(n=max_pairs, random_state=seed)

    return df.reset_index(drop=True)


def has_topics(df: pd.DataFrame, topic_col: str = TOPIC_COL) -> bool:
    """Return whether ``df`` has a ``topic_col`` column with any non-null value.

    The result is always a built-in ``bool``. ``Series.any()`` yields a
    ``numpy.bool_``, which does not satisfy the annotated return type and is
    rejected by ``json.dumps``, so it is converted explicitly.
    """
    if topic_col not in df.columns:
        return False
    return bool(df[topic_col].notna().any())
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def dataset_hash(df: pd.DataFrame) -> str:
    """Return a SHA-256 hex digest fingerprinting the loaded evaluation pairs.

    Every column is cast to pandas' string dtype with missing values replaced
    by the empty string; the frame is then rendered as CSV (header included,
    index excluded, ``\\n`` line endings) and the UTF-8 bytes are hashed.
    """
    as_text = df.astype("string").fillna("")
    csv_text = as_text.to_csv(index=False, lineterminator="\n")
    digest = hashlib.sha256()
    digest.update(csv_text.encode("utf-8"))
    return digest.hexdigest()
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Embedding extraction and similarity computation.
|
|
2
|
+
|
|
3
|
+
Heavy dependencies (sentence-transformers / torch) are imported lazily so that
|
|
4
|
+
the pure-numpy diagnostics in :mod:`embedprobe.levels` stay usable without them.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import List, Optional, Sequence, Union
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def load_model(model: Union[str, object]):
    """Return a usable model, constructing one only when given a name.

    A string is passed to ``SentenceTransformer(...)``; anything else is
    assumed to be an already-loaded model and is returned unchanged.
    """
    if not isinstance(model, str):
        return model

    # Imported on demand so the heavy dependency is only needed for this path.
    from sentence_transformers import SentenceTransformer

    return SentenceTransformer(model)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def encode(
    model,
    texts: Sequence[str],
    batch_size: int = 32,
    device: Optional[str] = None,
    show_progress: bool = False,
) -> np.ndarray:
    """Encode ``texts`` with an already-loaded model's ``encode`` method.

    Every item is converted with ``str()`` first, and the model is asked to
    return NumPy output (``convert_to_numpy=True``). Whatever the model's
    ``encode`` returns is passed back unchanged.
    """
    sentences = list(map(str, texts))
    options = {
        "batch_size": batch_size,
        "device": device,
        "show_progress_bar": show_progress,
        "convert_to_numpy": True,
    }
    return model.encode(sentences, **options)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def pairwise_cosine(emb_a: np.ndarray, emb_b: np.ndarray) -> np.ndarray:
    """Return the cosine-similarity matrix between two sets of embeddings.

    Thin wrapper around scikit-learn's ``cosine_similarity(emb_a, emb_b)``.
    """
    similarity = cosine_similarity(emb_a, emb_b)
    return similarity
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def model_info(model, fallback_name: str = "") -> dict:
    """Collect best-effort architecture metadata for a model.

    Every lookup is optional: a field that cannot be read stays ``None``
    instead of raising. ``name`` and ``hub_id`` both come from the model's
    ``model_name_or_path`` attribute, or ``fallback_name`` when that is
    missing or empty.
    """
    identifier = getattr(model, "model_name_or_path", None) or fallback_name
    info = {"name": identifier, "hub_id": identifier}
    info.update(
        dict.fromkeys(
            ("embedding_dim", "arch", "layers", "vocab_size", "params", "license")
        )
    )

    try:
        info["embedding_dim"] = int(model.get_sentence_embedding_dimension())
    except Exception:
        pass  # deliberate best-effort: leave the field as None

    try:
        config = model._first_module().auto_model.config
        info["arch"] = getattr(config, "model_type", None)
        # Different config classes name the depth differently; first match wins.
        depth_attr = next(
            (a for a in ("num_hidden_layers", "num_layers", "n_layer") if hasattr(config, a)),
            None,
        )
        if depth_attr is not None:
            info["layers"] = int(getattr(config, depth_attr))
        info["vocab_size"] = getattr(config, "vocab_size", None)
        info["license"] = getattr(config, "license", None)
    except Exception:
        # The config could not be read in full; keep what was filled so far
        # and skip the parameter count as well.
        return info

    try:
        info["params"] = int(model._first_module().auto_model.num_parameters())
    except Exception:
        pass  # deliberate best-effort: leave the field as None
    return info
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""The four diagnostic levels.
|
|
2
|
+
|
|
3
|
+
Each level is a pure function over a precomputed similarity matrix (and texts /
|
|
4
|
+
topic labels where needed), returning a dict with two keys:
|
|
5
|
+
|
|
6
|
+
- ``"metrics"``: flat, JSON-serializable summary numbers
|
|
7
|
+
- ``"data"``: richer arrays/records used for plotting and drill-down
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from embedprobe.levels.level0 import signal_to_noise
|
|
11
|
+
from embedprobe.levels.level1 import retrieval_metrics
|
|
12
|
+
from embedprobe.levels.level2 import topic_structure
|
|
13
|
+
from embedprobe.levels.level3 import error_taxonomy
|
|
14
|
+
|
|
15
|
+
# Public entry points re-exported from the level0..level3 modules, in level order.
__all__ = ["signal_to_noise", "retrieval_metrics", "topic_structure", "error_taxonomy"]
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Level 0 — signal-to-noise separability.
|
|
2
|
+
|
|
3
|
+
Compares cosine similarities of true (diagonal) pairs against a matched random
|
|
4
|
+
sample of off-diagonal pairs. A model that has not learned cross-lingual
|
|
5
|
+
alignment shows heavily overlapping distributions and a large (non-significant) KS p-value.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
from scipy.stats import ks_2samp
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def signal_to_noise(sim_matrix: np.ndarray, seed: Optional[int] = None) -> dict:
    """Measure how well true pairs separate from random pairs.

    The diagonal of ``sim_matrix`` holds the true-pair similarities. They are
    compared against an equally sized random sample of off-diagonal entries.

    Args:
        sim_matrix: Square source-by-target similarity matrix (array-like).
        seed: Seed for the ``numpy.random.RandomState`` that draws the
            random off-diagonal sample.

    Returns:
        ``{"metrics": ..., "data": ...}``. ``metrics`` holds the means and
        standard deviations of both samples, ``snr``, the two-sample KS
        statistic and p-value (the p-value is exposed under both
        ``ks_p_value`` and ``p_value``), ``overlap_fraction``, a textual
        ``verdict`` and ``n_pairs``. ``data`` holds both samples as lists.
        ``snr`` can be ``+inf`` or ``-inf`` when the random sample has zero
        spread but the means differ.

    Raises:
        ValueError: the input is not a square 2-D matrix, or it has fewer
            than 2 rows.
    """
    sim_matrix = np.asarray(sim_matrix)
    # Check the rank first so 1-D input fails with the intended ValueError
    # rather than an IndexError on ``shape[1]``.
    if sim_matrix.ndim != 2 or sim_matrix.shape[0] != sim_matrix.shape[1]:
        raise ValueError("Level 0 expects a square src-x-tgt similarity matrix")
    n = sim_matrix.shape[0]
    if n < 2:
        raise ValueError("Need at least 2 pairs for signal-to-noise analysis")

    true_sims = np.diag(sim_matrix).astype(float)
    off_diag = sim_matrix[~np.eye(n, dtype=bool)].astype(float)

    # Draw as many random (mismatched) pairs as there are true pairs.
    rng = np.random.RandomState(seed)
    random_sims = rng.choice(off_diag, size=n, replace=len(off_diag) < n)

    ks = ks_2samp(true_sims, random_sims)
    p_value = float(ks.pvalue)
    mean_true = float(true_sims.mean())
    mean_random = float(random_sims.mean())
    random_std = float(random_sims.std())
    snr = _snr(mean_true - mean_random, random_std)
    overlap = _distribution_overlap(true_sims, random_sims)

    metrics = {
        "mean_true": mean_true,
        "std_true": float(true_sims.std()),
        "mean_random": mean_random,
        "std_random": random_std,
        "snr": snr,
        "ks_statistic": float(ks.statistic),
        "ks_p_value": p_value,
        "p_value": p_value,
        "overlap_fraction": overlap,
        "verdict": _verdict(snr, p_value, overlap),
        "n_pairs": int(n),
    }
    data = {
        "true_sims": true_sims.tolist(),
        "random_sims": random_sims.tolist(),
    }
    return {"metrics": metrics, "data": data}


def _snr(mean_gap: float, noise_std: float) -> float:
    """Return ``mean_gap / noise_std`` with a sign-aware zero-spread fallback.

    With zero spread in the random sample the ratio is undefined. The result
    is then ``+inf`` or ``-inf`` following the sign of ``mean_gap``, and
    ``0.0`` when there is no gap at all, so a collapsed embedding space is
    not reported as infinitely separable.
    """
    if noise_std > 0:
        return float(mean_gap / noise_std)
    if mean_gap > 0:
        return float("inf")
    if mean_gap < 0:
        return float("-inf")
    return 0.0


def _distribution_overlap(a: np.ndarray, b: np.ndarray, bins: int = 50) -> float:
    """Return the shared area of two density histograms, clipped to [0, 1].

    Both samples are binned on the same ``bins`` equal-width bins spanning
    their joint range. If every value is identical the overlap is 1.0.
    """
    lo = float(min(a.min(), b.min()))
    hi = float(max(a.max(), b.max()))
    if lo == hi:
        return 1.0
    counts_a, edges = np.histogram(a, bins=bins, range=(lo, hi), density=True)
    counts_b, _ = np.histogram(b, bins=edges, density=True)
    widths = np.diff(edges)
    # Per-bin shared density times bin width gives the shared probability mass.
    overlap = np.minimum(counts_a, counts_b) * widths
    return float(np.clip(overlap.sum(), 0.0, 1.0))


def _verdict(snr: float, p_value: float, overlap: float) -> str:
    """Map the Level 0 numbers onto one of three fixed verdict labels."""
    if p_value < 0.01 and snr >= 2.0 and overlap <= 0.25:
        return "strong signal"
    if p_value < 0.05 and snr > 0.5:
        return "weak signal"
    return "poor separation"
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Level 1 — retrieval performance.
|
|
2
|
+
|
|
3
|
+
For each source sentence, rank all target sentences by cosine similarity and
|
|
4
|
+
locate the true counterpart. Ranks are computed on the full similarity matrix,
|
|
5
|
+
so Recall@k, MRR and the CMC curve are exact for any k.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Sequence
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
# Default cut-offs k used by ``retrieval_metrics`` for Recall@k / Precision@k.
DEFAULT_KS = (1, 5, 10)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def true_ranks(sim_matrix: np.ndarray) -> np.ndarray:
    """Return the 1-based rank of the true target for every source row.

    The true target of row ``i`` is column ``i``. Its rank is one plus the
    number of candidates in the row scoring strictly higher, so candidates
    tied with the true target do not push its rank down.
    """
    scores = np.asarray(sim_matrix)
    row_ids = np.arange(scores.shape[0])
    target_scores = scores[row_ids, row_ids].reshape(-1, 1)
    beaten_by = np.sum(scores > target_scores, axis=1)
    return beaten_by + 1
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def retrieval_metrics(
    sim_matrix: np.ndarray,
    ks: Sequence[int] = DEFAULT_KS,
    cmc_max_rank: int = 10,
) -> dict:
    """Compute the Level 1 retrieval metrics from a similarity matrix.

    Args:
        sim_matrix: Square source-by-target similarity matrix; the true
            target of row ``i`` is column ``i``.
        ks: Cut-offs for which ``recall@k`` and ``precision@k`` are reported.
        cmc_max_rank: The CMC curve is evaluated at ranks ``1..cmc_max_rank``.

    Returns:
        ``{"metrics": ..., "data": ...}``. ``metrics`` holds ``mrr``,
        ``n_queries`` and the per-k recall/precision. ``data`` holds the
        per-query ranks, the CMC curve and a histogram of all similarities.

    Raises:
        ValueError: ``sim_matrix`` is not square.
    """
    sim_matrix = np.asarray(sim_matrix)
    if sim_matrix.shape[0] != sim_matrix.shape[1]:
        raise ValueError("Level 1 expects a square src-x-tgt similarity matrix")

    ranks = true_ranks(sim_matrix)

    def hit_rate(cutoff):
        # Fraction of queries whose true target is ranked within ``cutoff``.
        return float((ranks <= cutoff).mean())

    metrics = {"mrr": float(np.mean(1.0 / ranks)), "n_queries": int(len(ranks))}
    for k in ks:
        recall = hit_rate(k)
        # Each query has exactly one relevant target, hence precision = recall / k.
        metrics.update({f"recall@{k}": recall, f"precision@{k}": recall / k})

    cmc_ranks = list(range(1, cmc_max_rank + 1))
    cmc_values = [hit_rate(rank) for rank in cmc_ranks]

    return {
        "metrics": metrics,
        "data": {
            "ranks": ranks.tolist(),
            "cmc_ranks": cmc_ranks,
            "cmc_values": cmc_values,
            "similarity_histogram": _histogram(sim_matrix),
        },
    }
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _histogram(sim_matrix: np.ndarray, bins: int = 20) -> dict:
    """Histogram all similarity values into ``bins`` equal-width bins.

    Returns a dict with ``"counts"`` and ``"bin_edges"`` as plain lists.
    """
    flat_scores = sim_matrix.ravel()
    counts, bin_edges = np.histogram(flat_scores, bins=bins)
    return dict(counts=counts.tolist(), bin_edges=bin_edges.tolist())
|